Coverage for src/qdrant_loader/core/chunking/strategy/markdown/document_parser.py: 96%

113 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""Document parsing for markdown chunking strategy.""" 

2 

3import re 

4from dataclasses import dataclass, field 

5from enum import Enum 

6from typing import Any, Optional 

7 

8import structlog 

9 

10logger = structlog.get_logger(__name__) 

11 

12 

class SectionType(Enum):
    """Closed set of structural categories a markdown section can belong to.

    Each member's value is the plain string form of the category, suitable
    for storing alongside a section as its ``type``.
    """

    # Heading line.
    HEADER = "header"
    # Fenced code.
    CODE_BLOCK = "code_block"
    # Bulleted list.
    LIST = "list"
    # Pipe-delimited table.
    TABLE = "table"
    # Block quote.
    QUOTE = "quote"
    # Fallback for anything not matched by the categories above.
    PARAGRAPH = "paragraph"

22 

23 

@dataclass
class Section:
    """A node in the tree of sections that make up a markdown document.

    Attributes:
        content: Raw markdown text of the section.
        level: Depth indicator, 0 by default (presumably the markdown heading
            depth -- confirm against the code that builds sections).
        type: Structural category of the section; defaults to PARAGRAPH.
        parent: The section this one is nested under, or None for a root.
        children: Sections nested directly under this one.
    """

    content: str
    level: int = 0
    type: SectionType = SectionType.PARAGRAPH
    # ``parent`` is excluded from the generated ``__eq__``: a child points at
    # its parent and the parent lists the child, so comparing two distinct but
    # equal sections through both links would recurse until RecursionError.
    # Equality is therefore defined by a section's own fields and its subtree.
    parent: Optional["Section"] = field(default=None, compare=False)
    children: list["Section"] = field(default_factory=list)

    def add_child(self, child: "Section") -> None:
        """Attach *child* beneath this section and set its parent link.

        Args:
            child: The section to nest under this one.
        """
        self.children.append(child)
        child.parent = self

38 

39 

class SectionIdentifier:
    """Identifies section types based on content patterns."""

    @staticmethod
    def identify_section_type(content: str) -> "SectionType":
        """Classify a section by inspecting the very start of its content.

        Checks run in a fixed order and the first match wins: header, fenced
        code block, list, table, block quote. Anything else is a paragraph.
        Only the beginning of ``content`` is examined; leading whitespace is
        not skipped.

        Args:
            content: The section content to analyze

        Returns:
            SectionType enum indicating the type of section
        """
        # ATX header: one to six '#' followed by whitespace.
        if re.match(r"^#{1,6}\s+", content):
            return SectionType.HEADER
        if content.startswith("```"):
            return SectionType.CODE_BLOCK
        # Bullet markers '*', '-' and '+', plus ordered markers such as
        # '1.' or '1)'. Previously only '*' and '-' were recognised, so
        # '+' bullets and numbered lists fell through to PARAGRAPH.
        if re.match(r"^(?:[*+-]|\d{1,9}[.)])\s+", content):
            return SectionType.LIST
        if content.startswith("|"):
            return SectionType.TABLE
        if content.startswith(">"):
            return SectionType.QUOTE
        return SectionType.PARAGRAPH

64 

65 

class HierarchyBuilder:
    """Builds hierarchical section relationships."""

    @staticmethod
    def _header_title(content: str) -> Optional[str]:
        """Return the title of a markdown header opening *content*, else None."""
        header_match = re.match(r"^(#+)\s+(.*?)(?:\n|$)", content)
        if header_match is None:
            return None
        return header_match.group(2).strip()

    @staticmethod
    def build_section_breadcrumb(section: "Section") -> str:
        """Build a breadcrumb path of section titles to capture hierarchy.

        The section itself and every ancestor reachable through ``parent``
        contribute their header title. Nodes whose content does not start
        with a header are skipped.

        Args:
            section: The section to build breadcrumb for

        Returns:
            Titles joined with " > ", outermost ancestor first; an empty
            string when no node in the chain starts with a header.
        """
        titles = []
        node = section
        # Collect from the section upward, then flip to root-first order.
        while node is not None:
            title = HierarchyBuilder._header_title(node.content)
            if title is not None:
                titles.append(title)
            node = node.parent
        titles.reverse()
        return " > ".join(titles)

    @staticmethod
    def get_section_path(
        header_item: dict[str, Any], structure: list[dict[str, Any]]
    ) -> list[str]:
        """Get the path of parent headers for a section.

        Args:
            header_item: The header item, normally one of the dictionaries
                contained in ``structure``.
            structure: The document structure, in document order. Items are
                read through their "type" and "level" keys, and headers also
                through "title".

        Returns:
            List of parent section titles, outermost first.

        Raises:
            ValueError: If ``header_item`` is neither contained in nor equal
                to any item of ``structure``.
        """
        # Locate the item by identity first. Dict equality alone would resolve
        # a repeated heading (same text and level under a different parent) to
        # its first occurrence and produce the wrong ancestor path.
        position = next(
            (i for i, item in enumerate(structure) if item is header_item),
            None,
        )
        if position is None:
            # Fall back to equality for callers passing an equal copy.
            position = structure.index(header_item)

        ancestors = []
        current_level = header_item["level"]
        # Walk backward; each header shallower than the current level is the
        # next enclosing ancestor.
        for item in reversed(structure[:position]):
            if item["type"] == "header" and item["level"] < current_level:
                ancestors.append(item["title"])
                current_level = item["level"]

        ancestors.reverse()
        return ancestors

121 

122 

class DocumentParser:
    """Parses markdown documents into structured representations."""

    def __init__(self):
        """Initialize the document parser."""
        self.section_identifier = SectionIdentifier()
        self.hierarchy_builder = HierarchyBuilder()

    def parse_document_structure(self, text: str) -> list[dict[str, Any]]:
        """Parse document into a structured representation.

        The text is scanned line by line. Each header line (one to six '#'
        followed by whitespace) becomes its own element; the lines between
        headers are grouped into "content" elements. Lines inside a fenced
        code block are never treated as headers.

        Args:
            text: The document text

        Returns:
            List of dictionaries in document order. Header elements carry
            "type" ("header"), "text", "level" and "title"; content elements
            carry "type" ("content"), "text" and "level" (always 0).
        """
        elements: list[dict[str, Any]] = []
        pending: list[str] = []
        in_code_block = False

        for line in text.split("\n"):
            # A fence line toggles code-block mode and is kept with the content.
            if line.startswith("```"):
                in_code_block = not in_code_block
                pending.append(line)
                continue

            # Inside a code block a '#' line is code, not a header.
            if in_code_block:
                pending.append(line)
                continue

            header_match = re.match(r"^(#{1,6})\s+(.*?)$", line)
            if header_match is None:
                pending.append(line)
                continue

            # A header closes whatever content block was being accumulated.
            if pending:
                elements.append(
                    {"type": "content", "text": "\n".join(pending), "level": 0}
                )
                pending = []

            elements.append(
                {
                    "type": "header",
                    "text": line,
                    "level": len(header_match.group(1)),
                    "title": header_match.group(2).strip(),
                }
            )

        # Flush the trailing content block, if any.
        if pending:
            elements.append(
                {"type": "content", "text": "\n".join(pending), "level": 0}
            )

        return elements

    def extract_section_metadata(self, section: "Section") -> dict[str, Any]:
        """Extract metadata from a section.

        Args:
            section: The section to analyze

        Returns:
            Dictionary with "type", "level", "word_count", "char_count",
            "has_code", "has_links", "has_images" and "is_top_level".
            "parent_title" and "parent_level" are added when the parent's
            content starts with a header, and "breadcrumb" when the hierarchy
            builder returns a non-empty path.
        """
        content = section.content
        metadata: dict[str, Any] = {
            "type": section.type.value,
            "level": section.level,
            "word_count": len(content.split()),
            "char_count": len(content),
            "has_code": "```" in content,
            # The lookbehind keeps image syntax ``![alt](src)`` from being
            # counted as a link; images are reported via "has_images".
            "has_links": bool(re.search(r"(?<!!)\[.*?\]\(.*?\)", content)),
            "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", content)),
            "is_top_level": section.level <= 2,  # Mark top-level sections
        }

        # Add parent section info if available
        if section.parent:
            header_match = re.match(
                r"^(#+)\s+(.*?)(?:\n|$)", section.parent.content
            )
            if header_match:
                metadata["parent_title"] = header_match.group(2).strip()
                metadata["parent_level"] = section.parent.level

        # Add breadcrumb path for hierarchical context
        breadcrumb = self.hierarchy_builder.build_section_breadcrumb(section)
        if breadcrumb:
            metadata["breadcrumb"] = breadcrumb

        return metadata

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Preference order: the title of a header opening the chunk, then the
        first sentence (text up to the first '.', '!' or '?', cut to 50
        characters plus "..." when longer), then a fixed default.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Skip blank lines before the first real line. Without this, a chunk
        # such as "\n# Title\nbody." misses the header check and the header
        # line leaks into the first-sentence fallback.
        text = re.sub(r"^(?:[ \t\r]*\n)+", "", chunk)

        header_match = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", text)
        if header_match:
            return header_match.group(2).strip()

        # Fall back to the first sentence if there is no header
        first_sentence_match = re.match(r"^([^\.!?]+[\.!?])", text)
        if first_sentence_match:
            title = first_sentence_match.group(1).strip()
            # Truncate if too long
            if len(title) > 50:
                title = title[:50] + "..."
            return title

        return "Untitled Section"