Coverage for src/qdrant_loader/core/chunking/strategy/html/html_document_parser.py: 77%

160 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""HTML-specific document parser for DOM structure analysis.""" 

2 

3import re 

4from enum import Enum 

5from typing import Any 

6 

7from bs4 import BeautifulSoup, Tag 

8 

9from qdrant_loader.core.chunking.strategy.base.document_parser import BaseDocumentParser 

10 

11 

class SectionType(Enum):
    """Types of sections in an HTML document.

    Values are lowercase strings used in section metadata dicts
    (see ``_extract_tag_metadata`` / ``extract_section_metadata``).
    """

    # HEADER is assigned to h1-h6 heading tags (see _identify_section_type),
    # not to the <header> landmark element.
    HEADER = "header"
    # Semantic/landmark container elements.
    ARTICLE = "article"
    SECTION = "section"
    NAV = "nav"
    ASIDE = "aside"
    MAIN = "main"
    # Block-level content elements.
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    CODE_BLOCK = "code_block"
    BLOCKQUOTE = "blockquote"
    # Generic fallback for unrecognized tags.
    DIV = "div"
    FOOTER = "footer"

28 

29 

class HTMLDocumentParser(BaseDocumentParser):
    """Parser for HTML documents with semantic analysis."""

    def __init__(self):
        """Initialize the element-name vocabularies used during parsing."""
        # Semantic HTML5 elements treated as section boundaries.
        self.section_elements = set(
            "article section main header footer nav aside".split()
        )

        # Heading tags (h1-h6) used to build the document hierarchy.
        self.heading_elements = {f"h{level}" for level in range(1, 7)}

        # Block-level elements that can form standalone chunks.
        self.block_elements = set(
            "div p blockquote pre ul ol li table figure form".split()
        )

62 

63 def parse_document_structure(self, content: str) -> dict[str, Any]: 

64 """Parse HTML DOM structure and extract semantic information.""" 

65 try: 

66 soup = BeautifulSoup(content, "html.parser") 

67 

68 # Remove script and style elements for cleaner analysis 

69 for script in soup(["script", "style"]): 

70 script.decompose() 

71 

72 # Extract document outline 

73 headings = self._extract_heading_hierarchy(soup) 

74 semantic_elements = self._identify_semantic_elements(soup) 

75 links = self._extract_links(soup) 

76 accessibility = self._analyze_accessibility(soup) 

77 

78 return { 

79 "heading_hierarchy": headings, 

80 "semantic_elements": semantic_elements, 

81 "internal_links": len( 

82 [link for link in links if link.get("internal", False)] 

83 ), 

84 "external_links": len( 

85 [link for link in links if not link.get("internal", False)] 

86 ), 

87 "has_navigation": bool(soup.find("nav")), 

88 "has_main_content": bool(soup.find("main")), 

89 "has_header": bool(soup.find("header")), 

90 "has_footer": bool(soup.find("footer")), 

91 "has_aside": bool(soup.find("aside")), 

92 "structure_type": "html", 

93 "accessibility_features": accessibility, 

94 "form_count": len(soup.find_all("form")), 

95 "table_count": len(soup.find_all("table")), 

96 "image_count": len(soup.find_all("img")), 

97 "list_count": len(soup.find_all(["ul", "ol"])), 

98 "content_sections": len(soup.find_all(list(self.section_elements))), 

99 } 

100 except Exception as e: 

101 # Fallback structure for malformed HTML 

102 return { 

103 "heading_hierarchy": [], 

104 "semantic_elements": [], 

105 "internal_links": 0, 

106 "external_links": 0, 

107 "has_navigation": False, 

108 "has_main_content": False, 

109 "has_header": False, 

110 "has_footer": False, 

111 "has_aside": False, 

112 "structure_type": "html_malformed", 

113 "accessibility_features": {}, 

114 "form_count": 0, 

115 "table_count": 0, 

116 "image_count": 0, 

117 "list_count": 0, 

118 "content_sections": 0, 

119 "parse_error": str(e), 

120 } 

121 

122 def extract_section_metadata(self, section: Any) -> dict[str, Any]: 

123 """Extract metadata from an HTML section.""" 

124 if isinstance(section, dict): 

125 # Already processed section metadata 

126 return section 

127 

128 if isinstance(section, Tag): 

129 return self._extract_tag_metadata(section) 

130 

131 # Fallback for string content 

132 return { 

133 "tag_name": "div", 

134 "section_type": SectionType.DIV.value, 

135 "level": 0, 

136 "attributes": {}, 

137 "has_links": bool(re.search(r"<a\s+[^>]*href", str(section))), 

138 "has_images": bool(re.search(r"<img\s+[^>]*src", str(section))), 

139 } 

140 

141 def _extract_heading_hierarchy(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

142 """Extract document heading hierarchy.""" 

143 headings = [] 

144 

145 for heading in soup.find_all(list(self.heading_elements)): 

146 level = int(heading.name[1]) # Extract number from h1, h2, etc. 

147 text = heading.get_text(strip=True) 

148 

149 headings.append( 

150 { 

151 "level": level, 

152 "text": text, 

153 "tag": heading.name, 

154 "id": heading.get("id"), 

155 "classes": heading.get("class", []), 

156 } 

157 ) 

158 

159 return headings 

160 

161 def _identify_semantic_elements(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

162 """Identify semantic HTML elements and their roles.""" 

163 semantic_elements = [] 

164 

165 for element in soup.find_all(list(self.section_elements)): 

166 semantic_elements.append( 

167 { 

168 "tag": element.name, 

169 "role": element.get("role"), 

170 "id": element.get("id"), 

171 "classes": element.get("class", []), 

172 "text_length": len(element.get_text(strip=True)), 

173 "has_children": bool(element.find_all()), 

174 } 

175 ) 

176 

177 return semantic_elements 

178 

179 def _extract_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

180 """Extract and categorize links.""" 

181 links = [] 

182 

183 for link in soup.find_all("a", href=True): 

184 href = link["href"] 

185 text = link.get_text(strip=True) 

186 

187 # Determine if link is internal or external 

188 is_internal = ( 

189 href.startswith("#") 

190 or href.startswith("/") 

191 or href.startswith("./") 

192 or href.startswith("../") 

193 or not href.startswith(("http://", "https://", "mailto:", "tel:")) 

194 ) 

195 

196 links.append( 

197 { 

198 "href": href, 

199 "text": text, 

200 "internal": is_internal, 

201 "title": link.get("title"), 

202 "target": link.get("target"), 

203 } 

204 ) 

205 

206 return links 

207 

208 def _analyze_accessibility(self, soup: BeautifulSoup) -> dict[str, Any]: 

209 """Analyze accessibility features of the HTML document.""" 

210 accessibility = { 

211 "has_lang_attribute": bool(soup.find("html", lang=True)), 

212 "has_title": bool(soup.find("title")), 

213 "images_with_alt": 0, 

214 "images_without_alt": 0, 

215 "headings_properly_nested": True, 

216 "has_skip_links": False, 

217 "form_labels": 0, 

218 "form_inputs": 0, 

219 } 

220 

221 # Analyze images 

222 for img in soup.find_all("img"): 

223 if img.get("alt") is not None: 

224 accessibility["images_with_alt"] += 1 

225 else: 

226 accessibility["images_without_alt"] += 1 

227 

228 # Check for skip links 

229 skip_link_indicators = ["skip", "jump", "goto"] 

230 for link in soup.find_all("a", href=True): 

231 link_text = link.get_text(strip=True).lower() 

232 if any(indicator in link_text for indicator in skip_link_indicators): 

233 accessibility["has_skip_links"] = True 

234 break 

235 

236 # Analyze forms 

237 accessibility["form_inputs"] = len( 

238 soup.find_all(["input", "textarea", "select"]) 

239 ) 

240 accessibility["form_labels"] = len(soup.find_all("label")) 

241 

242 # Check heading nesting (simplified) 

243 headings = soup.find_all(list(self.heading_elements)) 

244 if len(headings) > 1: 

245 prev_level = 0 

246 for heading in headings: 

247 level = int(heading.name[1]) 

248 if prev_level > 0 and level > prev_level + 1: 

249 accessibility["headings_properly_nested"] = False 

250 break 

251 prev_level = level 

252 

253 return accessibility 

254 

255 def _extract_tag_metadata(self, tag: Tag) -> dict[str, Any]: 

256 """Extract metadata from a BeautifulSoup tag.""" 

257 tag_name = tag.name.lower() 

258 section_type = self._identify_section_type(tag) 

259 

260 # Get attributes (limited for performance) 

261 attributes = {} 

262 if tag.attrs: 

263 # Only keep essential attributes 

264 for attr in ["id", "class", "role", "data-*"]: 

265 if attr in tag.attrs: 

266 attributes[attr] = tag.attrs[attr] 

267 elif attr == "data-*": 

268 # Collect data attributes 

269 data_attrs = { 

270 k: v for k, v in tag.attrs.items() if k.startswith("data-") 

271 } 

272 if data_attrs: 

273 attributes["data_attributes"] = data_attrs 

274 

275 text_content = tag.get_text(strip=True) 

276 

277 return { 

278 "tag_name": tag_name, 

279 "section_type": section_type.value, 

280 "level": self._get_heading_level(tag), 

281 "attributes": attributes, 

282 "text_content": text_content, 

283 "word_count": len(text_content.split()), 

284 "char_count": len(text_content), 

285 "has_code": section_type == SectionType.CODE_BLOCK, 

286 "has_links": bool(tag.find_all("a")), 

287 "has_images": bool(tag.find_all("img")), 

288 "is_semantic": tag_name in self.section_elements, 

289 "is_heading": tag_name in self.heading_elements, 

290 "child_count": len(tag.find_all()), 

291 } 

292 

293 def _identify_section_type(self, tag: Tag) -> SectionType: 

294 """Identify the type of section based on the HTML tag.""" 

295 tag_name = tag.name.lower() 

296 

297 if tag_name in self.heading_elements: 

298 return SectionType.HEADER 

299 elif tag_name == "article": 

300 return SectionType.ARTICLE 

301 elif tag_name == "section": 

302 return SectionType.SECTION 

303 elif tag_name == "nav": 

304 return SectionType.NAV 

305 elif tag_name == "aside": 

306 return SectionType.ASIDE 

307 elif tag_name == "main": 

308 return SectionType.MAIN 

309 elif tag_name == "footer": 

310 return SectionType.FOOTER 

311 elif tag_name in ["ul", "ol", "li"]: 

312 return SectionType.LIST 

313 elif tag_name == "table": 

314 return SectionType.TABLE 

315 elif tag_name in ["pre", "code"]: 

316 return SectionType.CODE_BLOCK 

317 elif tag_name == "blockquote": 

318 return SectionType.BLOCKQUOTE 

319 elif tag_name == "p": 

320 return SectionType.PARAGRAPH 

321 else: 

322 return SectionType.DIV 

323 

324 def _get_heading_level(self, tag: Tag) -> int: 

325 """Get the heading level from an HTML heading tag.""" 

326 if tag.name.lower() in self.heading_elements: 

327 return int(tag.name[1]) # Extract number from h1, h2, etc. 

328 return 0 

329 

330 def extract_section_title(self, content: str) -> str: 

331 """Extract a title from HTML content.""" 

332 try: 

333 soup = BeautifulSoup(content, "html.parser") 

334 

335 # Try to find title in various elements 

336 for tag in ["h1", "h2", "h3", "h4", "h5", "h6", "title"]: 

337 element = soup.find(tag) 

338 if element: 

339 title = element.get_text(strip=True) 

340 if title: 

341 return title[:100] # Limit title length 

342 

343 # Try to find text in semantic elements 

344 for tag in ["article", "section", "main"]: 

345 element = soup.find(tag) 

346 if element: 

347 text = element.get_text(strip=True) 

348 if text: 

349 return self._extract_title_from_content(text) 

350 

351 # Fallback to first text content 

352 text = soup.get_text(strip=True) 

353 if text: 

354 return self._extract_title_from_content(text) 

355 

356 return "Untitled Section" 

357 

358 except Exception: 

359 return "Untitled Section" 

360 

361 def _extract_title_from_content(self, content: str) -> str: 

362 """Extract a title from content text.""" 

363 if not content: 

364 return "Untitled Section" 

365 

366 # Take first line or first 50 characters, whichever is shorter 

367 lines = content.strip().split("\n") 

368 first_line = lines[0].strip() if lines else "" 

369 

370 if first_line: 

371 # Limit title length for performance 

372 return first_line[:100] if len(first_line) > 100 else first_line 

373 

374 return "Untitled Section"