Coverage for src/qdrant_loader/core/chunking/strategy/html/html_document_parser.py: 77%

160 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""HTML-specific document parser for DOM structure analysis.""" 

2 

3import re 

4from enum import Enum 

5from typing import Any 

6 

7from bs4 import BeautifulSoup, Tag 

8 

9from qdrant_loader.core.chunking.strategy.base.document_parser import BaseDocumentParser 

10 

11 

class SectionType(Enum):
    """Categories a chunk of an HTML document can fall into.

    Covers the semantic landmark elements (header, article, section, nav,
    aside, main, footer) plus common block-level content kinds (paragraph,
    list, table, code block, blockquote) with DIV as the generic fallback.
    """

    HEADER = "header"
    ARTICLE = "article"
    SECTION = "section"
    NAV = "nav"
    ASIDE = "aside"
    MAIN = "main"
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    CODE_BLOCK = "code_block"
    BLOCKQUOTE = "blockquote"
    DIV = "div"
    FOOTER = "footer"

28 

29 

class HTMLDocumentParser(BaseDocumentParser):
    """Parser for HTML documents with semantic DOM analysis.

    Uses BeautifulSoup to derive a structural overview of an HTML document
    (heading hierarchy, semantic landmark elements, link categorization,
    accessibility signals) and to extract per-section metadata for chunking.
    """

    def __init__(self):
        """Initialize the element-category sets used during parsing."""
        # Semantic HTML elements that should be treated as section boundaries.
        self.section_elements = {
            "article",
            "section",
            "main",
            "header",
            "footer",
            "nav",
            "aside",
        }

        # Heading elements used to build the document hierarchy.
        self.heading_elements = {"h1", "h2", "h3", "h4", "h5", "h6"}

        # Block-level elements that can form chunks on their own.
        self.block_elements = {
            "div",
            "p",
            "blockquote",
            "pre",
            "ul",
            "ol",
            "li",
            "table",
            "figure",
            "form",
        }

    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Parse the HTML DOM and extract semantic structure information.

        Args:
            content: Raw HTML markup.

        Returns:
            A dict describing the document: heading hierarchy, landmark
            elements, internal/external link counts, element counts and
            accessibility features. On parse failure a fallback dict with
            ``structure_type == "html_malformed"`` and a ``parse_error``
            message is returned instead — this method never raises.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Remove script and style elements so text analysis only sees
            # visible content.
            for removable in soup(["script", "style"]):
                removable.decompose()

            headings = self._extract_heading_hierarchy(soup)
            semantic_elements = self._identify_semantic_elements(soup)
            links = self._extract_links(soup)
            accessibility = self._analyze_accessibility(soup)

            # Single pass over links; external links are the remainder
            # (the original built two throwaway lists for the same counts).
            internal_count = sum(
                1 for link in links if link.get("internal", False)
            )

            return {
                "heading_hierarchy": headings,
                "semantic_elements": semantic_elements,
                "internal_links": internal_count,
                "external_links": len(links) - internal_count,
                "has_navigation": bool(soup.find("nav")),
                "has_main_content": bool(soup.find("main")),
                "has_header": bool(soup.find("header")),
                "has_footer": bool(soup.find("footer")),
                "has_aside": bool(soup.find("aside")),
                "structure_type": "html",
                "accessibility_features": accessibility,
                "form_count": len(soup.find_all("form")),
                "table_count": len(soup.find_all("table")),
                "image_count": len(soup.find_all("img")),
                "list_count": len(soup.find_all(["ul", "ol"])),
                "content_sections": len(soup.find_all(list(self.section_elements))),
            }
        except Exception as e:
            # Fallback structure for malformed HTML: same keys, zeroed values,
            # plus the parse error for diagnostics.
            return {
                "heading_hierarchy": [],
                "semantic_elements": [],
                "internal_links": 0,
                "external_links": 0,
                "has_navigation": False,
                "has_main_content": False,
                "has_header": False,
                "has_footer": False,
                "has_aside": False,
                "structure_type": "html_malformed",
                "accessibility_features": {},
                "form_count": 0,
                "table_count": 0,
                "image_count": 0,
                "list_count": 0,
                "content_sections": 0,
                "parse_error": str(e),
            }

    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Extract metadata from an HTML section.

        Accepts an already-processed metadata dict (returned unchanged), a
        BeautifulSoup ``Tag`` (fully analyzed), or any other value, which is
        stringified and probed with cheap regexes as a fallback.
        """
        if isinstance(section, dict):
            # Already processed section metadata.
            return section

        if isinstance(section, Tag):
            return self._extract_tag_metadata(section)

        # Fallback for string content: regex probes for links/images only.
        raw = str(section)
        return {
            "tag_name": "div",
            "section_type": SectionType.DIV.value,
            "level": 0,
            "attributes": {},
            "has_links": bool(re.search(r"<a\s+[^>]*href", raw)),
            "has_images": bool(re.search(r"<img\s+[^>]*src", raw)),
        }

    def _extract_heading_hierarchy(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Return one metadata dict per heading element, in document order."""
        headings = []

        for heading in soup.find_all(list(self.heading_elements)):
            headings.append(
                {
                    # Heading level is the digit in the tag name (h1..h6).
                    "level": int(heading.name[1]),
                    "text": heading.get_text(strip=True),
                    "tag": heading.name,
                    "id": heading.get("id"),
                    "classes": heading.get("class", []),
                }
            )

        return headings

    def _identify_semantic_elements(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Identify semantic landmark elements and summarize each one."""
        return [
            {
                "tag": element.name,
                "role": element.get("role"),
                "id": element.get("id"),
                "classes": element.get("class", []),
                "text_length": len(element.get_text(strip=True)),
                "has_children": bool(element.find_all()),
            }
            for element in soup.find_all(list(self.section_elements))
        ]

    def _extract_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract anchors that carry an href and classify each as internal/external."""
        links = []

        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]

            # Anything that is not an absolute http(s)/mailto/tel URL is
            # treated as internal — this single check subsumes the fragment
            # ("#"), root ("/") and relative ("./", "../") prefixes.
            is_internal = not href.startswith(
                ("http://", "https://", "mailto:", "tel:")
            )

            links.append(
                {
                    "href": href,
                    "text": anchor.get_text(strip=True),
                    "internal": is_internal,
                    "title": anchor.get("title"),
                    "target": anchor.get("target"),
                }
            )

        return links

    def _analyze_accessibility(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze accessibility features of the HTML document.

        Checks language/title presence, alt text on images, skip links,
        form labelling counts and (simplified) heading nesting.
        """
        accessibility: dict[str, Any] = {
            "has_lang_attribute": bool(soup.find("html", lang=True)),
            "has_title": bool(soup.find("title")),
            "images_with_alt": 0,
            "images_without_alt": 0,
            "headings_properly_nested": True,
            "has_skip_links": False,
            "form_labels": 0,
            "form_inputs": 0,
        }

        # Alt-text coverage: an explicit empty alt="" still counts as present.
        for img in soup.find_all("img"):
            if img.get("alt") is not None:
                accessibility["images_with_alt"] += 1
            else:
                accessibility["images_without_alt"] += 1

        # Skip links: any anchor whose text mentions a skip/jump/goto action.
        skip_link_indicators = ("skip", "jump", "goto")
        for anchor in soup.find_all("a", href=True):
            link_text = anchor.get_text(strip=True).lower()
            if any(indicator in link_text for indicator in skip_link_indicators):
                accessibility["has_skip_links"] = True
                break

        # Form labelling: raw counts of inputs vs. labels (no pairing check).
        accessibility["form_inputs"] = len(
            soup.find_all(["input", "textarea", "select"])
        )
        accessibility["form_labels"] = len(soup.find_all("label"))

        # Heading nesting (simplified): flag any jump of more than one level
        # downward, e.g. h2 followed directly by h4.
        headings = soup.find_all(list(self.heading_elements))
        if len(headings) > 1:
            prev_level = 0
            for heading in headings:
                level = int(heading.name[1])
                if prev_level > 0 and level > prev_level + 1:
                    accessibility["headings_properly_nested"] = False
                    break
                prev_level = level

        return accessibility

    def _extract_tag_metadata(self, tag: Tag) -> dict[str, Any]:
        """Extract chunking metadata from a BeautifulSoup tag."""
        tag_name = tag.name.lower()
        section_type = self._identify_section_type(tag)

        # Keep only essential attributes (limited for performance): id, class,
        # role, and any data-* attributes grouped under "data_attributes".
        attributes: dict[str, Any] = {}
        if tag.attrs:
            for attr in ("id", "class", "role"):
                if attr in tag.attrs:
                    attributes[attr] = tag.attrs[attr]
            data_attrs = {
                key: value
                for key, value in tag.attrs.items()
                if key.startswith("data-")
            }
            if data_attrs:
                attributes["data_attributes"] = data_attrs

        text_content = tag.get_text(strip=True)

        return {
            "tag_name": tag_name,
            "section_type": section_type.value,
            "level": self._get_heading_level(tag),
            "attributes": attributes,
            "text_content": text_content,
            "word_count": len(text_content.split()),
            "char_count": len(text_content),
            "has_code": section_type == SectionType.CODE_BLOCK,
            "has_links": bool(tag.find_all("a")),
            "has_images": bool(tag.find_all("img")),
            "is_semantic": tag_name in self.section_elements,
            "is_heading": tag_name in self.heading_elements,
            "child_count": len(tag.find_all()),
        }

    def _identify_section_type(self, tag: Tag) -> SectionType:
        """Map an HTML tag to its ``SectionType`` category."""
        tag_name = tag.name.lower()

        # Headings take precedence over the per-tag mapping.
        if tag_name in self.heading_elements:
            return SectionType.HEADER

        tag_to_type = {
            "article": SectionType.ARTICLE,
            "section": SectionType.SECTION,
            "nav": SectionType.NAV,
            "aside": SectionType.ASIDE,
            "main": SectionType.MAIN,
            "footer": SectionType.FOOTER,
            "ul": SectionType.LIST,
            "ol": SectionType.LIST,
            "li": SectionType.LIST,
            "table": SectionType.TABLE,
            "pre": SectionType.CODE_BLOCK,
            "code": SectionType.CODE_BLOCK,
            "blockquote": SectionType.BLOCKQUOTE,
            "p": SectionType.PARAGRAPH,
        }
        return tag_to_type.get(tag_name, SectionType.DIV)

    def _get_heading_level(self, tag: Tag) -> int:
        """Return the level of a heading tag (h1..h6), or 0 for non-headings."""
        name = tag.name.lower()
        if name in self.heading_elements:
            return int(name[1])  # Extract number from h1, h2, etc.
        return 0

    def extract_section_title(self, content: str) -> str:
        """Extract a human-readable title from HTML content.

        Tries heading/<title> elements first, then text inside semantic
        containers, then any document text; returns "Untitled Section" when
        nothing usable is found or the content cannot be parsed.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Prefer explicit headings and <title>, most prominent first.
            for tag in ("h1", "h2", "h3", "h4", "h5", "h6", "title"):
                element = soup.find(tag)
                if element:
                    title = element.get_text(strip=True)
                    if title:
                        return title[:100]  # Limit title length.

            # Fall back to text inside semantic containers.
            for tag in ("article", "section", "main"):
                element = soup.find(tag)
                if element:
                    text = element.get_text(strip=True)
                    if text:
                        return self._extract_title_from_content(text)

            # Last resort: any text content in the document.
            text = soup.get_text(strip=True)
            if text:
                return self._extract_title_from_content(text)

            return "Untitled Section"

        except Exception:
            # Never raise from title extraction; malformed input gets a stub.
            return "Untitled Section"

    def _extract_title_from_content(self, content: str) -> str:
        """Derive a title from plain text: first line, capped at 100 characters."""
        if not content:
            return "Untitled Section"

        lines = content.strip().split("\n")
        first_line = lines[0].strip() if lines else ""

        if first_line:
            # Slicing already handles strings shorter than the cap.
            return first_line[:100]

        return "Untitled Section"