Coverage for src/qdrant_loader/core/chunking/strategy/html/html_metadata_extractor.py: 75% (224 statements)


1"""HTML-specific metadata extractor for enhanced HTML document analysis.""" 

2 

3import re 

4from typing import Any 

5 

6from bs4 import BeautifulSoup 

7 

8from qdrant_loader.core.chunking.strategy.base.metadata_extractor import ( 

9 BaseMetadataExtractor, 

10) 

11from qdrant_loader.core.document import Document 

12 

13from .html_document_parser import HTMLDocumentParser 

14 

15 

16class HTMLMetadataExtractor(BaseMetadataExtractor): 

17 """Metadata extractor for HTML documents with semantic and accessibility analysis.""" 

18 

19 def __init__(self): 

20 """Initialize the HTML metadata extractor.""" 

21 self.document_parser = HTMLDocumentParser() 

22 

23 def extract_hierarchical_metadata( 

24 self, content: str, chunk_metadata: dict[str, Any], document: Document 

25 ) -> dict[str, Any]: 

26 """Extract HTML-specific hierarchical metadata.""" 

27 try: 

28 soup = BeautifulSoup(content, "html.parser") 

29 

30 metadata = chunk_metadata.copy() 

31 

32 # Add HTML-specific metadata 

33 metadata.update( 

34 { 

35 "dom_path": self._build_dom_path_breadcrumb(soup), 

36 "semantic_tags": self._extract_semantic_tags(soup), 

37 "accessibility_score": self._calculate_accessibility_score(soup), 

38 "has_structured_data": self._has_structured_data(soup), 

39 "interactive_elements": self._analyze_interactive_elements(soup), 

40 "media_elements": self._analyze_media_elements(soup), 

41 "content_type": "html", 

42 "html_features": self._analyze_html_features(soup), 

43 "seo_indicators": self._analyze_seo_indicators(soup), 

44 "markup_quality": self._assess_markup_quality(soup), 

45 } 

46 ) 

47 

48 return metadata 

49 

50 except Exception as e: 

51 # Fallback metadata for malformed HTML 

52 metadata = chunk_metadata.copy() 

53 metadata.update( 

54 { 

55 "content_type": "html_malformed", 

56 "parse_error": str(e), 

57 "dom_path": "unknown", 

58 "semantic_tags": [], 

59 "accessibility_score": 0.0, 

60 "has_structured_data": False, 

61 } 

62 ) 

63 return metadata 

64 
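    # Usage sketch (illustrative; assumes a qdrant_loader Document instance
    # named `document` and an incoming chunk-metadata dict):
    #
    #     extractor = HTMLMetadataExtractor()
    #     meta = extractor.extract_hierarchical_metadata(
    #         "<main id='content'><h1>Title</h1></main>", {}, document
    #     )
    #     meta["content_type"]  # "html", or "html_malformed" on parse failure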

    def extract_entities(self, text: str) -> list[str]:
        """Extract HTML-specific entities including semantic elements and IDs."""
        try:
            soup = BeautifulSoup(text, "html.parser")
            entities = []

            # Extract IDs as entities
            for element in soup.find_all(id=True):
                entities.append(f"#{element.get('id')}")

            # Extract class names as entities
            for element in soup.find_all(class_=True):
                classes = element.get("class", [])
                entities.extend([f".{cls}" for cls in classes])

            # Extract semantic element types
            semantic_elements = soup.find_all(
                list(self.document_parser.section_elements)
            )
            entities.extend([elem.name for elem in semantic_elements])

            # Extract link destinations
            for link in soup.find_all("a", href=True):
                href = link["href"]
                if href.startswith("#"):
                    entities.append(href)  # Internal link
                elif href.startswith("http"):
                    entities.append(href)  # External link

            # Remove duplicates and limit
            return list(set(entities))[:50]

        except Exception:
            return []
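    # The returned entities mix four shapes: "#some-id", ".some-class",
    # semantic tag names such as "nav", and raw hrefs ("#top", "https://...").
    # Because duplicates are removed via set(), the order of the (at most 50)
    # results is non-deterministic.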

    def _build_dom_path_breadcrumb(self, soup: BeautifulSoup) -> str:
        """Build a DOM path breadcrumb for context."""
        try:
            # Collect meaningful elements in document order
            meaningful_elements = []

            for element in soup.find_all():
                if (
                    element.name in self.document_parser.section_elements
                    or element.name in self.document_parser.heading_elements
                    or element.get("id")
                    or element.get("role")
                ):
                    meaningful_elements.append(element)

            if not meaningful_elements:
                return "body"

            # Build path from the first meaningful element
            element = meaningful_elements[0]
            path_parts = []

            while element and len(path_parts) < 5:  # Limit depth
                part = element.name
                if element.get("id"):
                    part += f"#{element.get('id')}"
                elif element.get("class"):
                    classes = element.get("class", [])[:2]  # Limit classes
                    part += f".{'.'.join(classes)}"

                path_parts.append(part)
                element = element.parent

                # Stop at body or html
                if element and element.name in ["body", "html"]:
                    break

            return " > ".join(reversed(path_parts)) if path_parts else "body"

        except Exception:
            return "unknown"
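    # Breadcrumb parts are rendered as name#id or name.cls1.cls2 (at most two
    # classes) and joined outermost-first, e.g. "div#main > section.intro";
    # depth is capped at five levels and the walk stops at <body>/<html>.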

    def _extract_semantic_tags(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract semantic HTML tags and their properties."""
        semantic_tags = []

        try:
            for element in soup.find_all():
                if element.name in self.document_parser.section_elements:
                    tag_info = {
                        "tag": element.name,
                        "role": element.get("role"),
                        "id": element.get("id"),
                        "classes": element.get("class", [])[:3],  # Limit classes
                        "has_content": bool(element.get_text(strip=True)),
                        "child_count": len(element.find_all()),
                    }
                    semantic_tags.append(tag_info)

            return semantic_tags[:10]  # Limit results

        except Exception:
            return []

    def _calculate_accessibility_score(self, soup: BeautifulSoup) -> float:
        """Calculate an accessibility score for the HTML content."""
        try:
            score = 0.0
            max_score = 10.0
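            # Scoring rubric (10 points, normalized to 0.0-1.0 on return):
            #   1.0  <html lang=...> present
            #   2.0  share of <img> elements carrying an alt attribute
            #   1.0  first heading is an <h1>
            #   1.0  heading levels never skip (e.g. h2 -> h4 fails)
            #   1.0  a "skip"/"jump"/"goto" link exists
            #   2.0  label-to-input ratio when forms are present
            #   1.0  any role= or aria-* attribute
            #   1.0  semantic HTML5 elements (three or more for full credit)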

            # Check for lang attribute
            if soup.find("html", lang=True):
                score += 1.0

            # Check image alt texts
            images = soup.find_all("img")
            if images:
                images_with_alt = len(
                    [img for img in images if img.get("alt") is not None]
                )
                score += (images_with_alt / len(images)) * 2.0
            else:
                score += 2.0  # No images, full score

            # Check heading hierarchy
            headings = soup.find_all(list(self.document_parser.heading_elements))
            if headings:
                # Simple check: first heading should be h1
                if headings[0].name == "h1":
                    score += 1.0
                # Check for proper nesting (simplified)
                proper_nesting = True
                prev_level = 0
                for heading in headings:
                    level = int(heading.name[1])
                    if prev_level > 0 and level > prev_level + 1:
                        proper_nesting = False
                        break
                    prev_level = level
                if proper_nesting:
                    score += 1.0

            # Check for skip links
            skip_indicators = ["skip", "jump", "goto"]
            for link in soup.find_all("a", href=True):
                link_text = link.get_text(strip=True).lower()
                if any(indicator in link_text for indicator in skip_indicators):
                    score += 1.0
                    break

            # Check form labels
            forms = soup.find_all("form")
            if forms:
                inputs = soup.find_all(["input", "textarea", "select"])
                labels = soup.find_all("label")
                if inputs:
                    label_ratio = len(labels) / len(inputs)
                    score += min(label_ratio, 1.0) * 2.0
            else:
                score += 2.0  # No forms, full score

            # Check for ARIA attributes (a non-dict attrs= value matches
            # classes in BeautifulSoup, so scan attribute names explicitly)
            aria_elements = soup.find_all(attrs={"role": True})
            aria_elements.extend(
                soup.find_all(
                    lambda tag: any(attr.startswith("aria-") for attr in tag.attrs)
                )
            )
            if aria_elements:
                score += 1.0

            # Check for semantic HTML5 elements
            semantic_count = len(
                soup.find_all(list(self.document_parser.section_elements))
            )
            if semantic_count > 0:
                score += min(
                    semantic_count / 3.0, 1.0
                )  # Up to 1 point for semantic elements

            return min(score / max_score, 1.0)  # Normalize to 0-1

        except Exception:
            return 0.0
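    # Worked example: "<html lang='en'><body><main><h1>T</h1>" +
    # "<img src='x' alt='d'></main></body></html>" scores 1 (lang) + 2 (alt)
    # + 1 (h1 first) + 1 (no skipped levels) + 2 (no forms) + 1/3 (one
    # semantic element, assuming <main> counts) = 7.33, i.e. roughly 0.73.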

    def _has_structured_data(self, soup: BeautifulSoup) -> bool:
        """Check if the HTML contains structured data."""
        try:
            # Check for JSON-LD
            json_ld = soup.find("script", type="application/ld+json")
            if json_ld:
                return True

            # Check for microdata
            microdata = soup.find_all(attrs={"itemscope": True})
            if microdata:
                return True

            # Check for RDFa
            rdfa = soup.find_all(attrs={"property": True})
            if rdfa:
                return True

            # Check for Open Graph
            og_tags = soup.find_all("meta", property=re.compile(r"^og:"))
            if og_tags:
                return True

            # Check for Twitter Cards
            twitter_tags = soup.find_all(
                "meta", attrs={"name": re.compile(r"^twitter:")}
            )
            if twitter_tags:
                return True

            return False

        except Exception:
            return False
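    # Markers recognized above, in check order:
    #   <script type="application/ld+json">  JSON-LD
    #   itemscope                            microdata
    #   property="..."                       RDFa (any property attribute)
    #   <meta property="og:...">             Open Graph
    #   <meta name="twitter:...">            Twitter Cards
    # Since Open Graph tags carry property=, they already satisfy the RDFa
    # check and short-circuit before the og: pattern is reached.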

    def _analyze_interactive_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze interactive elements in the HTML."""
        try:
            interactive = {
                "forms": len(soup.find_all("form")),
                # CSS selectors are not valid tag names for find_all(),
                # so use select() for the input-type variants
                "buttons": len(
                    soup.select("button, input[type='button'], input[type='submit']")
                ),
                "links": len(soup.find_all("a", href=True)),
                "inputs": len(soup.find_all(["input", "textarea", "select"])),
                "clickable_elements": 0,
                "has_javascript_events": False,
            }

            # Count elements with click events (scan attribute names; a
            # non-dict attrs= value would match classes instead)
            clickable = soup.find_all(
                lambda tag: any(
                    re.match(r"on(click|touch|mouse)", attr) for attr in tag.attrs
                )
            )
            interactive["clickable_elements"] = len(clickable)

            # Check for JavaScript event handlers
            js_events = soup.find_all(
                lambda tag: any(re.match(r"on[a-z]+", attr) for attr in tag.attrs)
            )
            interactive["has_javascript_events"] = len(js_events) > 0

            return interactive

        except Exception:
            return {}
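    # Example: '<form><input type="submit" onclick="go()"></form>' yields
    # {"forms": 1, "buttons": 1, "links": 0, "inputs": 1,
    #  "clickable_elements": 1, "has_javascript_events": True}.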

    def _analyze_media_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze media elements in the HTML."""
        try:
            media = {
                "images": len(soup.find_all("img")),
                "videos": len(soup.find_all("video")),
                "audio": len(soup.find_all("audio")),
                "iframes": len(soup.find_all("iframe")),
                "canvas": len(soup.find_all("canvas")),
                "svg": len(soup.find_all("svg")),
            }

            # Analyze image properties
            images = soup.find_all("img")
            if images:
                media["images_with_alt"] = len(
                    [img for img in images if img.get("alt")]
                )
                media["images_with_title"] = len(
                    [img for img in images if img.get("title")]
                )
                media["responsive_images"] = len(
                    [img for img in images if img.get("srcset")]
                )

            return media

        except Exception:
            return {}
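    # The three image sub-keys are only present when the chunk contains at
    # least one <img>. Note the asymmetry with the accessibility score:
    # images_with_alt here requires a truthy alt, so a decorative alt=""
    # is not counted.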

    def _analyze_html_features(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze HTML5 and modern web features."""
        try:
            features = {
                "html5_semantic_tags": 0,
                "custom_elements": 0,
                "data_attributes": 0,
                "css_classes": 0,
                "inline_styles": 0,
            }

            # Count HTML5 semantic tags
            features["html5_semantic_tags"] = len(
                soup.find_all(list(self.document_parser.section_elements))
            )

            # Count custom elements (tags with hyphens)
            custom_elements = soup.find_all(lambda tag: tag.name and "-" in tag.name)
            features["custom_elements"] = len(custom_elements)

            # Count elements carrying data-* attributes (scan attribute names;
            # a non-dict attrs= value would match classes instead)
            data_attrs = soup.find_all(
                lambda tag: any(attr.startswith("data-") for attr in tag.attrs)
            )
            features["data_attributes"] = len(data_attrs)

            # Count CSS classes
            elements_with_class = soup.find_all(class_=True)
            features["css_classes"] = sum(
                len(elem.get("class", [])) for elem in elements_with_class
            )

            # Count inline styles
            features["inline_styles"] = len(soup.find_all(style=True))

            return features

        except Exception:
            return {}
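    # Example: '<my-widget data-state="open" data-id="7" class="a b">' counts
    # one custom element (hyphenated tag name), one element with data-*
    # attributes (elements are counted, not individual attributes), and two
    # CSS classes.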

    def _analyze_seo_indicators(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze SEO-related indicators."""
        try:
            seo = {
                "has_title": False,
                "has_meta_description": False,
                "has_h1": False,
                "heading_count": 0,
                "internal_links": 0,
                "external_links": 0,
                "has_canonical": False,
                "has_robots_meta": False,
            }

            # Check for title
            title = soup.find("title")
            seo["has_title"] = bool(title and title.get_text(strip=True))

            # Check for meta description
            meta_desc = soup.find("meta", attrs={"name": "description"})
            seo["has_meta_description"] = bool(meta_desc and meta_desc.get("content"))

            # Check for H1
            h1 = soup.find("h1")
            seo["has_h1"] = bool(h1 and h1.get_text(strip=True))

            # Count headings
            headings = soup.find_all(list(self.document_parser.heading_elements))
            seo["heading_count"] = len(headings)

            # Analyze links
            links = soup.find_all("a", href=True)
            for link in links:
                href = link["href"]
                if href.startswith(("http://", "https://")):
                    seo["external_links"] += 1
                else:
                    seo["internal_links"] += 1

            # Check for canonical link
            canonical = soup.find("link", rel="canonical")
            seo["has_canonical"] = bool(canonical)

            # Check for robots meta
            robots = soup.find("meta", attrs={"name": "robots"})
            seo["has_robots_meta"] = bool(robots)

            return seo

        except Exception:
            return {}
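    # The link split is a heuristic: only hrefs starting with http:// or
    # https:// count as external (even when they point at the same site),
    # while relative paths, fragments, protocol-relative "//..." URLs, and
    # mailto: links all land in internal_links.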

    def _assess_markup_quality(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Assess the quality of HTML markup."""
        try:
            quality = {
                "semantic_ratio": 0.0,
                "accessibility_features": 0,
                "deprecated_tags": 0,
                "inline_styles": 0,
                "proper_nesting": True,
                "valid_attributes": True,
            }

            # Calculate semantic ratio
            all_elements = soup.find_all()
            semantic_elements = soup.find_all(
                list(self.document_parser.section_elements)
            )
            if all_elements:
                quality["semantic_ratio"] = len(semantic_elements) / len(all_elements)

            # Count accessibility features
            accessibility_features = 0
            if soup.find_all(alt=True):
                accessibility_features += 1
            if soup.find_all(attrs={"role": True}):
                accessibility_features += 1
            if soup.find_all(
                lambda tag: any(attr.startswith("aria-") for attr in tag.attrs)
            ):
                accessibility_features += 1
            if soup.find_all("label"):
                accessibility_features += 1
            quality["accessibility_features"] = accessibility_features

            # Count deprecated tags (simplified list)
            deprecated_tags = ["font", "center", "big", "small", "strike", "tt"]
            quality["deprecated_tags"] = sum(
                len(soup.find_all(tag)) for tag in deprecated_tags
            )

            # Count inline styles
            quality["inline_styles"] = len(soup.find_all(style=True))

            return quality

        except Exception:
            return {}
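
# Minimal smoke test, a sketch only: it assumes bs4 is installed and that the
# qdrant_loader imports above resolve. extract_entities() needs no Document,
# so it is the easiest entry point to exercise by hand.
if __name__ == "__main__":
    sample = (
        '<html lang="en"><body><main id="content" class="page">'
        '<h1>Title</h1><a href="#top">Skip to top</a>'
        '<img src="diagram.png" alt="diagram"></main></body></html>'
    )
    extractor = HTMLMetadataExtractor()
    print(extractor.extract_entities(sample))
    soup = BeautifulSoup(sample, "html.parser")
    print(extractor._calculate_accessibility_score(soup))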