Coverage for src/qdrant_loader/core/chunking/strategy/html/html_metadata_extractor.py: 75%
224 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
1"""HTML-specific metadata extractor for enhanced HTML document analysis."""
3import re
4from typing import Any
6from bs4 import BeautifulSoup
8from qdrant_loader.core.chunking.strategy.base.metadata_extractor import (
9 BaseMetadataExtractor,
10)
11from qdrant_loader.core.document import Document
13from .html_document_parser import HTMLDocumentParser
16class HTMLMetadataExtractor(BaseMetadataExtractor):
17 """Metadata extractor for HTML documents with semantic and accessibility analysis."""
19 def __init__(self):
20 """Initialize the HTML metadata extractor."""
21 self.document_parser = HTMLDocumentParser()
23 def extract_hierarchical_metadata(
24 self, content: str, chunk_metadata: dict[str, Any], document: Document
25 ) -> dict[str, Any]:
26 """Extract HTML-specific hierarchical metadata."""
27 try:
28 soup = BeautifulSoup(content, "html.parser")
30 metadata = chunk_metadata.copy()
32 # Add HTML-specific metadata
33 metadata.update(
34 {
35 "dom_path": self._build_dom_path_breadcrumb(soup),
36 "semantic_tags": self._extract_semantic_tags(soup),
37 "accessibility_score": self._calculate_accessibility_score(soup),
38 "has_structured_data": self._has_structured_data(soup),
39 "interactive_elements": self._analyze_interactive_elements(soup),
40 "media_elements": self._analyze_media_elements(soup),
41 "content_type": "html",
42 "html_features": self._analyze_html_features(soup),
43 "seo_indicators": self._analyze_seo_indicators(soup),
44 "markup_quality": self._assess_markup_quality(soup),
45 }
46 )
48 return metadata
50 except Exception as e:
51 # Fallback metadata for malformed HTML
52 metadata = chunk_metadata.copy()
53 metadata.update(
54 {
55 "content_type": "html_malformed",
56 "parse_error": str(e),
57 "dom_path": "unknown",
58 "semantic_tags": [],
59 "accessibility_score": 0.0,
60 "has_structured_data": False,
61 }
62 )
63 return metadata
65 def extract_entities(self, text: str) -> list[str]:
66 """Extract HTML-specific entities including semantic elements and IDs."""
67 try:
68 soup = BeautifulSoup(text, "html.parser")
69 entities = []
71 # Extract IDs as entities
72 for element in soup.find_all(id=True):
73 entities.append(f"#{element.get('id')}")
75 # Extract class names as entities
76 for element in soup.find_all(class_=True):
77 classes = element.get("class", [])
78 entities.extend([f".{cls}" for cls in classes])
80 # Extract semantic element types
81 semantic_elements = soup.find_all(
82 list(self.document_parser.section_elements)
83 )
84 entities.extend([elem.name for elem in semantic_elements])
86 # Extract link destinations
87 for link in soup.find_all("a", href=True):
88 href = link["href"]
89 if href.startswith("#"):
90 entities.append(href) # Internal link
91 elif href.startswith("http"):
92 entities.append(href) # External link
94 # Remove duplicates and limit
95 return list(set(entities))[:50]
97 except Exception:
98 return []
100 def _build_dom_path_breadcrumb(self, soup: BeautifulSoup) -> str:
101 """Build a DOM path breadcrumb for context."""
102 try:
103 # Find the deepest meaningful element
104 meaningful_elements = []
106 for element in soup.find_all():
107 if (
108 element.name in self.document_parser.section_elements
109 or element.name in self.document_parser.heading_elements
110 or element.get("id")
111 or element.get("role")
112 ):
113 meaningful_elements.append(element)
115 if not meaningful_elements:
116 return "body"
118 # Build path from the first meaningful element
119 element = meaningful_elements[0]
120 path_parts = []
122 while element and len(path_parts) < 5: # Limit depth
123 part = element.name
124 if element.get("id"):
125 part += f"#{element.get('id')}"
126 elif element.get("class"):
127 classes = element.get("class", [])[:2] # Limit classes
128 part += f".{'.'.join(classes)}"
130 path_parts.append(part)
131 element = element.parent
133 # Stop at body or html
134 if element and element.name in ["body", "html"]:
135 break
137 return " > ".join(reversed(path_parts)) if path_parts else "body"
139 except Exception:
140 return "unknown"
142 def _extract_semantic_tags(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
143 """Extract semantic HTML tags and their properties."""
144 semantic_tags = []
146 try:
147 for element in soup.find_all():
148 if element.name in self.document_parser.section_elements:
149 tag_info = {
150 "tag": element.name,
151 "role": element.get("role"),
152 "id": element.get("id"),
153 "classes": element.get("class", [])[:3], # Limit classes
154 "has_content": bool(element.get_text(strip=True)),
155 "child_count": len(element.find_all()),
156 }
157 semantic_tags.append(tag_info)
159 return semantic_tags[:10] # Limit results
161 except Exception:
162 return []
164 def _calculate_accessibility_score(self, soup: BeautifulSoup) -> float:
165 """Calculate an accessibility score for the HTML content."""
166 try:
167 score = 0.0
168 max_score = 10.0
170 # Check for lang attribute
171 if soup.find("html", lang=True):
172 score += 1.0
174 # Check image alt texts
175 images = soup.find_all("img")
176 if images:
177 images_with_alt = len(
178 [img for img in images if img.get("alt") is not None]
179 )
180 score += (images_with_alt / len(images)) * 2.0
181 else:
182 score += 2.0 # No images, full score
184 # Check heading hierarchy
185 headings = soup.find_all(list(self.document_parser.heading_elements))
186 if headings:
187 # Simple check: first heading should be h1
188 if headings[0].name == "h1":
189 score += 1.0
190 # Check for proper nesting (simplified)
191 proper_nesting = True
192 prev_level = 0
193 for heading in headings:
194 level = int(heading.name[1])
195 if prev_level > 0 and level > prev_level + 1:
196 proper_nesting = False
197 break
198 prev_level = level
199 if proper_nesting:
200 score += 1.0
202 # Check for skip links
203 skip_indicators = ["skip", "jump", "goto"]
204 for link in soup.find_all("a", href=True):
205 link_text = link.get_text(strip=True).lower()
206 if any(indicator in link_text for indicator in skip_indicators):
207 score += 1.0
208 break
210 # Check form labels
211 forms = soup.find_all("form")
212 if forms:
213 inputs = soup.find_all(["input", "textarea", "select"])
214 labels = soup.find_all("label")
215 if inputs:
216 label_ratio = len(labels) / len(inputs)
217 score += min(label_ratio, 1.0) * 2.0
218 else:
219 score += 2.0 # No forms, full score
221 # Check for ARIA attributes
222 aria_elements = soup.find_all(attrs={"role": True})
223 aria_elements.extend(soup.find_all(attrs=re.compile(r"^aria-")))
224 if aria_elements:
225 score += 1.0
227 # Check for semantic HTML5 elements
228 semantic_count = len(
229 soup.find_all(list(self.document_parser.section_elements))
230 )
231 if semantic_count > 0:
232 score += min(
233 semantic_count / 3.0, 1.0
234 ) # Up to 1 point for semantic elements
236 return min(score / max_score, 1.0) # Normalize to 0-1
238 except Exception:
239 return 0.0
241 def _has_structured_data(self, soup: BeautifulSoup) -> bool:
242 """Check if the HTML contains structured data."""
243 try:
244 # Check for JSON-LD
245 json_ld = soup.find("script", type="application/ld+json")
246 if json_ld:
247 return True
249 # Check for microdata
250 microdata = soup.find_all(attrs={"itemscope": True})
251 if microdata:
252 return True
254 # Check for RDFa
255 rdfa = soup.find_all(attrs={"property": True})
256 if rdfa:
257 return True
259 # Check for Open Graph
260 og_tags = soup.find_all("meta", property=re.compile(r"^og:"))
261 if og_tags:
262 return True
264 # Check for Twitter Cards
265 twitter_tags = soup.find_all(
266 "meta", attrs={"name": re.compile(r"^twitter:")}
267 )
268 if twitter_tags:
269 return True
271 return False
273 except Exception:
274 return False
276 def _analyze_interactive_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
277 """Analyze interactive elements in the HTML."""
278 try:
279 interactive = {
280 "forms": len(soup.find_all("form")),
281 "buttons": len(
282 soup.find_all(
283 ["button", "input[type='button']", "input[type='submit']"]
284 )
285 ),
286 "links": len(soup.find_all("a", href=True)),
287 "inputs": len(soup.find_all(["input", "textarea", "select"])),
288 "clickable_elements": 0,
289 "has_javascript_events": False,
290 }
292 # Count elements with click events
293 clickable = soup.find_all(attrs=re.compile(r"^on(click|touch|mouse)"))
294 interactive["clickable_elements"] = len(clickable)
296 # Check for JavaScript event handlers
297 js_events = soup.find_all(attrs=re.compile(r"^on[a-z]+"))
298 interactive["has_javascript_events"] = len(js_events) > 0
300 return interactive
302 except Exception:
303 return {}
305 def _analyze_media_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
306 """Analyze media elements in the HTML."""
307 try:
308 media = {
309 "images": len(soup.find_all("img")),
310 "videos": len(soup.find_all("video")),
311 "audio": len(soup.find_all("audio")),
312 "iframes": len(soup.find_all("iframe")),
313 "canvas": len(soup.find_all("canvas")),
314 "svg": len(soup.find_all("svg")),
315 }
317 # Analyze image properties
318 images = soup.find_all("img")
319 if images:
320 media["images_with_alt"] = len(
321 [img for img in images if img.get("alt")]
322 )
323 media["images_with_title"] = len(
324 [img for img in images if img.get("title")]
325 )
326 media["responsive_images"] = len(
327 [img for img in images if img.get("srcset")]
328 )
330 return media
332 except Exception:
333 return {}
335 def _analyze_html_features(self, soup: BeautifulSoup) -> dict[str, Any]:
336 """Analyze HTML5 and modern web features."""
337 try:
338 features = {
339 "html5_semantic_tags": 0,
340 "custom_elements": 0,
341 "data_attributes": 0,
342 "css_classes": 0,
343 "inline_styles": 0,
344 }
346 # Count HTML5 semantic tags
347 features["html5_semantic_tags"] = len(
348 soup.find_all(list(self.document_parser.section_elements))
349 )
351 # Count custom elements (tags with hyphens)
352 custom_elements = soup.find_all(lambda tag: tag.name and "-" in tag.name)
353 features["custom_elements"] = len(custom_elements)
355 # Count data attributes
356 data_attrs = soup.find_all(attrs=re.compile(r"^data-"))
357 features["data_attributes"] = len(data_attrs)
359 # Count CSS classes
360 elements_with_class = soup.find_all(class_=True)
361 features["css_classes"] = sum(
362 len(elem.get("class", [])) for elem in elements_with_class
363 )
365 # Count inline styles
366 features["inline_styles"] = len(soup.find_all(style=True))
368 return features
370 except Exception:
371 return {}
373 def _analyze_seo_indicators(self, soup: BeautifulSoup) -> dict[str, Any]:
374 """Analyze SEO-related indicators."""
375 try:
376 seo = {
377 "has_title": False,
378 "has_meta_description": False,
379 "has_h1": False,
380 "heading_count": 0,
381 "internal_links": 0,
382 "external_links": 0,
383 "has_canonical": False,
384 "has_robots_meta": False,
385 }
387 # Check for title
388 title = soup.find("title")
389 seo["has_title"] = bool(title and title.get_text(strip=True))
391 # Check for meta description
392 meta_desc = soup.find("meta", attrs={"name": "description"})
393 seo["has_meta_description"] = bool(meta_desc and meta_desc.get("content"))
395 # Check for H1
396 h1 = soup.find("h1")
397 seo["has_h1"] = bool(h1 and h1.get_text(strip=True))
399 # Count headings
400 headings = soup.find_all(list(self.document_parser.heading_elements))
401 seo["heading_count"] = len(headings)
403 # Analyze links
404 links = soup.find_all("a", href=True)
405 for link in links:
406 href = link["href"]
407 if href.startswith(("http://", "https://")) and "://" in href:
408 seo["external_links"] += 1
409 else:
410 seo["internal_links"] += 1
412 # Check for canonical link
413 canonical = soup.find("link", rel="canonical")
414 seo["has_canonical"] = bool(canonical)
416 # Check for robots meta
417 robots = soup.find("meta", attrs={"name": "robots"})
418 seo["has_robots_meta"] = bool(robots)
420 return seo
422 except Exception:
423 return {}
425 def _assess_markup_quality(self, soup: BeautifulSoup) -> dict[str, Any]:
426 """Assess the quality of HTML markup."""
427 try:
428 quality = {
429 "semantic_ratio": 0.0,
430 "accessibility_features": 0,
431 "deprecated_tags": 0,
432 "inline_styles": 0,
433 "proper_nesting": True,
434 "valid_attributes": True,
435 }
437 # Calculate semantic ratio
438 all_elements = soup.find_all()
439 semantic_elements = soup.find_all(
440 list(self.document_parser.section_elements)
441 )
442 if all_elements:
443 quality["semantic_ratio"] = len(semantic_elements) / len(all_elements)
445 # Count accessibility features
446 accessibility_features = 0
447 if soup.find_all(alt=True):
448 accessibility_features += 1
449 if soup.find_all(attrs={"role": True}):
450 accessibility_features += 1
451 if soup.find_all(attrs=re.compile(r"^aria-")):
452 accessibility_features += 1
453 if soup.find_all("label"):
454 accessibility_features += 1
455 quality["accessibility_features"] = accessibility_features
457 # Count deprecated tags (simplified list)
458 deprecated_tags = ["font", "center", "big", "small", "strike", "tt"]
459 quality["deprecated_tags"] = sum(
460 len(soup.find_all(tag)) for tag in deprecated_tags
461 )
463 # Count inline styles
464 quality["inline_styles"] = len(soup.find_all(style=True))
466 return quality
468 except Exception:
469 return {}