Coverage for src/qdrant_loader/core/chunking/strategy/html/html_document_parser.py: 77%
160 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
1"""HTML-specific document parser for DOM structure analysis."""
3import re
4from enum import Enum
5from typing import Any
7from bs4 import BeautifulSoup, Tag
9from qdrant_loader.core.chunking.strategy.base.document_parser import BaseDocumentParser
class SectionType(Enum):
    """Enumerates the kinds of sections recognized in an HTML document.

    Each member's value is the lowercase string identifier used in
    section metadata dictionaries (e.g. ``section_type`` keys).
    """

    # Semantic / landmark elements.
    HEADER = "header"
    ARTICLE = "article"
    SECTION = "section"
    NAV = "nav"
    ASIDE = "aside"
    MAIN = "main"
    # Content-level elements.
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    CODE_BLOCK = "code_block"
    BLOCKQUOTE = "blockquote"
    DIV = "div"
    FOOTER = "footer"
class HTMLDocumentParser(BaseDocumentParser):
    """Parser for HTML documents with semantic analysis.

    Produces structural metadata (heading hierarchy, semantic landmarks,
    link classification, accessibility signals) from raw HTML markup
    using BeautifulSoup with the stdlib ``html.parser`` backend.
    """

    def __init__(self):
        """Initialize the HTML document parser."""
        # Semantic HTML5 elements treated as section boundaries.
        self.section_elements = {
            "article",
            "section",
            "main",
            "header",
            "footer",
            "nav",
            "aside",
        }

        # Heading elements used to build the document hierarchy.
        self.heading_elements = {"h1", "h2", "h3", "h4", "h5", "h6"}

        # Block-level elements that can form chunks.
        self.block_elements = {
            "div",
            "p",
            "blockquote",
            "pre",
            "ul",
            "ol",
            "li",
            "table",
            "figure",
            "form",
        }

    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Parse HTML DOM structure and extract semantic information.

        Args:
            content: Raw HTML markup.

        Returns:
            A dict describing heading hierarchy, semantic elements, link
            counts, landmark presence flags, accessibility features and
            element counts. If parsing fails, a zeroed fallback dict with
            ``structure_type == "html_malformed"`` and a ``parse_error``
            message is returned instead of raising.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Remove script and style elements for cleaner analysis.
            for removable in soup(["script", "style"]):
                removable.decompose()

            # Extract document outline.
            headings = self._extract_heading_hierarchy(soup)
            semantic_elements = self._identify_semantic_elements(soup)
            links = self._extract_links(soup)
            accessibility = self._analyze_accessibility(soup)

            # Single pass: every link is either internal or external.
            internal_count = sum(
                1 for link in links if link.get("internal", False)
            )

            return {
                "heading_hierarchy": headings,
                "semantic_elements": semantic_elements,
                "internal_links": internal_count,
                "external_links": len(links) - internal_count,
                "has_navigation": bool(soup.find("nav")),
                "has_main_content": bool(soup.find("main")),
                "has_header": bool(soup.find("header")),
                "has_footer": bool(soup.find("footer")),
                "has_aside": bool(soup.find("aside")),
                "structure_type": "html",
                "accessibility_features": accessibility,
                "form_count": len(soup.find_all("form")),
                "table_count": len(soup.find_all("table")),
                "image_count": len(soup.find_all("img")),
                "list_count": len(soup.find_all(["ul", "ol"])),
                "content_sections": len(soup.find_all(list(self.section_elements))),
            }
        except Exception as e:
            # Fallback structure for malformed HTML — deliberately
            # best-effort: callers get a usable (empty) structure plus
            # the error message rather than an exception.
            return {
                "heading_hierarchy": [],
                "semantic_elements": [],
                "internal_links": 0,
                "external_links": 0,
                "has_navigation": False,
                "has_main_content": False,
                "has_header": False,
                "has_footer": False,
                "has_aside": False,
                "structure_type": "html_malformed",
                "accessibility_features": {},
                "form_count": 0,
                "table_count": 0,
                "image_count": 0,
                "list_count": 0,
                "content_sections": 0,
                "parse_error": str(e),
            }

    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Extract metadata from an HTML section.

        Accepts an already-built metadata dict (returned unchanged), a
        BeautifulSoup ``Tag`` (fully analyzed), or any other value, which
        is treated as raw HTML text and probed with regexes.
        """
        if isinstance(section, dict):
            # Already processed section metadata — pass through.
            return section

        if isinstance(section, Tag):
            return self._extract_tag_metadata(section)

        # Fallback for string content: cheap regex probes only.
        raw = str(section)
        return {
            "tag_name": "div",
            "section_type": SectionType.DIV.value,
            "level": 0,
            "attributes": {},
            "has_links": bool(re.search(r"<a\s+[^>]*href", raw)),
            "has_images": bool(re.search(r"<img\s+[^>]*src", raw)),
        }

    def _extract_heading_hierarchy(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract the document heading hierarchy in document order.

        Returns one dict per h1–h6 element with its numeric level,
        stripped text, tag name, id and classes.
        """
        headings: list[dict[str, Any]] = []

        for heading in soup.find_all(list(self.heading_elements)):
            headings.append(
                {
                    # "h3" -> 3: the level is the digit after the "h".
                    "level": int(heading.name[1]),
                    "text": heading.get_text(strip=True),
                    "tag": heading.name,
                    "id": heading.get("id"),
                    "classes": heading.get("class", []),
                }
            )

        return headings

    def _identify_semantic_elements(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Identify semantic HTML landmark elements and their roles."""
        return [
            {
                "tag": element.name,
                "role": element.get("role"),
                "id": element.get("id"),
                "classes": element.get("class", []),
                "text_length": len(element.get_text(strip=True)),
                "has_children": bool(element.find_all()),
            }
            for element in soup.find_all(list(self.section_elements))
        ]

    def _extract_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract and categorize anchor links.

        A link is internal when it is a fragment, path, or otherwise
        scheme-less reference. Scheme-qualified links (http, https,
        mailto, tel) and protocol-relative links (``//host``) point
        outside the document and are external.
        """
        links: list[dict[str, Any]] = []

        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]

            # Protocol-relative URLs ("//example.com/...") target another
            # host, so they are external even though they start with "/".
            is_internal = not (
                href.startswith("//")
                or href.startswith(("http://", "https://", "mailto:", "tel:"))
            )

            links.append(
                {
                    "href": href,
                    "text": anchor.get_text(strip=True),
                    "internal": is_internal,
                    "title": anchor.get("title"),
                    "target": anchor.get("target"),
                }
            )

        return links

    def _analyze_accessibility(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze accessibility features of the HTML document.

        Checks: lang attribute, <title>, img alt coverage, heading nesting
        (no level skipped going down), skip links, and form label/input counts.
        """
        accessibility: dict[str, Any] = {
            "has_lang_attribute": bool(soup.find("html", lang=True)),
            "has_title": bool(soup.find("title")),
            "images_with_alt": 0,
            "images_without_alt": 0,
            "headings_properly_nested": True,
            "has_skip_links": False,
            "form_labels": 0,
            "form_inputs": 0,
        }

        # Count images with and without alt text (empty alt="" counts as present).
        for img in soup.find_all("img"):
            if img.get("alt") is not None:
                accessibility["images_with_alt"] += 1
            else:
                accessibility["images_without_alt"] += 1

        # Heuristic skip-link detection by link text keywords.
        skip_link_indicators = ["skip", "jump", "goto"]
        for anchor in soup.find_all("a", href=True):
            link_text = anchor.get_text(strip=True).lower()
            if any(indicator in link_text for indicator in skip_link_indicators):
                accessibility["has_skip_links"] = True
                break

        # Form controls vs labels (a rough label-coverage signal).
        accessibility["form_inputs"] = len(
            soup.find_all(["input", "textarea", "select"])
        )
        accessibility["form_labels"] = len(soup.find_all("label"))

        # Heading nesting check (simplified): flag when a heading jumps
        # more than one level deeper than its predecessor (e.g. h2 -> h4).
        headings = soup.find_all(list(self.heading_elements))
        if len(headings) > 1:
            prev_level = 0
            for heading in headings:
                level = int(heading.name[1])
                if prev_level > 0 and level > prev_level + 1:
                    accessibility["headings_properly_nested"] = False
                    break
                prev_level = level

        return accessibility

    def _extract_tag_metadata(self, tag: Tag) -> dict[str, Any]:
        """Extract metadata from a BeautifulSoup tag.

        Only a small, fixed set of attributes (id, class, role and
        ``data-*``) is retained for performance.
        """
        tag_name = tag.name.lower()
        section_type = self._identify_section_type(tag)

        attributes: dict[str, Any] = {}
        if tag.attrs:
            for attr in ("id", "class", "role"):
                if attr in tag.attrs:
                    attributes[attr] = tag.attrs[attr]
            # Collect all data-* attributes under a single key.
            data_attrs = {
                k: v for k, v in tag.attrs.items() if k.startswith("data-")
            }
            if data_attrs:
                attributes["data_attributes"] = data_attrs

        text_content = tag.get_text(strip=True)

        return {
            "tag_name": tag_name,
            "section_type": section_type.value,
            "level": self._get_heading_level(tag),
            "attributes": attributes,
            "text_content": text_content,
            "word_count": len(text_content.split()),
            "char_count": len(text_content),
            "has_code": section_type == SectionType.CODE_BLOCK,
            "has_links": bool(tag.find_all("a")),
            "has_images": bool(tag.find_all("img")),
            "is_semantic": tag_name in self.section_elements,
            "is_heading": tag_name in self.heading_elements,
            "child_count": len(tag.find_all()),
        }

    def _identify_section_type(self, tag: Tag) -> SectionType:
        """Identify the type of section based on the HTML tag.

        Note: the literal ``<header>`` element intentionally maps to DIV
        here; ``SectionType.HEADER`` is reserved for h1–h6 headings.
        """
        tag_name = tag.name.lower()

        if tag_name in self.heading_elements:
            return SectionType.HEADER

        tag_to_type = {
            "article": SectionType.ARTICLE,
            "section": SectionType.SECTION,
            "nav": SectionType.NAV,
            "aside": SectionType.ASIDE,
            "main": SectionType.MAIN,
            "footer": SectionType.FOOTER,
            "ul": SectionType.LIST,
            "ol": SectionType.LIST,
            "li": SectionType.LIST,
            "table": SectionType.TABLE,
            "pre": SectionType.CODE_BLOCK,
            "code": SectionType.CODE_BLOCK,
            "blockquote": SectionType.BLOCKQUOTE,
            "p": SectionType.PARAGRAPH,
        }
        return tag_to_type.get(tag_name, SectionType.DIV)

    def _get_heading_level(self, tag: Tag) -> int:
        """Return the heading level (1-6) for h1-h6 tags, else 0."""
        if tag.name.lower() in self.heading_elements:
            return int(tag.name[1])  # "h2" -> 2
        return 0

    def extract_section_title(self, content: str) -> str:
        """Extract a title from HTML content.

        Preference order: first h1-h6 or <title> text, then the leading
        text of a semantic container, then any document text. Returns
        "Untitled Section" when nothing usable is found or parsing fails.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Try explicit title-bearing elements first.
            for tag in ["h1", "h2", "h3", "h4", "h5", "h6", "title"]:
                element = soup.find(tag)
                if element:
                    title = element.get_text(strip=True)
                    if title:
                        return title[:100]  # Limit title length

            # Fall back to text inside semantic containers.
            for tag in ["article", "section", "main"]:
                element = soup.find(tag)
                if element:
                    text = element.get_text(strip=True)
                    if text:
                        return self._extract_title_from_content(text)

            # Last resort: any text content in the document.
            text = soup.get_text(strip=True)
            if text:
                return self._extract_title_from_content(text)

            return "Untitled Section"

        except Exception:
            return "Untitled Section"

    def _extract_title_from_content(self, content: str) -> str:
        """Derive a title from plain text: first non-empty line, capped at 100 chars."""
        if not content:
            return "Untitled Section"

        lines = content.strip().split("\n")
        first_line = lines[0].strip() if lines else ""

        if first_line:
            return first_line[:100]

        return "Untitled Section"