Coverage for src/qdrant_loader/core/chunking/strategy/html/html_document_parser.py: 77%
160 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""HTML-specific document parser for DOM structure analysis."""
3import re
4from enum import Enum
5from typing import Any
7from bs4 import BeautifulSoup, Tag
9from qdrant_loader.core.chunking.strategy.base.document_parser import BaseDocumentParser
class SectionType(Enum):
    """Types of sections in an HTML document.

    Values are lowercase strings stored in section metadata dicts under
    the ``section_type`` key (see ``HTMLDocumentParser``).
    """

    # Semantic/landmark containers.
    HEADER = "header"
    ARTICLE = "article"
    SECTION = "section"
    NAV = "nav"
    ASIDE = "aside"
    MAIN = "main"
    # Block-level content types.
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    CODE_BLOCK = "code_block"
    BLOCKQUOTE = "blockquote"
    # Generic fallback for unrecognized tags.
    DIV = "div"
    FOOTER = "footer"
class HTMLDocumentParser(BaseDocumentParser):
    """Parser for HTML documents with semantic DOM analysis.

    Uses BeautifulSoup to derive document structure (heading hierarchy,
    semantic landmark elements, link classification) plus a small set of
    accessibility signals from raw HTML markup.
    """

    # href prefixes that mark a link as leaving the current document/site.
    # Protocol-relative URLs ("//host/path") target another origin, so they
    # are external too (previously they were misclassified as internal
    # because they start with "/").
    _EXTERNAL_HREF_PREFIXES = ("http://", "https://", "mailto:", "tel:", "//")

    # Tag-name -> SectionType dispatch used by _identify_section_type.
    # Headings (h1-h6) are handled separately; unlisted tags map to DIV.
    _TAG_SECTION_TYPES = {
        "article": SectionType.ARTICLE,
        "section": SectionType.SECTION,
        "nav": SectionType.NAV,
        "aside": SectionType.ASIDE,
        "main": SectionType.MAIN,
        "footer": SectionType.FOOTER,
        "ul": SectionType.LIST,
        "ol": SectionType.LIST,
        "li": SectionType.LIST,
        "table": SectionType.TABLE,
        "pre": SectionType.CODE_BLOCK,
        "code": SectionType.CODE_BLOCK,
        "blockquote": SectionType.BLOCKQUOTE,
        "p": SectionType.PARAGRAPH,
    }

    def __init__(self):
        """Initialize the tag-name sets used for structural classification."""
        # Semantic HTML elements that should be treated as section boundaries.
        self.section_elements = {
            "article",
            "section",
            "main",
            "header",
            "footer",
            "nav",
            "aside",
        }

        # Heading elements used to build the document hierarchy.
        self.heading_elements = {"h1", "h2", "h3", "h4", "h5", "h6"}

        # Block-level elements that can form chunks on their own.
        self.block_elements = {
            "div",
            "p",
            "blockquote",
            "pre",
            "ul",
            "ol",
            "li",
            "table",
            "figure",
            "form",
        }

    def parse_document_structure(self, content: str) -> dict[str, Any]:
        """Parse HTML DOM structure and extract semantic information.

        Args:
            content: Raw HTML markup.

        Returns:
            A metadata dict describing the document: heading hierarchy,
            semantic elements, internal/external link counts, landmark
            flags, accessibility features and element counts. If parsing
            fails, a zeroed fallback dict is returned with
            ``structure_type == "html_malformed"`` and the error message
            under ``"parse_error"``.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Drop script/style subtrees so text and structure analysis is
            # not polluted by embedded code.
            for removable in soup(["script", "style"]):
                removable.decompose()

            headings = self._extract_heading_hierarchy(soup)
            semantic_elements = self._identify_semantic_elements(soup)
            links = self._extract_links(soup)
            accessibility = self._analyze_accessibility(soup)

            # Every link dict carries a boolean "internal" flag, so the
            # external count is simply the remainder.
            internal_links = sum(1 for link in links if link.get("internal", False))

            return {
                "heading_hierarchy": headings,
                "semantic_elements": semantic_elements,
                "internal_links": internal_links,
                "external_links": len(links) - internal_links,
                "has_navigation": bool(soup.find("nav")),
                "has_main_content": bool(soup.find("main")),
                "has_header": bool(soup.find("header")),
                "has_footer": bool(soup.find("footer")),
                "has_aside": bool(soup.find("aside")),
                "structure_type": "html",
                "accessibility_features": accessibility,
                "form_count": len(soup.find_all("form")),
                "table_count": len(soup.find_all("table")),
                "image_count": len(soup.find_all("img")),
                "list_count": len(soup.find_all(["ul", "ol"])),
                "content_sections": len(soup.find_all(list(self.section_elements))),
            }
        except Exception as e:
            # Deliberate best-effort: malformed HTML degrades to a fallback
            # structure instead of propagating the parser error.
            return self._fallback_structure(e)

    def _fallback_structure(self, error: Exception) -> dict[str, Any]:
        """Build the zeroed structure dict returned for unparseable HTML."""
        return {
            "heading_hierarchy": [],
            "semantic_elements": [],
            "internal_links": 0,
            "external_links": 0,
            "has_navigation": False,
            "has_main_content": False,
            "has_header": False,
            "has_footer": False,
            "has_aside": False,
            "structure_type": "html_malformed",
            "accessibility_features": {},
            "form_count": 0,
            "table_count": 0,
            "image_count": 0,
            "list_count": 0,
            "content_sections": 0,
            "parse_error": str(error),
        }

    def extract_section_metadata(self, section: Any) -> dict[str, Any]:
        """Extract metadata from an HTML section.

        Accepts an already-processed metadata dict (returned unchanged),
        a BeautifulSoup ``Tag`` (fully analyzed), or any other object,
        which is stringified and probed with regexes as a last resort.
        """
        if isinstance(section, dict):
            # Already processed section metadata.
            return section

        if isinstance(section, Tag):
            return self._extract_tag_metadata(section)

        # Fallback for raw string content: cheap regex probes only.
        raw = str(section)
        return {
            "tag_name": "div",
            "section_type": SectionType.DIV.value,
            "level": 0,
            "attributes": {},
            "has_links": bool(re.search(r"<a\s+[^>]*href", raw)),
            "has_images": bool(re.search(r"<img\s+[^>]*src", raw)),
        }

    def _extract_heading_hierarchy(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract the document heading hierarchy in document order."""
        return [
            {
                # Extract the numeric level from the tag name (h1 -> 1, ...).
                "level": int(heading.name[1]),
                "text": heading.get_text(strip=True),
                "tag": heading.name,
                "id": heading.get("id"),
                "classes": heading.get("class", []),
            }
            for heading in soup.find_all(list(self.heading_elements))
        ]

    def _identify_semantic_elements(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Identify semantic landmark elements and summarize their content."""
        return [
            {
                "tag": element.name,
                "role": element.get("role"),
                "id": element.get("id"),
                "classes": element.get("class", []),
                "text_length": len(element.get_text(strip=True)),
                "has_children": bool(element.find_all()),
            }
            for element in soup.find_all(list(self.section_elements))
        ]

    def _extract_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract anchors that carry an href and classify each link.

        A link is external when its href starts with a known external
        prefix (http(s)://, mailto:, tel:) or is protocol-relative
        ("//host/..."); fragments, site-relative paths and unknown
        schemes are treated as internal.
        """
        links = []
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            links.append(
                {
                    "href": href,
                    "text": anchor.get_text(strip=True),
                    "internal": not href.startswith(self._EXTERNAL_HREF_PREFIXES),
                    "title": anchor.get("title"),
                    "target": anchor.get("target"),
                }
            )
        return links

    def _analyze_accessibility(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze accessibility features of the HTML document.

        Returns simple signals: html lang attribute, <title> presence,
        image alt coverage, heading nesting sanity, skip-link presence,
        and raw form input/label counts.
        """
        accessibility: dict[str, Any] = {
            "has_lang_attribute": bool(soup.find("html", lang=True)),
            "has_title": bool(soup.find("title")),
            "images_with_alt": 0,
            "images_without_alt": 0,
            "headings_properly_nested": True,
            "has_skip_links": False,
            "form_labels": 0,
            "form_inputs": 0,
        }

        # Count images with and without an alt attribute (empty alt=""
        # counts as present, which is valid for decorative images).
        for img in soup.find_all("img"):
            if img.get("alt") is not None:
                accessibility["images_with_alt"] += 1
            else:
                accessibility["images_without_alt"] += 1

        # Detect skip links heuristically by their visible text.
        skip_link_indicators = ("skip", "jump", "goto")
        for link in soup.find_all("a", href=True):
            link_text = link.get_text(strip=True).lower()
            if any(indicator in link_text for indicator in skip_link_indicators):
                accessibility["has_skip_links"] = True
                break

        # Raw form control/label counts (no label-for matching here).
        accessibility["form_inputs"] = len(
            soup.find_all(["input", "textarea", "select"])
        )
        accessibility["form_labels"] = len(soup.find_all("label"))

        # Simplified nesting check: a heading must not skip more than one
        # level downward (e.g. h2 followed by h4 is improper).
        headings = soup.find_all(list(self.heading_elements))
        if len(headings) > 1:
            prev_level = 0
            for heading in headings:
                level = int(heading.name[1])
                if prev_level > 0 and level > prev_level + 1:
                    accessibility["headings_properly_nested"] = False
                    break
                prev_level = level

        return accessibility

    def _extract_tag_metadata(self, tag: Tag) -> dict[str, Any]:
        """Extract metadata from a BeautifulSoup tag.

        Only a small subset of attributes (id, class, role, data-*) is
        retained to keep the metadata payload light.
        """
        tag_name = tag.name.lower()
        section_type = self._identify_section_type(tag)

        attributes: dict[str, Any] = {}
        if tag.attrs:
            # Keep only the essential attributes. (The previous version
            # threaded a "data-*" sentinel through the attribute list; it
            # only worked because no real tag has a literal "data-*" key.)
            for attr in ("id", "class", "role"):
                if attr in tag.attrs:
                    attributes[attr] = tag.attrs[attr]
            data_attrs = {
                k: v for k, v in tag.attrs.items() if k.startswith("data-")
            }
            if data_attrs:
                attributes["data_attributes"] = data_attrs

        text_content = tag.get_text(strip=True)

        return {
            "tag_name": tag_name,
            "section_type": section_type.value,
            "level": self._get_heading_level(tag),
            "attributes": attributes,
            "text_content": text_content,
            "word_count": len(text_content.split()),
            "char_count": len(text_content),
            "has_code": section_type == SectionType.CODE_BLOCK,
            "has_links": bool(tag.find_all("a")),
            "has_images": bool(tag.find_all("img")),
            "is_semantic": tag_name in self.section_elements,
            "is_heading": tag_name in self.heading_elements,
            "child_count": len(tag.find_all()),
        }

    def _identify_section_type(self, tag: Tag) -> SectionType:
        """Map an HTML tag to its SectionType classification."""
        tag_name = tag.name.lower()
        if tag_name in self.heading_elements:
            return SectionType.HEADER
        # Dispatch table replaces the long if/elif chain; DIV is the
        # catch-all for unrecognized tags.
        return self._TAG_SECTION_TYPES.get(tag_name, SectionType.DIV)

    def _get_heading_level(self, tag: Tag) -> int:
        """Return the numeric level of a heading tag (h1-h6), else 0."""
        name = tag.name.lower()
        if name in self.heading_elements:
            return int(name[1])  # Extract number from h1, h2, etc.
        return 0

    def extract_section_title(self, content: str) -> str:
        """Extract a short title from HTML content.

        Tries headings and <title> first, then the text of semantic
        containers, then any text at all; returns "Untitled Section"
        when nothing usable is found or parsing fails.
        """
        try:
            soup = BeautifulSoup(content, "html.parser")

            # Prefer explicit heading/title elements.
            for tag in ("h1", "h2", "h3", "h4", "h5", "h6", "title"):
                element = soup.find(tag)
                if element:
                    title = element.get_text(strip=True)
                    if title:
                        return title[:100]  # Limit title length

            # Next, the text of semantic container elements.
            for tag in ("article", "section", "main"):
                element = soup.find(tag)
                if element:
                    text = element.get_text(strip=True)
                    if text:
                        return self._extract_title_from_content(text)

            # Fallback: first text content anywhere in the document.
            text = soup.get_text(strip=True)
            if text:
                return self._extract_title_from_content(text)

            return "Untitled Section"
        except Exception:
            # Malformed HTML: fail closed with the default title.
            return "Untitled Section"

    def _extract_title_from_content(self, content: str) -> str:
        """Derive a title from plain text: first line, capped at 100 chars."""
        if not content:
            return "Untitled Section"

        lines = content.strip().split("\n")
        first_line = lines[0].strip() if lines else ""

        # Slicing already caps the length; no explicit comparison needed.
        return first_line[:100] if first_line else "Untitled Section"