Coverage for src/qdrant_loader/core/chunking/strategy/html/html_document_parser.py: 77%

160 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""HTML-specific document parser for DOM structure analysis.""" 

2 

3import re 

4from enum import Enum 

5from typing import Any 

6 

7from bs4 import BeautifulSoup, Tag 

8 

9from qdrant_loader.core.chunking.strategy.base.document_parser import BaseDocumentParser 

10 

11 

class SectionType(Enum):
    """Types of sections in an HTML document.

    Values are lowercase strings used in section metadata dicts
    (see ``_extract_tag_metadata`` / ``extract_section_metadata``).
    """

    # HEADER is assigned to h1-h6 heading tags (see _identify_section_type),
    # not to the <header> landmark element.
    HEADER = "header"
    # Semantic/landmark container elements.
    ARTICLE = "article"
    SECTION = "section"
    NAV = "nav"
    ASIDE = "aside"
    MAIN = "main"
    # Block-level content elements.
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    CODE_BLOCK = "code_block"
    BLOCKQUOTE = "blockquote"
    # Generic fallback for unrecognized tags.
    DIV = "div"
    FOOTER = "footer"

28 

29 

class HTMLDocumentParser(BaseDocumentParser):
    """Parser for HTML documents with semantic analysis."""

    def __init__(self):
        """Initialize the element-name vocabularies used during parsing."""
        # Semantic HTML5 elements treated as section boundaries.
        self.section_elements = set(
            "article section main header footer nav aside".split()
        )

        # Heading tags (h1-h6) used to build the document hierarchy.
        self.heading_elements = {f"h{level}" for level in range(1, 7)}

        # Block-level elements that can form standalone chunks.
        self.block_elements = set(
            "div p blockquote pre ul ol li table figure form".split()
        )

62 

63 def parse_document_structure(self, content: str) -> dict[str, Any]: 

64 """Parse HTML DOM structure and extract semantic information.""" 

65 try: 

66 soup = BeautifulSoup(content, "html.parser") 

67 

68 # Remove script and style elements for cleaner analysis 

69 for script in soup(["script", "style"]): 

70 script.decompose() 

71 

72 # Extract document outline 

73 headings = self._extract_heading_hierarchy(soup) 

74 semantic_elements = self._identify_semantic_elements(soup) 

75 links = self._extract_links(soup) 

76 accessibility = self._analyze_accessibility(soup) 

77 

78 return { 

79 "heading_hierarchy": headings, 

80 "semantic_elements": semantic_elements, 

81 "internal_links": len( 

82 [link for link in links if link.get("internal", False)] 

83 ), 

84 "external_links": len( 

85 [link for link in links if not link.get("internal", False)] 

86 ), 

87 "has_navigation": bool(soup.find("nav")), 

88 "has_main_content": bool(soup.find("main")), 

89 "has_header": bool(soup.find("header")), 

90 "has_footer": bool(soup.find("footer")), 

91 "has_aside": bool(soup.find("aside")), 

92 "structure_type": "html", 

93 "accessibility_features": accessibility, 

94 "form_count": len(soup.find_all("form")), 

95 "table_count": len(soup.find_all("table")), 

96 "image_count": len(soup.find_all("img")), 

97 "list_count": len(soup.find_all(["ul", "ol"])), 

98 "content_sections": len(soup.find_all(list(self.section_elements))), 

99 } 

100 except Exception as e: 

101 # Fallback structure for malformed HTML 

102 return { 

103 "heading_hierarchy": [], 

104 "semantic_elements": [], 

105 "internal_links": 0, 

106 "external_links": 0, 

107 "has_navigation": False, 

108 "has_main_content": False, 

109 "has_header": False, 

110 "has_footer": False, 

111 "has_aside": False, 

112 "structure_type": "html_malformed", 

113 "accessibility_features": {}, 

114 "form_count": 0, 

115 "table_count": 0, 

116 "image_count": 0, 

117 "list_count": 0, 

118 "content_sections": 0, 

119 "parse_error": str(e), 

120 } 

121 

122 def extract_section_metadata(self, section: Any) -> dict[str, Any]: 

123 """Extract metadata from an HTML section.""" 

124 if isinstance(section, dict): 

125 # Already processed section metadata 

126 return section 

127 

128 if isinstance(section, Tag): 

129 return self._extract_tag_metadata(section) 

130 

131 # Fallback for string content 

132 return { 

133 "tag_name": "div", 

134 "section_type": SectionType.DIV.value, 

135 "level": 0, 

136 "attributes": {}, 

137 "has_links": bool(re.search(r"<a\s+[^>]*href", str(section))), 

138 "has_images": bool(re.search(r"<img\s+[^>]*src", str(section))), 

139 } 

140 

141 def _extract_heading_hierarchy(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

142 """Extract document heading hierarchy.""" 

143 headings = [] 

144 

145 for heading in soup.find_all(list(self.heading_elements)): 

146 level = int(heading.name[1]) # Extract number from h1, h2, etc. 

147 text = heading.get_text(strip=True) 

148 

149 headings.append( 

150 { 

151 "level": level, 

152 "text": text, 

153 "tag": heading.name, 

154 "id": heading.get("id"), 

155 "classes": heading.get("class", []), 

156 } 

157 ) 

158 

159 return headings 

160 

161 def _identify_semantic_elements(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

162 """Identify semantic HTML elements and their roles.""" 

163 semantic_elements = [] 

164 

165 for element in soup.find_all(list(self.section_elements)): 

166 semantic_elements.append( 

167 { 

168 "tag": element.name, 

169 "role": element.get("role"), 

170 "id": element.get("id"), 

171 "classes": element.get("class", []), 

172 "text_length": len(element.get_text(strip=True)), 

173 "has_children": bool(element.find_all()), 

174 } 

175 ) 

176 

177 return semantic_elements 

178 

179 def _extract_links(self, soup: BeautifulSoup) -> list[dict[str, Any]]: 

180 """Extract and categorize links.""" 

181 links = [] 

182 

183 for link in soup.find_all("a", href=True): 

184 href = link["href"] 

185 text = link.get_text(strip=True) 

186 

187 # Determine if link is internal or external 

188 is_internal = ( 

189 href.startswith("#") 

190 or href.startswith("/") 

191 or href.startswith("./") 

192 or href.startswith("../") 

193 or not href.startswith(("http://", "https://", "mailto:", "tel:")) 

194 ) 

195 

196 links.append( 

197 { 

198 "href": href, 

199 "text": text, 

200 "internal": is_internal, 

201 "title": link.get("title"), 

202 "target": link.get("target"), 

203 } 

204 ) 

205 

206 return links 

207 

208 def _analyze_accessibility(self, soup: BeautifulSoup) -> dict[str, Any]: 

209 """Analyze accessibility features of the HTML document.""" 

210 accessibility = { 

211 "has_lang_attribute": bool(soup.find("html", lang=True)), 

212 "has_title": bool(soup.find("title")), 

213 "images_with_alt": 0, 

214 "images_without_alt": 0, 

215 "headings_properly_nested": True, 

216 "has_skip_links": False, 

217 "form_labels": 0, 

218 "form_inputs": 0, 

219 } 

220 

221 # Analyze images 

222 for img in soup.find_all("img"): 

223 if img.get("alt") is not None: 

224 accessibility["images_with_alt"] += 1 

225 else: 

226 accessibility["images_without_alt"] += 1 

227 

228 # Check for skip links 

229 skip_link_indicators = ["skip", "jump", "goto"] 

230 for link in soup.find_all("a", href=True): 

231 link_text = link.get_text(strip=True).lower() 

232 if any(indicator in link_text for indicator in skip_link_indicators): 

233 accessibility["has_skip_links"] = True 

234 break 

235 

236 # Analyze forms 

237 accessibility["form_inputs"] = len( 

238 soup.find_all(["input", "textarea", "select"]) 

239 ) 

240 accessibility["form_labels"] = len(soup.find_all("label")) 

241 

242 # Check heading nesting (simplified) 

243 headings = soup.find_all(list(self.heading_elements)) 

244 if len(headings) > 1: 

245 prev_level = 0 

246 for heading in headings: 

247 level = int(heading.name[1]) 

248 if prev_level > 0 and level > prev_level + 1: 

249 accessibility["headings_properly_nested"] = False 

250 break 

251 prev_level = level 

252 

253 return accessibility 

254 

255 def _extract_tag_metadata(self, tag: Tag) -> dict[str, Any]: 

256 """Extract metadata from a BeautifulSoup tag.""" 

257 tag_name = tag.name.lower() 

258 section_type = self._identify_section_type(tag) 

259 

260 # Get attributes (limited for performance) 

261 attributes = {} 

262 if tag.attrs: 

263 # Only keep essential attributes 

264 for attr in ["id", "class", "role", "data-*"]: 

265 if attr in tag.attrs: 

266 attributes[attr] = tag.attrs[attr] 

267 elif attr == "data-*": 

268 # Collect data attributes 

269 data_attrs = { 

270 k: v for k, v in tag.attrs.items() if k.startswith("data-") 

271 } 

272 if data_attrs: 

273 attributes["data_attributes"] = data_attrs 

274 

275 text_content = tag.get_text(strip=True) 

276 

277 return { 

278 "tag_name": tag_name, 

279 "section_type": section_type.value, 

280 "level": self._get_heading_level(tag), 

281 "attributes": attributes, 

282 "text_content": text_content, 

283 "word_count": len(text_content.split()), 

284 "char_count": len(text_content), 

285 "has_code": section_type == SectionType.CODE_BLOCK, 

286 "has_links": bool(tag.find_all("a")), 

287 "has_images": bool(tag.find_all("img")), 

288 "is_semantic": tag_name in self.section_elements, 

289 "is_heading": tag_name in self.heading_elements, 

290 "child_count": len(tag.find_all()), 

291 } 

292 

293 def _identify_section_type(self, tag: Tag) -> SectionType: 

294 """Identify the type of section based on the HTML tag.""" 

295 tag_name = tag.name.lower() 

296 

297 if tag_name in self.heading_elements: 

298 return SectionType.HEADER 

299 elif tag_name == "article": 

300 return SectionType.ARTICLE 

301 elif tag_name == "section": 

302 return SectionType.SECTION 

303 elif tag_name == "nav": 

304 return SectionType.NAV 

305 elif tag_name == "aside": 

306 return SectionType.ASIDE 

307 elif tag_name == "main": 

308 return SectionType.MAIN 

309 elif tag_name == "footer": 

310 return SectionType.FOOTER 

311 elif tag_name in ["ul", "ol", "li"]: 

312 return SectionType.LIST 

313 elif tag_name == "table": 

314 return SectionType.TABLE 

315 elif tag_name in ["pre", "code"]: 

316 return SectionType.CODE_BLOCK 

317 elif tag_name == "blockquote": 

318 return SectionType.BLOCKQUOTE 

319 elif tag_name == "p": 

320 return SectionType.PARAGRAPH 

321 else: 

322 return SectionType.DIV 

323 

324 def _get_heading_level(self, tag: Tag) -> int: 

325 """Get the heading level from an HTML heading tag.""" 

326 if tag.name.lower() in self.heading_elements: 

327 return int(tag.name[1]) # Extract number from h1, h2, etc. 

328 return 0 

329 

330 def extract_section_title(self, content: str) -> str: 

331 """Extract a title from HTML content.""" 

332 try: 

333 soup = BeautifulSoup(content, "html.parser") 

334 

335 # Try to find title in various elements 

336 for tag in ["h1", "h2", "h3", "h4", "h5", "h6", "title"]: 

337 element = soup.find(tag) 

338 if element: 

339 title = element.get_text(strip=True) 

340 if title: 

341 return title[:100] # Limit title length 

342 

343 # Try to find text in semantic elements 

344 for tag in ["article", "section", "main"]: 

345 element = soup.find(tag) 

346 if element: 

347 text = element.get_text(strip=True) 

348 if text: 

349 return self._extract_title_from_content(text) 

350 

351 # Fallback to first text content 

352 text = soup.get_text(strip=True) 

353 if text: 

354 return self._extract_title_from_content(text) 

355 

356 return "Untitled Section" 

357 

358 except Exception: 

359 return "Untitled Section" 

360 

361 def _extract_title_from_content(self, content: str) -> str: 

362 """Extract a title from content text.""" 

363 if not content: 

364 return "Untitled Section" 

365 

366 # Take first line or first 50 characters, whichever is shorter 

367 lines = content.strip().split("\n") 

368 first_line = lines[0].strip() if lines else "" 

369 

370 if first_line: 

371 # Limit title length for performance 

372 return first_line[:100] if len(first_line) > 100 else first_line 

373 

374 return "Untitled Section"