Coverage for src/qdrant_loader/core/chunking/strategy/html/html_metadata_extractor.py: 75% (224 statements)


1"""HTML-specific metadata extractor for enhanced HTML document analysis.""" 

2 

3import re 

4from typing import Any 

5 

6from bs4 import BeautifulSoup 

7 

8from qdrant_loader.core.chunking.strategy.base.metadata_extractor import ( 

9 BaseMetadataExtractor, 

10) 

11from qdrant_loader.core.document import Document 

12 

13from .html_document_parser import HTMLDocumentParser 

14 

15 

16class HTMLMetadataExtractor(BaseMetadataExtractor): 

17 """Metadata extractor for HTML documents with semantic and accessibility analysis.""" 

18 

19 def __init__(self): 

20 """Initialize the HTML metadata extractor.""" 

21 self.document_parser = HTMLDocumentParser() 

22 

23 def extract_hierarchical_metadata( 

24 self, content: str, chunk_metadata: dict[str, Any], document: Document 

25 ) -> dict[str, Any]: 

26 """Extract HTML-specific hierarchical metadata.""" 

27 try: 

28 soup = BeautifulSoup(content, "html.parser") 

29 

30 metadata = chunk_metadata.copy() 

31 

32 # Add HTML-specific metadata 

33 metadata.update( 

34 { 

35 "dom_path": self._build_dom_path_breadcrumb(soup), 

36 "semantic_tags": self._extract_semantic_tags(soup), 

37 "accessibility_score": self._calculate_accessibility_score(soup), 

38 "has_structured_data": self._has_structured_data(soup), 

39 "interactive_elements": self._analyze_interactive_elements(soup), 

40 "media_elements": self._analyze_media_elements(soup), 

41 "content_type": "html", 

42 "html_features": self._analyze_html_features(soup), 

43 "seo_indicators": self._analyze_seo_indicators(soup), 

44 "markup_quality": self._assess_markup_quality(soup), 

45 } 

46 ) 

47 

48 return metadata 

49 

50 except Exception as e: 

51 # Fallback metadata for malformed HTML 

52 metadata = chunk_metadata.copy() 

53 metadata.update( 

54 { 

55 "content_type": "html_malformed", 

56 "parse_error": str(e), 

57 "dom_path": "unknown", 

58 "semantic_tags": [], 

59 "accessibility_score": 0.0, 

60 "has_structured_data": False, 

61 } 

62 ) 

63 return metadata 

64 
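    # Usage sketch (illustrative; assumes a qdrant_loader Document instance
    # named `document` and an incoming chunk-metadata dict):
    #
    #     extractor = HTMLMetadataExtractor()
    #     meta = extractor.extract_hierarchical_metadata(
    #         "<main id='content'><h1>Title</h1></main>", {}, document
    #     )
    #     meta["content_type"]  # "html", or "html_malformed" on parse failure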

    def extract_entities(self, text: str) -> list[str]:
        """Extract HTML-specific entities including semantic elements and IDs."""
        try:
            soup = BeautifulSoup(text, "html.parser")
            entities = []

            # Extract IDs as entities
            for element in soup.find_all(id=True):
                entities.append(f"#{element.get('id')}")

            # Extract class names as entities
            for element in soup.find_all(class_=True):
                classes = element.get("class", [])
                entities.extend([f".{cls}" for cls in classes])

            # Extract semantic element types
            semantic_elements = soup.find_all(
                list(self.document_parser.section_elements)
            )
            entities.extend([elem.name for elem in semantic_elements])

            # Extract link destinations
            for link in soup.find_all("a", href=True):
                href = link["href"]
                if href.startswith("#"):
                    entities.append(href)  # Internal link
                elif href.startswith("http"):
                    entities.append(href)  # External link

            # Remove duplicates and limit
            return list(set(entities))[:50]

        except Exception:
            return []
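    # The returned entities mix four shapes: "#some-id", ".some-class",
    # semantic tag names such as "nav", and raw hrefs ("#top", "https://...").
    # Because duplicates are removed via set(), the order of the (at most 50)
    # results is non-deterministic.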

    def _build_dom_path_breadcrumb(self, soup: BeautifulSoup) -> str:
        """Build a DOM path breadcrumb for context."""
        try:
            # Collect meaningful elements in document order
            meaningful_elements = []

            for element in soup.find_all():
                if (
                    element.name in self.document_parser.section_elements
                    or element.name in self.document_parser.heading_elements
                    or element.get("id")
                    or element.get("role")
                ):
                    meaningful_elements.append(element)

            if not meaningful_elements:
                return "body"

            # Build path from the first meaningful element
            element = meaningful_elements[0]
            path_parts = []

            while element and len(path_parts) < 5:  # Limit depth
                part = element.name
                if element.get("id"):
                    part += f"#{element.get('id')}"
                elif element.get("class"):
                    classes = element.get("class", [])[:2]  # Limit classes
                    part += f".{'.'.join(classes)}"

                path_parts.append(part)
                element = element.parent

                # Stop at body or html
                if element and element.name in ["body", "html"]:
                    break

            return " > ".join(reversed(path_parts)) if path_parts else "body"

        except Exception:
            return "unknown"
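    # Breadcrumb parts are rendered as name#id or name.cls1.cls2 (at most two
    # classes) and joined outermost-first, e.g. "div#main > section.intro";
    # depth is capped at five levels and the walk stops at <body>/<html>.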

    def _extract_semantic_tags(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
        """Extract semantic HTML tags and their properties."""
        semantic_tags = []

        try:
            for element in soup.find_all():
                if element.name in self.document_parser.section_elements:
                    tag_info = {
                        "tag": element.name,
                        "role": element.get("role"),
                        "id": element.get("id"),
                        "classes": element.get("class", [])[:3],  # Limit classes
                        "has_content": bool(element.get_text(strip=True)),
                        "child_count": len(element.find_all()),
                    }
                    semantic_tags.append(tag_info)

            return semantic_tags[:10]  # Limit results

        except Exception:
            return []

    def _calculate_accessibility_score(self, soup: BeautifulSoup) -> float:
        """Calculate an accessibility score for the HTML content."""
        try:
            score = 0.0
            max_score = 10.0
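            # Scoring rubric (10 points, normalized to 0.0-1.0 on return):
            #   1.0  <html lang=...> present
            #   2.0  share of <img> elements carrying an alt attribute
            #   1.0  first heading is an <h1>
            #   1.0  heading levels never skip (e.g. h2 -> h4 fails)
            #   1.0  a "skip"/"jump"/"goto" link exists
            #   2.0  label-to-input ratio when forms are present
            #   1.0  any role= or aria-* attribute
            #   1.0  semantic HTML5 elements (three or more for full credit)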

            # Check for lang attribute
            if soup.find("html", lang=True):
                score += 1.0

            # Check image alt texts
            images = soup.find_all("img")
            if images:
                images_with_alt = len(
                    [img for img in images if img.get("alt") is not None]
                )
                score += (images_with_alt / len(images)) * 2.0
            else:
                score += 2.0  # No images, full score

            # Check heading hierarchy
            headings = soup.find_all(list(self.document_parser.heading_elements))
            if headings:
                # Simple check: first heading should be h1
                if headings[0].name == "h1":
                    score += 1.0
                # Check for proper nesting (simplified)
                proper_nesting = True
                prev_level = 0
                for heading in headings:
                    level = int(heading.name[1])
                    if prev_level > 0 and level > prev_level + 1:
                        proper_nesting = False
                        break
                    prev_level = level
                if proper_nesting:
                    score += 1.0

            # Check for skip links
            skip_indicators = ["skip", "jump", "goto"]
            for link in soup.find_all("a", href=True):
                link_text = link.get_text(strip=True).lower()
                if any(indicator in link_text for indicator in skip_indicators):
                    score += 1.0
                    break

            # Check form labels
            forms = soup.find_all("form")
            if forms:
                inputs = soup.find_all(["input", "textarea", "select"])
                labels = soup.find_all("label")
                if inputs:
                    label_ratio = len(labels) / len(inputs)
                    score += min(label_ratio, 1.0) * 2.0
            else:
                score += 2.0  # No forms, full score

            # Check for ARIA attributes (a non-dict attrs= value matches
            # classes in BeautifulSoup, so scan attribute names explicitly)
            aria_elements = soup.find_all(attrs={"role": True})
            aria_elements.extend(
                soup.find_all(
                    lambda tag: any(attr.startswith("aria-") for attr in tag.attrs)
                )
            )
            if aria_elements:
                score += 1.0

            # Check for semantic HTML5 elements
            semantic_count = len(
                soup.find_all(list(self.document_parser.section_elements))
            )
            if semantic_count > 0:
                score += min(
                    semantic_count / 3.0, 1.0
                )  # Up to 1 point for semantic elements

            return min(score / max_score, 1.0)  # Normalize to 0-1

        except Exception:
            return 0.0
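    # Worked example: "<html lang='en'><body><main><h1>T</h1>" +
    # "<img src='x' alt='d'></main></body></html>" scores 1 (lang) + 2 (alt)
    # + 1 (h1 first) + 1 (no skipped levels) + 2 (no forms) + 1/3 (one
    # semantic element, assuming <main> counts) = 7.33, i.e. roughly 0.73.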

    def _has_structured_data(self, soup: BeautifulSoup) -> bool:
        """Check if the HTML contains structured data."""
        try:
            # Check for JSON-LD
            json_ld = soup.find("script", type="application/ld+json")
            if json_ld:
                return True

            # Check for microdata
            microdata = soup.find_all(attrs={"itemscope": True})
            if microdata:
                return True

            # Check for RDFa
            rdfa = soup.find_all(attrs={"property": True})
            if rdfa:
                return True

            # Check for Open Graph
            og_tags = soup.find_all("meta", property=re.compile(r"^og:"))
            if og_tags:
                return True

            # Check for Twitter Cards
            twitter_tags = soup.find_all(
                "meta", attrs={"name": re.compile(r"^twitter:")}
            )
            if twitter_tags:
                return True

            return False

        except Exception:
            return False
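    # Markers recognized above, in check order:
    #   <script type="application/ld+json">  JSON-LD
    #   itemscope                            microdata
    #   property="..."                       RDFa (any property attribute)
    #   <meta property="og:...">             Open Graph
    #   <meta name="twitter:...">            Twitter Cards
    # Since Open Graph tags carry property=, they already satisfy the RDFa
    # check and short-circuit before the og: pattern is reached.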

    def _analyze_interactive_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze interactive elements in the HTML."""
        try:
            interactive = {
                "forms": len(soup.find_all("form")),
                # CSS selectors are not valid tag names for find_all(),
                # so use select() for the input-type variants
                "buttons": len(
                    soup.select("button, input[type='button'], input[type='submit']")
                ),
                "links": len(soup.find_all("a", href=True)),
                "inputs": len(soup.find_all(["input", "textarea", "select"])),
                "clickable_elements": 0,
                "has_javascript_events": False,
            }

            # Count elements with click events (scan attribute names; a
            # non-dict attrs= value would match classes instead)
            clickable = soup.find_all(
                lambda tag: any(
                    re.match(r"on(click|touch|mouse)", attr) for attr in tag.attrs
                )
            )
            interactive["clickable_elements"] = len(clickable)

            # Check for JavaScript event handlers
            js_events = soup.find_all(
                lambda tag: any(re.match(r"on[a-z]+", attr) for attr in tag.attrs)
            )
            interactive["has_javascript_events"] = len(js_events) > 0

            return interactive

        except Exception:
            return {}
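    # Example: '<form><input type="submit" onclick="go()"></form>' yields
    # {"forms": 1, "buttons": 1, "links": 0, "inputs": 1,
    #  "clickable_elements": 1, "has_javascript_events": True}.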

    def _analyze_media_elements(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze media elements in the HTML."""
        try:
            media = {
                "images": len(soup.find_all("img")),
                "videos": len(soup.find_all("video")),
                "audio": len(soup.find_all("audio")),
                "iframes": len(soup.find_all("iframe")),
                "canvas": len(soup.find_all("canvas")),
                "svg": len(soup.find_all("svg")),
            }

            # Analyze image properties
            images = soup.find_all("img")
            if images:
                media["images_with_alt"] = len(
                    [img for img in images if img.get("alt")]
                )
                media["images_with_title"] = len(
                    [img for img in images if img.get("title")]
                )
                media["responsive_images"] = len(
                    [img for img in images if img.get("srcset")]
                )

            return media

        except Exception:
            return {}
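    # The three image sub-keys are only present when the chunk contains at
    # least one <img>. Note the asymmetry with the accessibility score:
    # images_with_alt here requires a truthy alt, so a decorative alt=""
    # is not counted.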

    def _analyze_html_features(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze HTML5 and modern web features."""
        try:
            features = {
                "html5_semantic_tags": 0,
                "custom_elements": 0,
                "data_attributes": 0,
                "css_classes": 0,
                "inline_styles": 0,
            }

            # Count HTML5 semantic tags
            features["html5_semantic_tags"] = len(
                soup.find_all(list(self.document_parser.section_elements))
            )

            # Count custom elements (tags with hyphens)
            custom_elements = soup.find_all(lambda tag: tag.name and "-" in tag.name)
            features["custom_elements"] = len(custom_elements)

            # Count elements carrying data-* attributes (scan attribute names;
            # a non-dict attrs= value would match classes instead)
            data_attrs = soup.find_all(
                lambda tag: any(attr.startswith("data-") for attr in tag.attrs)
            )
            features["data_attributes"] = len(data_attrs)

            # Count CSS classes
            elements_with_class = soup.find_all(class_=True)
            features["css_classes"] = sum(
                len(elem.get("class", [])) for elem in elements_with_class
            )

            # Count inline styles
            features["inline_styles"] = len(soup.find_all(style=True))

            return features

        except Exception:
            return {}
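    # Example: '<my-widget data-state="open" data-id="7" class="a b">' counts
    # one custom element (hyphenated tag name), one element with data-*
    # attributes (elements are counted, not individual attributes), and two
    # CSS classes.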

    def _analyze_seo_indicators(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Analyze SEO-related indicators."""
        try:
            seo = {
                "has_title": False,
                "has_meta_description": False,
                "has_h1": False,
                "heading_count": 0,
                "internal_links": 0,
                "external_links": 0,
                "has_canonical": False,
                "has_robots_meta": False,
            }

            # Check for title
            title = soup.find("title")
            seo["has_title"] = bool(title and title.get_text(strip=True))

            # Check for meta description
            meta_desc = soup.find("meta", attrs={"name": "description"})
            seo["has_meta_description"] = bool(meta_desc and meta_desc.get("content"))

            # Check for H1
            h1 = soup.find("h1")
            seo["has_h1"] = bool(h1 and h1.get_text(strip=True))

            # Count headings
            headings = soup.find_all(list(self.document_parser.heading_elements))
            seo["heading_count"] = len(headings)

            # Analyze links
            links = soup.find_all("a", href=True)
            for link in links:
                href = link["href"]
                if href.startswith(("http://", "https://")):
                    seo["external_links"] += 1
                else:
                    seo["internal_links"] += 1

            # Check for canonical link
            canonical = soup.find("link", rel="canonical")
            seo["has_canonical"] = bool(canonical)

            # Check for robots meta
            robots = soup.find("meta", attrs={"name": "robots"})
            seo["has_robots_meta"] = bool(robots)

            return seo

        except Exception:
            return {}
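    # The link split is a heuristic: only hrefs starting with http:// or
    # https:// count as external (even when they point at the same site),
    # while relative paths, fragments, protocol-relative "//..." URLs, and
    # mailto: links all land in internal_links.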

    def _assess_markup_quality(self, soup: BeautifulSoup) -> dict[str, Any]:
        """Assess the quality of HTML markup."""
        try:
            quality = {
                "semantic_ratio": 0.0,
                "accessibility_features": 0,
                "deprecated_tags": 0,
                "inline_styles": 0,
                "proper_nesting": True,
                "valid_attributes": True,
            }

            # Calculate semantic ratio
            all_elements = soup.find_all()
            semantic_elements = soup.find_all(
                list(self.document_parser.section_elements)
            )
            if all_elements:
                quality["semantic_ratio"] = len(semantic_elements) / len(all_elements)

            # Count accessibility features
            accessibility_features = 0
            if soup.find_all(alt=True):
                accessibility_features += 1
            if soup.find_all(attrs={"role": True}):
                accessibility_features += 1
            if soup.find_all(
                lambda tag: any(attr.startswith("aria-") for attr in tag.attrs)
            ):
                accessibility_features += 1
            if soup.find_all("label"):
                accessibility_features += 1
            quality["accessibility_features"] = accessibility_features

            # Count deprecated tags (simplified list)
            deprecated_tags = ["font", "center", "big", "small", "strike", "tt"]
            quality["deprecated_tags"] = sum(
                len(soup.find_all(tag)) for tag in deprecated_tags
            )

            # Count inline styles
            quality["inline_styles"] = len(soup.find_all(style=True))

            return quality

        except Exception:
            return {}
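
# Minimal smoke test, a sketch only: it assumes bs4 is installed and that the
# qdrant_loader imports above resolve. extract_entities() needs no Document,
# so it is the easiest entry point to exercise by hand.
if __name__ == "__main__":
    sample = (
        '<html lang="en"><body><main id="content" class="page">'
        '<h1>Title</h1><a href="#top">Skip to top</a>'
        '<img src="diagram.png" alt="diagram"></main></body></html>'
    )
    extractor = HTMLMetadataExtractor()
    print(extractor.extract_entities(sample))
    soup = BeautifulSoup(sample, "html.parser")
    print(extractor._calculate_accessibility_score(soup))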