Coverage for src/qdrant_loader/core/chunking/strategy/html/html_chunk_processor.py: 76%
83 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
1"""HTML-specific chunk processor for creating HTML documents with enhanced metadata."""
3from typing import Any
5from qdrant_loader.config import Settings
6from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor
7from qdrant_loader.core.document import Document
9from .html_metadata_extractor import HTMLMetadataExtractor
class HTMLChunkProcessor(BaseChunkProcessor):
    """Chunk processor for HTML documents with semantic and accessibility analysis.

    Builds one ``Document`` per HTML chunk, enriches its metadata through
    ``HTMLMetadataExtractor`` and decides, chunk by chunk, whether entity
    extraction (NLP) should run or be skipped.
    """

    # Chunks whose stripped length is below this are considered too small for NLP.
    _MIN_CONTENT_LENGTH = 50
    # NLP is skipped when the extracted text is below this share of the raw chunk.
    _MIN_TEXT_RATIO = 0.3
    # NLP is skipped when markup makes up more than this share of the chunk.
    _MAX_MARKUP_RATIO = 0.8
    # NLP is skipped when the chunk's accessibility score is below this value.
    _MIN_ACCESSIBILITY_SCORE = 0.2
    # Section types treated as non-semantic (navigation, sidebars, footers).
    _NON_SEMANTIC_SECTIONS = ("nav", "aside", "footer")

    def __init__(self, settings: Settings):
        """Initialize the HTML chunk processor.

        Args:
            settings: Application settings; the HTML strategy configuration is
                read from ``settings.global_config.chunking.strategies.html``.
        """
        super().__init__(settings)

        # Initialize HTML-specific metadata extractor
        self.metadata_extractor = HTMLMetadataExtractor()

        # Get HTML strategy configuration
        self.html_config = settings.global_config.chunking.strategies.html
        self.max_chunk_size_for_nlp = self.html_config.max_chunk_size_for_nlp

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_metadata: dict[str, Any],
        chunk_index: int,
        total_chunks: int,
        skip_nlp: bool = False,
    ) -> Document:
        """Create an HTML chunk document with enhanced metadata.

        Args:
            original_doc: Document the chunk was cut from; its id, source,
                source type, URL, content type and title are reused.
            chunk_content: Raw content of the chunk.
            chunk_metadata: Metadata gathered for the chunk so far; handed to
                the metadata extractor for enrichment.
            chunk_index: Zero-based position of the chunk.
            total_chunks: Total number of chunks produced for the document.
            skip_nlp: Force-skip entity extraction regardless of heuristics.

        Returns:
            A new ``Document`` for the chunk. Its metadata always contains
            ``entities`` and ``nlp_skipped``; ``skip_reason`` is added only
            when NLP was skipped.
        """
        # Generate unique chunk ID
        chunk_id = Document.generate_chunk_id(original_doc.id, chunk_index)

        # Extract HTML-specific hierarchical metadata
        enriched_metadata = self.metadata_extractor.extract_hierarchical_metadata(
            chunk_content, chunk_metadata, original_doc
        )

        # Add chunk-specific metadata
        enriched_metadata.update(
            {
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "parent_document_id": original_doc.id,
                "chunking_strategy": "html_modular",
                "chunk_size": len(chunk_content),
            }
        )

        # Determine if we should skip NLP processing
        should_skip_nlp = skip_nlp or self.should_skip_semantic_analysis(
            chunk_content, enriched_metadata
        )

        if should_skip_nlp:
            enriched_metadata["entities"] = []
            enriched_metadata["nlp_skipped"] = True
            # When the skip was forced through ``skip_nlp`` and no heuristic
            # matches, the recorded reason is "unknown".
            enriched_metadata["skip_reason"] = self._determine_skip_reason(
                chunk_content, enriched_metadata
            )
        else:
            enriched_metadata["entities"] = self.metadata_extractor.extract_entities(
                chunk_content
            )
            enriched_metadata["nlp_skipped"] = False

        # Create the chunk document
        return Document(
            id=chunk_id,
            content=chunk_content,
            metadata=enriched_metadata,
            source=original_doc.source,
            source_type=original_doc.source_type,
            url=original_doc.url,
            content_type=original_doc.content_type,
            title=self._generate_chunk_title(chunk_content, chunk_index, original_doc),
        )

    def should_skip_semantic_analysis(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> bool:
        """Determine if semantic analysis should be skipped for HTML content.

        Args:
            content: Raw chunk content.
            chunk_metadata: Chunk metadata; the keys ``text_content``,
                ``section_type`` and ``accessibility_score`` are consulted.

        Returns:
            True when any skip heuristic matches, False otherwise.
        """
        return bool(self._find_skip_reason(content, chunk_metadata))

    def _generate_chunk_title(
        self, content: str, chunk_index: int, original_doc: Document
    ) -> str:
        """Generate a descriptive title for the HTML chunk.

        Preference order: section title reported by the document parser, then
        the original document title, then a generic label. Chunk numbers in
        the title are one-based.
        """
        generic_title = f"HTML Content - Chunk {chunk_index + 1}"
        try:
            # Try to extract title from HTML content using metadata extractor
            section_title = (
                self.metadata_extractor.document_parser.extract_section_title(content)
            )

            if section_title and section_title != "Untitled Section":
                return f"{section_title} (Chunk {chunk_index + 1})"

            # Fallback to original document title with chunk number
            if original_doc.title:
                return f"{original_doc.title} - Chunk {chunk_index + 1}"

            # Ultimate fallback
            return generic_title
        except Exception:
            # Best effort: a failure while deriving the title must not abort
            # chunk creation, so fall back to the generic label.
            return generic_title

    def _determine_skip_reason(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> str:
        """Determine the specific reason why NLP was skipped.

        Returns:
            A short reason label for the first matching heuristic, or
            ``"unknown"`` when none of the heuristics match.
        """
        return self._find_skip_reason(content, chunk_metadata) or "unknown"

    def _find_skip_reason(self, content: str, chunk_metadata: dict[str, Any]) -> str:
        """Return the label of the first matching skip heuristic, or ``""``.

        Single source of truth for both ``should_skip_semantic_analysis`` and
        ``_determine_skip_reason`` so the decision and the recorded reason
        cannot disagree. Heuristics are evaluated in a fixed order and the
        first match wins.
        """
        # Very large chunks are skipped to prevent performance issues
        if len(content) > self.max_chunk_size_for_nlp:
            return f"content_too_large ({len(content)} > {self.max_chunk_size_for_nlp})"

        # Very small chunks are likely not meaningful
        if len(content.strip()) < self._MIN_CONTENT_LENGTH:
            return "content_too_small"

        # Mostly markup without substantial text; only checked when the
        # metadata actually carries extracted text
        text_content = chunk_metadata.get("text_content", "")
        if text_content and len(text_content) < len(content) * self._MIN_TEXT_RATIO:
            return "low_text_ratio"

        # Navigation, sidebars, and footers are usually not meaningful for NLP
        section_type = chunk_metadata.get("section_type", "")
        if section_type in self._NON_SEMANTIC_SECTIONS:
            return f"non_semantic_section ({section_type})"

        # Content that is primarily code or script blocks
        if section_type == "code_block":
            return "code_content"

        # Very high markup-to-text ratio
        markup_ratio = self._calculate_markup_ratio(content, text_content)
        if markup_ratio > self._MAX_MARKUP_RATIO:
            return f"high_markup_ratio ({markup_ratio:.2f})"

        # Very low accessibility score (might indicate poor content)
        accessibility_score = chunk_metadata.get("accessibility_score", 1.0)
        if accessibility_score < self._MIN_ACCESSIBILITY_SCORE:
            return f"low_accessibility_score ({accessibility_score:.2f})"

        return ""

    def _calculate_markup_ratio(self, content: str, text_content: str) -> float:
        """Calculate the ratio of markup to text content.

        Args:
            content: Raw chunk content including markup.
            text_content: Text extracted from the chunk.

        Returns:
            ``0.0`` for empty ``content``, ``1.0`` when there is no text at
            all, otherwise the share of ``content`` not accounted for by
            ``text_content``, never below ``0.0``.
        """
        if not content:
            return 0.0

        if not text_content:
            return 1.0  # All markup, no text

        # The extracted text can be longer than the raw chunk; clamp so the
        # ratio never goes negative.
        markup_length = len(content) - len(text_content)
        return max(0.0, markup_length / len(content))