Coverage for src/qdrant_loader/core/chunking/strategy/html_strategy.py: 73%
93 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""HTML-specific chunking strategy with modular architecture."""
3import structlog
5from qdrant_loader.config import Settings
6from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker
7from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy
8from qdrant_loader.core.document import Document
10from .html import (
11 HTMLChunkProcessor,
12 HTMLDocumentParser,
13 HTMLMetadataExtractor,
14 HTMLSectionSplitter,
15)
# Module-level structlog logger; shared by every strategy instance via
# the `self.logger = logger` assignment in __init__.
logger = structlog.get_logger(__name__)
class HTMLChunkingStrategy(BaseChunkingStrategy):
    """Strategy for chunking HTML documents using modular architecture.

    This strategy leverages HTML-specific components for intelligent document processing:
    - HTMLDocumentParser: Analyzes HTML DOM structure and semantic elements
    - HTMLSectionSplitter: Splits content based on semantic boundaries
    - HTMLMetadataExtractor: Extracts HTML-specific metadata and accessibility features
    - HTMLChunkProcessor: Creates enhanced chunk documents with DOM context

    The strategy preserves HTML semantic structure while providing intelligent
    fallbacks for large or malformed documents.
    """

    def __init__(self, settings: Settings):
        """Initialize the HTML chunking strategy with modular components.

        Args:
            settings: Configuration settings
        """
        super().__init__(settings)
        self.logger = logger
        self.progress_tracker = ChunkingProgressTracker(logger)

        # Initialize HTML-specific modular components
        self.document_parser = HTMLDocumentParser()
        self.section_splitter = HTMLSectionSplitter(settings)
        self.metadata_extractor = HTMLMetadataExtractor()
        self.chunk_processor = HTMLChunkProcessor(settings)

        # Get configuration settings
        self.html_config = settings.global_config.chunking.strategies.html
        self.max_html_size_for_parsing = self.html_config.max_html_size_for_parsing

        self.logger.info(
            "HTMLChunkingStrategy initialized with modular architecture",
            extra={
                "chunk_size": self.chunk_size,
                "chunk_overlap": self.chunk_overlap,
                "max_html_size_for_parsing": self.max_html_size_for_parsing,
                "preserve_semantic_structure": self.html_config.preserve_semantic_structure,
            },
        )

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk an HTML document using modular architecture.

        Oversized documents bypass HTML parsing entirely and go straight to
        fallback chunking; any exception during parsing/splitting also routes
        to the fallback path rather than failing the document.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents with enhanced HTML metadata
        """
        # Best available human-readable name for progress/log messages
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        try:
            # Check for very large files that should use fallback chunking
            # (parsing huge HTML is expensive and can blow memory)
            if len(document.content) > self.max_html_size_for_parsing:
                self.logger.info(
                    f"HTML file too large ({len(document.content)} bytes), using fallback chunking"
                )
                self.progress_tracker.log_fallback(
                    document.id, f"Large HTML file ({len(document.content)} bytes)"
                )
                return self._fallback_chunking(document)

            # Parse document structure for analysis
            self.logger.debug("Analyzing HTML document structure")
            document_structure = self.document_parser.parse_document_structure(
                document.content
            )

            # Split content into semantic sections
            self.logger.debug("Splitting HTML content into sections")
            sections = self.section_splitter.split_sections(document.content, document)

            if not sections:
                self.progress_tracker.finish_chunking(document.id, 0, "html_modular")
                return []

            # Create chunk documents using modular processor
            chunked_docs = []
            for i, section in enumerate(sections):
                chunk_content = section["content"]
                self.logger.debug(
                    f"Processing HTML section {i+1}/{len(sections)}",
                    extra={
                        "chunk_size": len(chunk_content),
                        "section_type": section.get("section_type", "unknown"),
                        "tag_name": section.get("tag_name", "unknown"),
                        "dom_path": section.get("dom_path", "unknown"),
                    },
                )

                # Create chunk document using the modular chunk processor
                chunk_doc = self.chunk_processor.create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_metadata=section,
                    chunk_index=i,
                    total_chunks=len(sections),
                    skip_nlp=False,  # Let the processor decide based on content analysis
                )

                # Add document structure context to metadata
                chunk_doc.metadata["document_structure"] = document_structure
                chunk_doc.metadata["chunking_strategy"] = "html_modular"

                chunked_docs.append(chunk_doc)

            # Finish progress tracking
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), "html_modular"
            )

            self.logger.info(
                "Successfully chunked HTML document with modular architecture",
                extra={
                    "document_id": document.id,
                    "total_chunks": len(chunked_docs),
                    "document_structure_type": document_structure.get(
                        "structure_type", "unknown"
                    ),
                    "has_semantic_elements": len(
                        document_structure.get("semantic_elements", [])
                    )
                    > 0,
                    "accessibility_features": len(
                        document_structure.get("accessibility_features", {})
                    )
                    > 0,
                },
            )

            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            self.logger.error(
                "HTML chunking failed, using fallback strategy",
                extra={"document_id": document.id, "error": str(e)},
            )
            # Fallback to simple chunking
            self.progress_tracker.log_fallback(
                document.id, f"HTML parsing failed: {str(e)}"
            )
            return self._fallback_chunking(document)

    def _fallback_chunking(self, document: Document) -> list[Document]:
        """Simple fallback chunking when the main strategy fails.

        This method provides a robust fallback by using the section splitter's
        fallback mechanism and basic chunk processing. Empty sections are
        skipped; if everything fails, degrades to a single-chunk result.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        self.logger.info("Using fallback chunking strategy for HTML document")

        try:
            # Use section splitter's fallback mechanism
            sections = self.section_splitter._fallback_split(document.content)

            if not sections:
                # Ultimate fallback: single chunk
                return self._create_single_chunk_fallback(document)

            # Create chunked documents using basic processing
            chunked_docs = []
            for i, section in enumerate(sections):
                chunk_content = section["content"]

                # Validate chunk content
                if not chunk_content or not chunk_content.strip():
                    self.logger.warning(f"Skipping empty fallback chunk {i+1}")
                    continue

                # Create simple chunk document
                chunk_doc = self.chunk_processor.create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_metadata=section,
                    chunk_index=i,
                    total_chunks=len(sections),
                    skip_nlp=True,  # Skip NLP for fallback chunks
                )

                # Mark as fallback chunking
                chunk_doc.metadata.update(
                    {
                        "chunking_strategy": "html_fallback",
                        "chunking_method": "fallback_modular",
                    }
                )

                chunked_docs.append(chunk_doc)

            return chunked_docs

        except Exception as e:
            self.logger.error(f"Fallback chunking failed: {e}")
            return self._create_single_chunk_fallback(document)

    def _create_single_chunk_fallback(self, document: Document) -> list[Document]:
        """Ultimate fallback: return original document as single chunk.

        Args:
            document: Document to return as single chunk

        Returns:
            List containing single chunk document (empty list if even this fails)
        """
        try:
            # Create single chunk with minimal processing
            chunk_doc = Document(
                content=document.content,
                metadata=document.metadata.copy(),
                source=document.source,
                source_type=document.source_type,
                url=document.url,
                title=document.title,
                content_type=document.content_type,
            )

            chunk_doc.id = Document.generate_chunk_id(document.id, 0)
            chunk_doc.metadata.update(
                {
                    "chunk_index": 0,
                    "total_chunks": 1,
                    "parent_document_id": document.id,
                    "chunking_strategy": "html_single_fallback",
                    "chunking_method": "fallback_single",
                    "entities": [],
                    "nlp_skipped": True,
                    "skip_reason": "fallback_error",
                    "content_type": "html",
                }
            )

            return [chunk_doc]

        except Exception as e:
            self.logger.error(f"Single chunk fallback failed: {e}")
            # If even this fails, return empty list
            return []

    def __del__(self):
        """Best-effort cleanup when the strategy is garbage collected.

        Guarded because a finalizer may run after a partially failed
        __init__ (attributes missing) or during interpreter shutdown
        (modules torn down); a finalizer must never propagate exceptions.
        """
        try:
            self.shutdown()
        except Exception:
            pass

    def _split_text(self, text: str) -> list[str]:
        """Split text into chunks using the section splitter.

        This method implements the abstract method from BaseChunkingStrategy
        for backward compatibility, though the main chunking is handled by
        the modular chunk_document method.

        Args:
            text: Text to split

        Returns:
            List of text chunks (the whole text as one chunk on failure)
        """
        try:
            # Use the section splitter to split the text
            sections = self.section_splitter.split_sections(text)
            return [section.get("content", "") for section in sections]
        except Exception as e:
            self.logger.warning(f"Text splitting failed, using fallback: {e}")
            # Fallback to simple text splitting
            return [text]

    def shutdown(self):
        """Shutdown the strategy and clean up resources.

        Safe to call on a partially initialized instance: every attribute
        access is guarded, matching the defensive hasattr checks below.
        """
        # Clean up any cached data from components
        if hasattr(self, "section_splitter"):
            # Section splitter cleanup if needed
            pass

        if hasattr(self, "chunk_processor"):
            # Chunk processor cleanup if needed
            pass

        # logger may be missing if __init__ failed before assigning it
        log = getattr(self, "logger", None)
        if log is not None:
            log.debug("HTMLChunkingStrategy shutdown completed")