Coverage for src/qdrant_loader/core/chunking/strategy/html_strategy.py: 73%

93 statements  

coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""HTML-specific chunking strategy with modular architecture.""" 

2 

3import structlog 

4 

5from qdrant_loader.config import Settings 

6from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker 

7from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy 

8from qdrant_loader.core.document import Document 

9 

10from .html import ( 

11 HTMLChunkProcessor, 

12 HTMLDocumentParser, 

13 HTMLMetadataExtractor, 

14 HTMLSectionSplitter, 

15) 

16 

17logger = structlog.get_logger(__name__) 

18 

19 

20class HTMLChunkingStrategy(BaseChunkingStrategy): 

21 """Strategy for chunking HTML documents using modular architecture. 

22 

23 This strategy leverages HTML-specific components for intelligent document processing: 

24 - HTMLDocumentParser: Analyzes HTML DOM structure and semantic elements 

25 - HTMLSectionSplitter: Splits content based on semantic boundaries 

26 - HTMLMetadataExtractor: Extracts HTML-specific metadata and accessibility features 

27 - HTMLChunkProcessor: Creates enhanced chunk documents with DOM context 

28 

29 The strategy preserves HTML semantic structure while providing intelligent 

30 fallbacks for large or malformed documents. 

31 """ 

32 

33 def __init__(self, settings: Settings): 

34 """Initialize the HTML chunking strategy with modular components. 

35 

36 Args: 

37 settings: Configuration settings 

38 """ 

39 super().__init__(settings) 

40 self.logger = logger 

41 self.progress_tracker = ChunkingProgressTracker(logger) 

42 

43 # Initialize HTML-specific modular components 

44 self.document_parser = HTMLDocumentParser() 

45 self.section_splitter = HTMLSectionSplitter(settings) 

46 self.metadata_extractor = HTMLMetadataExtractor() 

47 self.chunk_processor = HTMLChunkProcessor(settings) 

48 

49 # Get configuration settings 

50 self.html_config = settings.global_config.chunking.strategies.html 

51 self.max_html_size_for_parsing = self.html_config.max_html_size_for_parsing 

52 

53 self.logger.info( 

54 "HTMLChunkingStrategy initialized with modular architecture", 

55 extra={ 

56 "chunk_size": self.chunk_size, 

57 "chunk_overlap": self.chunk_overlap, 

58 "max_html_size_for_parsing": self.max_html_size_for_parsing, 

59 "preserve_semantic_structure": self.html_config.preserve_semantic_structure, 

60 }, 

61 ) 

62 

63 def chunk_document(self, document: Document) -> list[Document]: 

64 """Chunk an HTML document using modular architecture. 

65 

66 Args: 

67 document: The document to chunk 

68 

69 Returns: 

70 List of chunked documents with enhanced HTML metadata 

71 """ 

72 file_name = ( 

73 document.metadata.get("file_name") 

74 or document.metadata.get("original_filename") 

75 or document.title 

76 or f"{document.source_type}:{document.source}" 

77 ) 

78 

79 # Start progress tracking 

80 self.progress_tracker.start_chunking( 

81 document.id, 

82 document.source, 

83 document.source_type, 

84 len(document.content), 

85 file_name, 

86 ) 

87 

88 try: 

89 # Check for very large files that should use fallback chunking 

90 if len(document.content) > self.max_html_size_for_parsing: 

91 self.logger.info( 

92 f"HTML file too large ({len(document.content)} bytes), using fallback chunking" 

93 ) 

94 self.progress_tracker.log_fallback( 

95 document.id, f"Large HTML file ({len(document.content)} bytes)" 

96 ) 

97 return self._fallback_chunking(document) 

98 

99 # Parse document structure for analysis 

100 self.logger.debug("Analyzing HTML document structure") 

101 document_structure = self.document_parser.parse_document_structure( 

102 document.content 

103 ) 

104 

105 # Split content into semantic sections 

106 self.logger.debug("Splitting HTML content into sections") 

107 sections = self.section_splitter.split_sections(document.content, document) 

108 

109 if not sections: 

110 self.progress_tracker.finish_chunking(document.id, 0, "html_modular") 

111 return [] 

112 

113 # Create chunk documents using modular processor 

114 chunked_docs = [] 

115 for i, section in enumerate(sections): 

116 chunk_content = section["content"] 

117 self.logger.debug( 

118 f"Processing HTML section {i+1}/{len(sections)}", 

119 extra={ 

120 "chunk_size": len(chunk_content), 

121 "section_type": section.get("section_type", "unknown"), 

122 "tag_name": section.get("tag_name", "unknown"), 

123 "dom_path": section.get("dom_path", "unknown"), 

124 }, 

125 ) 

126 

127 # Create chunk document using the modular chunk processor 

128 chunk_doc = self.chunk_processor.create_chunk_document( 

129 original_doc=document, 

130 chunk_content=chunk_content, 

131 chunk_metadata=section, 

132 chunk_index=i, 

133 total_chunks=len(sections), 

134 skip_nlp=False, # Let the processor decide based on content analysis 

135 ) 

136 

137 # Add document structure context to metadata 

138 chunk_doc.metadata["document_structure"] = document_structure 

139 chunk_doc.metadata["chunking_strategy"] = "html_modular" 

140 

141 chunked_docs.append(chunk_doc) 

142 

143 # Finish progress tracking 

144 self.progress_tracker.finish_chunking( 

145 document.id, len(chunked_docs), "html_modular" 

146 ) 

147 

148 self.logger.info( 

149 "Successfully chunked HTML document with modular architecture", 

150 extra={ 

151 "document_id": document.id, 

152 "total_chunks": len(chunked_docs), 

153 "document_structure_type": document_structure.get( 

154 "structure_type", "unknown" 

155 ), 

156 "has_semantic_elements": len( 

157 document_structure.get("semantic_elements", []) 

158 ) 

159 > 0, 

160 "accessibility_features": len( 

161 document_structure.get("accessibility_features", {}) 

162 ) 

163 > 0, 

164 }, 

165 ) 

166 

167 return chunked_docs 

168 

169 except Exception as e: 

170 self.progress_tracker.log_error(document.id, str(e)) 

171 self.logger.error( 

172 "HTML chunking failed, using fallback strategy", 

173 extra={"document_id": document.id, "error": str(e)}, 

174 ) 

175 # Fallback to simple chunking 

176 self.progress_tracker.log_fallback( 

177 document.id, f"HTML parsing failed: {str(e)}" 

178 ) 

179 return self._fallback_chunking(document) 

180 

181 def _fallback_chunking(self, document: Document) -> list[Document]: 

182 """Simple fallback chunking when the main strategy fails. 

183 

184 This method provides a robust fallback by using the section splitter's 

185 fallback mechanism and basic chunk processing. 

186 

187 Args: 

188 document: Document to chunk 

189 

190 Returns: 

191 List of chunked documents 

192 """ 

193 self.logger.info("Using fallback chunking strategy for HTML document") 

194 

195 try: 

196 # Use section splitter's fallback mechanism 

197 sections = self.section_splitter._fallback_split(document.content) 

198 

199 if not sections: 

200 # Ultimate fallback: single chunk 

201 return self._create_single_chunk_fallback(document) 

202 

203 # Create chunked documents using basic processing 

204 chunked_docs = [] 

205 for i, section in enumerate(sections): 

206 chunk_content = section["content"] 

207 

208 # Validate chunk content 

209 if not chunk_content or not chunk_content.strip(): 

210 self.logger.warning(f"Skipping empty fallback chunk {i+1}") 

211 continue 

212 

213 # Create simple chunk document 

214 chunk_doc = self.chunk_processor.create_chunk_document( 

215 original_doc=document, 

216 chunk_content=chunk_content, 

217 chunk_metadata=section, 

218 chunk_index=i, 

219 total_chunks=len(sections), 

220 skip_nlp=True, # Skip NLP for fallback chunks 

221 ) 

222 

223 # Mark as fallback chunking 

224 chunk_doc.metadata.update( 

225 { 

226 "chunking_strategy": "html_fallback", 

227 "chunking_method": "fallback_modular", 

228 } 

229 ) 

230 

231 chunked_docs.append(chunk_doc) 

232 

233 return chunked_docs 

234 

235 except Exception as e: 

236 self.logger.error(f"Fallback chunking failed: {e}") 

237 return self._create_single_chunk_fallback(document) 

238 

239 def _create_single_chunk_fallback(self, document: Document) -> list[Document]: 

240 """Ultimate fallback: return original document as single chunk. 

241 

242 Args: 

243 document: Document to return as single chunk 

244 

245 Returns: 

246 List containing single chunk document 

247 """ 

248 try: 

249 # Create single chunk with minimal processing 

250 chunk_doc = Document( 

251 content=document.content, 

252 metadata=document.metadata.copy(), 

253 source=document.source, 

254 source_type=document.source_type, 

255 url=document.url, 

256 title=document.title, 

257 content_type=document.content_type, 

258 ) 

259 

260 chunk_doc.id = Document.generate_chunk_id(document.id, 0) 

261 chunk_doc.metadata.update( 

262 { 

263 "chunk_index": 0, 

264 "total_chunks": 1, 

265 "parent_document_id": document.id, 

266 "chunking_strategy": "html_single_fallback", 

267 "chunking_method": "fallback_single", 

268 "entities": [], 

269 "nlp_skipped": True, 

270 "skip_reason": "fallback_error", 

271 "content_type": "html", 

272 } 

273 ) 

274 

275 return [chunk_doc] 

276 

277 except Exception as e: 

278 self.logger.error(f"Single chunk fallback failed: {e}") 

279 # If even this fails, return empty list 

280 return [] 

281 

282 def __del__(self): 

283 """Cleanup method.""" 

284 # Call shutdown to clean up resources 

285 self.shutdown() 

286 

287 def _split_text(self, text: str) -> list[str]: 

288 """Split text into chunks using the section splitter. 

289 

290 This method implements the abstract method from BaseChunkingStrategy 

291 for backward compatibility, though the main chunking is handled by 

292 the modular chunk_document method. 

293 

294 Args: 

295 text: Text to split 

296 

297 Returns: 

298 List of text chunks 

299 """ 

300 try: 

301 # Use the section splitter to split the text 

302 sections = self.section_splitter.split_sections(text) 

303 return [section.get("content", "") for section in sections] 

304 except Exception as e: 

305 self.logger.warning(f"Text splitting failed, using fallback: {e}") 

306 # Fallback to simple text splitting 

307 return [text] 

308 

309 def shutdown(self): 

310 """Shutdown the strategy and clean up resources.""" 

311 # Clean up any cached data from components 

312 if hasattr(self, "section_splitter"): 

313 # Section splitter cleanup if needed 

314 pass 

315 

316 if hasattr(self, "chunk_processor"): 

317 # Chunk processor cleanup if needed 

318 pass 

319 

320 self.logger.debug("HTMLChunkingStrategy shutdown completed")
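
Usage note (editorial, not part of the covered source): a minimal sketch of how this strategy might be driven, assuming a validated Settings object is available from the project configuration. The Document field values below are placeholders chosen to match the fields the module itself reads (file_name metadata, source, source_type, url, title, content_type); the exact Document constructor requirements may differ.

from qdrant_loader.core.chunking.strategy.html_strategy import HTMLChunkingStrategy
from qdrant_loader.core.document import Document

settings = ...  # assumption: a validated Settings instance built from project configuration

strategy = HTMLChunkingStrategy(settings)

# Placeholder document; all field values are illustrative only.
doc = Document(
    content="<html><body><main><h1>Title</h1><p>Hello world.</p></main></body></html>",
    metadata={"file_name": "example.html"},
    source="example-source",
    source_type="localfile",  # hypothetical source type
    url="file:///tmp/example.html",
    title="Example page",
    content_type="html",
)

chunks = strategy.chunk_document(doc)
for chunk in chunks:
    print(chunk.metadata.get("chunking_strategy"), len(chunk.content))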