Coverage for src/qdrant_loader/core/chunking/strategy/html_strategy.py: 97%
351 statements
coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""HTML-specific chunking strategy."""
3import re
4from dataclasses import dataclass, field
5from enum import Enum
6from typing import Any, Optional
8import structlog
9from bs4 import BeautifulSoup, Tag
11from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy
12from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker
13from qdrant_loader.core.document import Document
14from qdrant_loader.config import Settings
16logger = structlog.get_logger(__name__)
18# Performance constants to prevent timeouts
19MAX_HTML_SIZE_FOR_PARSING = 500_000 # 500KB limit for complex HTML parsing
20MAX_SECTIONS_TO_PROCESS = 200 # Limit number of sections to prevent timeouts
21MAX_CHUNK_SIZE_FOR_NLP = 20_000 # 20KB limit for NLP processing on chunks
22SIMPLE_PARSING_THRESHOLD = 100_000 # Use simple parsing for files larger than 100KB
25class SectionType(Enum):
26 """Types of sections in an HTML document."""
28 HEADER = "header"
29 ARTICLE = "article"
30 SECTION = "section"
31 NAV = "nav"
32 ASIDE = "aside"
33 MAIN = "main"
34 PARAGRAPH = "paragraph"
35 LIST = "list"
36 TABLE = "table"
37 CODE_BLOCK = "code_block"
38 BLOCKQUOTE = "blockquote"
39 DIV = "div"
42@dataclass
43class HTMLSection:
44 """Represents a section in an HTML document."""
46 content: str
47 tag_name: str
48 level: int = 0
49 type: SectionType = SectionType.DIV
50 parent: Optional["HTMLSection"] = None
51 children: list["HTMLSection"] = field(default_factory=list)
52 attributes: dict[str, str] = field(default_factory=dict)
53 text_content: str = ""
55 def add_child(self, child: "HTMLSection"):
56 """Add a child section."""
57 self.children.append(child)
58 child.parent = self
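
# Minimal usage sketch (illustrative, not part of the original module): the
# dataclass above can be wired into a small hierarchy by hand via add_child().
# The literal values here are made up for demonstration.
#
#     parent = HTMLSection(
#         content="<section><p>Intro text</p></section>",
#         tag_name="section",
#         type=SectionType.SECTION,
#         text_content="Intro text",
#     )
#     child = HTMLSection(
#         content="<p>Intro text</p>",
#         tag_name="p",
#         type=SectionType.PARAGRAPH,
#         text_content="Intro text",
#     )
#     parent.add_child(child)
#     assert child.parent is parent and parent.children == [child]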

class HTMLChunkingStrategy(BaseChunkingStrategy):
    """Strategy for chunking HTML documents based on semantic structure.

    This strategy splits HTML documents into chunks based on semantic HTML elements,
    preserving the document structure and hierarchy. Each chunk includes:
    - The semantic element and its content
    - Parent element context for hierarchy
    - Element-specific metadata (tag, attributes, etc.)
    - Semantic analysis results
    """

    def __init__(self, settings: Settings):
        """Initialize the HTML chunking strategy.

        Args:
            settings: Configuration settings
        """
        super().__init__(settings)
        self.logger = logger
        self.progress_tracker = ChunkingProgressTracker(logger)

        # Note: Semantic analyzer is now handled intelligently in base class
        # on a per-chunk basis based on content type and size

        # Cache for processed chunks
        self._processed_chunks = {}

        # Thread pool executor for parallel processing
        self._executor = None

        # Define semantic HTML elements that should be treated as section boundaries
        self.section_elements = {
            "article",
            "section",
            "main",
            "header",
            "footer",
            "nav",
            "aside",
        }

        # Define heading elements for hierarchy
        self.heading_elements = {"h1", "h2", "h3", "h4", "h5", "h6"}

        # Define block-level elements that can form chunks
        self.block_elements = {
            "div",
            "p",
            "blockquote",
            "pre",
            "ul",
            "ol",
            "li",
            "table",
            "figure",
        }

    def _identify_section_type(self, tag: Tag) -> SectionType:
        """Identify the type of section based on the HTML tag.

        Args:
            tag: The BeautifulSoup tag to analyze

        Returns:
            SectionType enum indicating the type of section
        """
        tag_name = tag.name.lower()

        if tag_name in self.heading_elements:
            return SectionType.HEADER
        elif tag_name == "article":
            return SectionType.ARTICLE
        elif tag_name == "section":
            return SectionType.SECTION
        elif tag_name == "nav":
            return SectionType.NAV
        elif tag_name == "aside":
            return SectionType.ASIDE
        elif tag_name == "main":
            return SectionType.MAIN
        elif tag_name in ["ul", "ol", "li"]:
            return SectionType.LIST
        elif tag_name == "table":
            return SectionType.TABLE
        elif tag_name in ["pre", "code"]:
            return SectionType.CODE_BLOCK
        elif tag_name == "blockquote":
            return SectionType.BLOCKQUOTE
        elif tag_name == "p":
            return SectionType.PARAGRAPH
        else:
            return SectionType.DIV

    def _get_heading_level(self, tag: Tag) -> int:
        """Get the heading level from an HTML heading tag.

        Args:
            tag: The heading tag

        Returns:
            Heading level (1-6)
        """
        if tag.name.lower() in self.heading_elements:
            return int(tag.name[1])  # Extract number from h1, h2, etc.
        return 0

    def _extract_section_metadata(self, section: HTMLSection) -> dict[str, Any]:
        """Extract metadata from an HTML section.

        Args:
            section: The section to analyze

        Returns:
            Dictionary containing section metadata
        """
        metadata = {
            "type": section.type.value,
            "tag_name": section.tag_name,
            "level": section.level,
            "attributes": section.attributes,
            "word_count": len(section.text_content.split()),
            "char_count": len(section.text_content),
            "has_code": section.type == SectionType.CODE_BLOCK,
            "has_links": bool(re.search(r"<a\s+[^>]*href", section.content)),
            "has_images": bool(re.search(r"<img\s+[^>]*src", section.content)),
            "is_semantic": section.tag_name in self.section_elements,
            "is_heading": section.tag_name in self.heading_elements,
        }

        # Add parent section info if available
        if section.parent:
            metadata["parent_tag"] = section.parent.tag_name
            metadata["parent_type"] = section.parent.type.value
            metadata["parent_level"] = section.parent.level

        # Add breadcrumb path for hierarchical context (simplified)
        breadcrumb = self._build_section_breadcrumb(section)
        if breadcrumb:
            metadata["breadcrumb"] = breadcrumb

        return metadata
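
    # Illustrative sketch (not in the original source): the rough shape of the dict
    # returned by _extract_section_metadata() for a small <p> section. Values are
    # made up; the keys mirror the method above.
    #
    #     {
    #         "type": "paragraph", "tag_name": "p", "level": 2,
    #         "attributes": {"class": ["intro"]},
    #         "word_count": 42, "char_count": 250,
    #         "has_code": False, "has_links": True, "has_images": False,
    #         "is_semantic": False, "is_heading": False,
    #         "parent_tag": "section", "parent_type": "section", "parent_level": 1,
    #         "breadcrumb": "Getting Started > Installation",
    #     }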

    def _build_section_breadcrumb(self, section: HTMLSection) -> str:
        """Build a breadcrumb path of section titles to capture hierarchy.

        Args:
            section: The section to build breadcrumb for

        Returns:
            String representing the hierarchical path
        """
        breadcrumb_parts = []
        current = section.parent
        depth = 0

        # Limit breadcrumb depth to prevent performance issues
        while current and depth < 5:
            if (
                current.tag_name in self.heading_elements
                or current.tag_name in self.section_elements
            ):
                title = self._extract_title_from_content(current.text_content)
                if title and title != "Untitled Section":
                    breadcrumb_parts.append(title)
            current = current.parent
            depth += 1

        return " > ".join(reversed(breadcrumb_parts))

    def _extract_title_from_content(self, content: str) -> str:
        """Extract a title from content text.

        Args:
            content: Text content to extract title from

        Returns:
            Extracted title or "Untitled Section"
        """
        if not content:
            return "Untitled Section"
        # Take the first line, or its first 100 characters, whichever is shorter
        lines = content.strip().split("\n")
        first_line = lines[0].strip() if lines else ""

        if first_line:
            # Limit title length for performance
            return first_line[:100] if len(first_line) > 100 else first_line

        return "Untitled Section"

    def _parse_html_structure(self, html: str) -> list[dict[str, Any]]:
        """Parse HTML structure into semantic sections with performance optimizations.

        Args:
            html: HTML content to parse

        Returns:
            List of section dictionaries
        """
        # Performance check: use simple parsing for very large files
        if len(html) > MAX_HTML_SIZE_FOR_PARSING:
            self.logger.info(
                f"HTML too large for complex parsing ({len(html)} bytes), using simple parsing"
            )
            return self._simple_html_parse(html)

        try:
            soup = BeautifulSoup(html, "html.parser")

            # Remove script and style elements for cleaner processing
            for script in soup(["script", "style"]):
                script.decompose()

            sections = []
            section_count = 0

            def process_element(element, level=0):
                nonlocal section_count

                # Performance check: limit total sections
                if section_count >= MAX_SECTIONS_TO_PROCESS:
                    return

                # Performance check: limit recursion depth
                if level > 10:
                    return

                if isinstance(element, Tag):
                    tag_name = element.name.lower()

                    # Only process meaningful elements
                    if (
                        tag_name in self.section_elements
                        or tag_name in self.heading_elements
                        or tag_name in self.block_elements
                    ):
                        text_content = element.get_text(strip=True)

                        # Skip empty or very small sections
                        if len(text_content) < 10:
                            return

                        section_type = self._identify_section_type(element)

                        # Get attributes (limited for performance)
                        attributes = {}
                        if element.attrs:
                            # Only keep essential attributes
                            for attr in ["id", "class", "role"]:
                                if attr in element.attrs:
                                    attributes[attr] = element.attrs[attr]

                        section = {
                            "content": str(element),
                            "text_content": text_content,
                            "tag_name": tag_name,
                            "level": level,
                            "section_type": section_type,
                            "attributes": attributes,
                            "title": self._extract_title_from_content(text_content),
                        }

                        sections.append(section)
                        section_count += 1

                    # Process children (limited depth)
                    if hasattr(element, "children") and level < 8:
                        for child in element.children:
                            process_element(child, level + 1)

            # Start processing from body or root
            body = soup.find("body")
            if body:
                process_element(body)
            else:
                process_element(soup)

            return sections[:MAX_SECTIONS_TO_PROCESS]  # Ensure we don't exceed limit

        except Exception as e:
            self.logger.warning(f"HTML parsing failed: {e}")
            return self._simple_html_parse(html)
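
    # Rough usage sketch (illustrative, not part of the original source; assumes an
    # HTMLChunkingStrategy instance is available as `strategy`). The walk is
    # depth-first, and elements whose text is shorter than 10 characters are skipped:
    #
    #     html = "<body><article><h1>Guide</h1><p>Install it with pip.</p></article></body>"
    #     sections = strategy._parse_html_structure(html)
    #     # -> roughly [{"tag_name": "article", "level": 1, ...},
    #     #             {"tag_name": "p", "level": 2, ...}]
    #     # (the <h1> is dropped because "Guide" is under the 10-character minimum)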

    def _simple_html_parse(self, html: str) -> list[dict[str, Any]]:
        """Simple HTML parsing for large files or when complex parsing fails.

        Args:
            html: HTML content to parse

        Returns:
            List of simple section dictionaries
        """
        try:
            soup = BeautifulSoup(html, "html.parser")

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Get clean text
            text = soup.get_text(separator="\n", strip=True)

            # Simple chunking by size
            chunk_size = self.chunk_size
            chunks = []

            # Split by paragraphs first
            paragraphs = re.split(r"\n\s*\n", text)
            current_chunk = ""

            for para in paragraphs:
                if len(current_chunk) + len(para) <= chunk_size:
                    current_chunk += para + "\n\n"
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = para + "\n\n"

                    # Limit total chunks
                    if len(chunks) >= MAX_SECTIONS_TO_PROCESS:
                        break

            # Add the last chunk if not empty
            if current_chunk and len(chunks) < MAX_SECTIONS_TO_PROCESS:
                chunks.append(current_chunk.strip())

            # Convert to section format
            sections = []
            for _i, chunk in enumerate(chunks):
                section = {
                    "content": chunk,
                    "text_content": chunk,
                    "tag_name": "div",
                    "level": 0,
                    "section_type": SectionType.DIV,
                    "attributes": {},
                    "title": self._extract_title_from_content(chunk),
                }
                sections.append(section)

            return sections

        except Exception as e:
            self.logger.error(f"Simple HTML parsing failed: {e}")
            # Ultimate fallback: return the entire content as one section
            return [
                {
                    "content": html,
                    "text_content": html,
                    "tag_name": "div",
                    "level": 0,
                    "section_type": SectionType.DIV,
                    "attributes": {},
                    "title": "HTML Document",
                }
            ]

    def _merge_small_sections(
        self, sections: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Merge small sections to create more meaningful chunks.

        Args:
            sections: List of sections to merge

        Returns:
            List of merged sections
        """
        if not sections:
            return []

        merged = []
        current_group = []
        current_size = 0
        min_size = 200  # Minimum size for standalone sections

        for section in sections:
            section_size = len(section.get("text_content", ""))

            # If section is large enough or is a significant element, keep it separate
            if (
                section_size >= min_size
                or section.get("tag_name") in self.section_elements
                or section.get("tag_name") in self.heading_elements
            ):
                # First, add any accumulated small sections
                if current_group:
                    merged_section = self._create_merged_section(current_group)
                    merged.append(merged_section)
                    current_group = []
                    current_size = 0

                # Add the large section
                merged.append(section)
            else:
                # Accumulate small sections
                current_group.append(section)
                current_size += section_size

                # If accumulated size is large enough, create a merged section
                if current_size >= min_size:
                    merged_section = self._create_merged_section(current_group)
                    merged.append(merged_section)
                    current_group = []
                    current_size = 0

        # Handle remaining small sections
        if current_group:
            merged_section = self._create_merged_section(current_group)
            merged.append(merged_section)

        return merged
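
    # Illustrative sketch (not part of the original source; `strategy` is an assumed
    # HTMLChunkingStrategy instance): with the min_size of 200 characters above,
    # consecutive small fragments are pooled, while headings and semantic elements
    # always stay separate. Section dicts below are abbreviated.
    #
    #     sections = [
    #         {"tag_name": "p", "text_content": "Short note.", "content": "..."},
    #         {"tag_name": "p", "text_content": "Another short note.", "content": "..."},
    #         {"tag_name": "h2", "text_content": "Next topic", "content": "..."},
    #     ]
    #     merged = strategy._merge_small_sections(sections)
    #     # -> the two small <p> fragments become one merged "div" section,
    #     #    followed by the <h2> section kept as-is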

    def _create_merged_section(self, sections: list[dict[str, Any]]) -> dict[str, Any]:
        """Create a merged section from a list of small sections.

        Args:
            sections: List of sections to merge

        Returns:
            Merged section dictionary
        """
        if not sections:
            return {}

        if len(sections) == 1:
            return sections[0]

        # Merge content
        merged_content = "\n\n".join(section.get("content", "") for section in sections)
        merged_text = "\n\n".join(
            section.get("text_content", "") for section in sections
        )

        # Use the first section's metadata as base
        merged_section = sections[0].copy()
        merged_section.update(
            {
                "content": merged_content,
                "text_content": merged_text,
                "title": f"Merged Section ({len(sections)} parts)",
                "tag_name": "div",  # Generic container
                "section_type": SectionType.DIV,
            }
        )

        return merged_section

    def _split_text(self, html: str) -> list[dict[str, Any]]:
        """Split HTML text into chunks based on semantic structure.

        Args:
            html: The HTML content to split

        Returns:
            List of dictionaries with chunk content and metadata
        """
        # Performance check: use simple parsing for large files
        if len(html) > SIMPLE_PARSING_THRESHOLD:
            self.logger.info(
                f"Using simple parsing for large HTML file ({len(html)} bytes)"
            )
            return self._simple_html_parse(html)

        # Parse HTML structure
        sections = self._parse_html_structure(html)

        if not sections:
            return self._simple_html_parse(html)

        # Merge small sections
        merged_sections = self._merge_small_sections(sections)

        # Split large sections if needed
        final_sections = []
        for section in merged_sections:
            content_size = len(section.get("content", ""))
            if content_size > self.chunk_size:
                # Split large sections
                split_parts = self._split_large_section(
                    section.get("content", ""), self.chunk_size
                )
                for i, part in enumerate(split_parts):
                    split_section = section.copy()
                    split_section.update(
                        {
                            "content": part,
                            "text_content": part,
                            "title": f"{section.get('title', 'Section')} (Part {i+1})",
                        }
                    )
                    final_sections.append(split_section)
            else:
                final_sections.append(section)

        return final_sections[:MAX_SECTIONS_TO_PROCESS]  # Ensure we don't exceed limit

    def _split_large_section(self, content: str, max_size: int) -> list[str]:
        """Split a large section into smaller parts.

        Args:
            content: Content to split
            max_size: Maximum size per part

        Returns:
            List of content parts
        """
        if len(content) <= max_size:
            return [content]

        # Simple splitting by size with word boundaries
        parts = []
        current_part = ""
        words = content.split()

        for word in words:
            if len(current_part) + len(word) + 1 <= max_size:
                current_part += word + " "
            else:
                if current_part:
                    parts.append(current_part.strip())
                current_part = word + " "

                # Limit number of parts
                if len(parts) >= 10:
                    break

        if current_part:
            parts.append(current_part.strip())

        return parts
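
    # Illustrative sketch (not part of the original source; `strategy` is an assumed
    # instance): splitting respects word boundaries and caps the number of parts at
    # roughly ten, dropping whatever content remains past the cap.
    #
    #     parts = strategy._split_large_section("alpha beta gamma delta", max_size=12)
    #     # -> ["alpha beta", "gamma delta"]  (no word is cut in half)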

    def _extract_section_title(self, chunk: str) -> str:
        """Extract a title from a chunk of HTML content.

        Args:
            chunk: HTML chunk content

        Returns:
            Extracted title
        """
        try:
            soup = BeautifulSoup(chunk, "html.parser")

            # Try to find title in various elements
            for tag in ["h1", "h2", "h3", "h4", "h5", "h6", "title"]:
                element = soup.find(tag)
                if element:
                    title = element.get_text(strip=True)
                    if title:
                        return title[:100]  # Limit title length

            # Try to find text in semantic elements
            for tag in ["article", "section", "main"]:
                element = soup.find(tag)
                if element:
                    text = element.get_text(strip=True)
                    if text:
                        return self._extract_title_from_content(text)

            # Fallback to first text content
            text = soup.get_text(strip=True)
            if text:
                return self._extract_title_from_content(text)

            return "Untitled Section"

        except Exception:
            return "Untitled Section"

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk an HTML document using semantic boundaries.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        try:
            # Check for very large files that should use fallback chunking
            if len(document.content) > MAX_HTML_SIZE_FOR_PARSING:
                self.logger.info(
                    f"HTML file too large ({len(document.content)} bytes), using fallback chunking"
                )
                self.progress_tracker.log_fallback(
                    document.id, f"Large HTML file ({len(document.content)} bytes)"
                )
                return self._fallback_chunking(document)

            # Parse HTML and extract semantic sections
            self.logger.debug("Parsing HTML structure")
            sections = self._split_text(document.content)

            if not sections:
                self.progress_tracker.finish_chunking(document.id, 0, "html")
                return []

            # Create chunk documents
            chunked_docs = []
            for i, section in enumerate(sections):
                chunk_content = section["content"]
                self.logger.debug(
                    f"Processing HTML section {i+1}/{len(sections)}",
                    extra={
                        "chunk_size": len(chunk_content),
                        "section_type": section.get("section_type", "unknown"),
                        "tag_name": section.get("tag_name", "unknown"),
                    },
                )

                # Create chunk document with enhanced metadata
                chunk_doc = self._create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_index=i,
                    total_chunks=len(sections),
                    skip_nlp=False,
                )

                # Add HTML-specific metadata
                chunk_doc.metadata.update(section)
                chunk_doc.metadata["chunking_strategy"] = "html"
                chunk_doc.metadata["parent_document_id"] = document.id

                chunked_docs.append(chunk_doc)

            # Finish progress tracking
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), "html"
            )
            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            # Fallback to default chunking
            self.progress_tracker.log_fallback(
                document.id, f"HTML parsing failed: {str(e)}"
            )
            return self._fallback_chunking(document)

    def _fallback_chunking(self, document: Document) -> list[Document]:
        """Simple fallback chunking when the main strategy fails.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        self.logger.info("Using fallback chunking strategy for HTML document")

        try:
            # Clean HTML and convert to text for simple chunking
            soup = BeautifulSoup(document.content, "html.parser")

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            text = soup.get_text(separator="\n", strip=True)

            # Simple chunking implementation based on fixed size
            chunk_size = self.chunk_size

            chunks = []
            # Split by paragraphs first
            paragraphs = re.split(r"\n\s*\n", text)
            current_chunk = ""

            for para in paragraphs:
                if len(current_chunk) + len(para) <= chunk_size:
                    current_chunk += para + "\n\n"
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = para + "\n\n"

                    # Limit total chunks
                    if len(chunks) >= MAX_SECTIONS_TO_PROCESS:
                        break

            # Add the last chunk if not empty
            if current_chunk and len(chunks) < MAX_SECTIONS_TO_PROCESS:
                chunks.append(current_chunk.strip())

            # Create chunked documents
            chunked_docs = []
            valid_chunk_index = 0
            for i, chunk_content in enumerate(chunks):
                # Validate chunk content
                if not chunk_content or not chunk_content.strip():
                    self.logger.warning(f"Skipping empty fallback chunk {i+1}")
                    continue

                # Use base class chunk creation
                chunk_doc = self._create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_index=valid_chunk_index,
                    total_chunks=len(chunks),  # Will be updated at the end
                    skip_nlp=False,  # Let base class decide
                )

                # Generate unique chunk ID
                chunk_doc.id = Document.generate_chunk_id(
                    document.id, valid_chunk_index
                )
                chunk_doc.metadata["parent_document_id"] = document.id
                chunk_doc.metadata["chunking_method"] = "fallback_html"

                chunked_docs.append(chunk_doc)
                valid_chunk_index += 1

            # Update total_chunks in all chunk metadata to reflect actual count
            for chunk_doc in chunked_docs:
                chunk_doc.metadata["total_chunks"] = len(chunked_docs)

            return chunked_docs

        except Exception as e:
            self.logger.error(f"Fallback chunking failed: {e}")
            # Ultimate fallback: return original document as single chunk
            chunk_doc = Document(
                content=document.content,
                metadata=document.metadata.copy(),
                source=document.source,
                source_type=document.source_type,
                url=document.url,
                title=document.title,
                content_type=document.content_type,
            )
            chunk_doc.id = Document.generate_chunk_id(document.id, 0)
            chunk_doc.metadata.update(
                {
                    "chunk_index": 0,
                    "total_chunks": 1,
                    "parent_document_id": document.id,
                    "chunking_method": "fallback_single",
                    "entities": [],
                    "pos_tags": [],
                    "nlp_skipped": True,
                    "skip_reason": "fallback_error",
                }
            )
            return [chunk_doc]

    def __del__(self):
        """Cleanup method."""
        # Call shutdown to clean up resources
        self.shutdown()

    def shutdown(self):
        """Shutdown the strategy and clean up resources."""
        # Shutdown thread pool executor if it exists
        if hasattr(self, "_executor") and self._executor:
            self._executor.shutdown(wait=True)
            self._executor = None

        # Clean up any cached data
        if hasattr(self, "_processed_chunks"):
            self._processed_chunks.clear()

        # Note: semantic_analyzer is now handled in base class
        # No additional cleanup needed for HTML strategy
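

# End-to-end usage sketch (illustrative, not part of the original module). How the
# Settings object is built depends on the wider qdrant_loader configuration, so it
# is left abstract here; the Document keyword arguments mirror those used in
# _fallback_chunking above, and all literal values are made up.
#
#     settings = ...  # qdrant_loader Settings instance, constructed per project config
#     strategy = HTMLChunkingStrategy(settings)
#     doc = Document(
#         content="<html><body><article><p>Some content...</p></article></body></html>",
#         metadata={"file_name": "page.html"},
#         source="example",
#         source_type="localfile",
#         url="https://example.invalid/page.html",
#         title="Example page",
#         content_type="text/html",
#     )
#     chunks = strategy.chunk_document(doc)
#     strategy.shutdown()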