Coverage for src/qdrant_loader/core/chunking/strategy/default_strategy.py: 86%

66 statements  

coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Default chunking strategy for text documents using modular architecture. 

2 

3This strategy uses intelligent text-based chunking with enhanced metadata extraction. 

4It follows the modern modular architecture pattern established in MarkdownChunkingStrategy, 

5using specialized text-processing components for optimal text document handling. 

6""" 

7 

8from typing import TYPE_CHECKING 

9 

10import structlog 

11 

12from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker 

13from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy 

14from qdrant_loader.core.document import Document 

15 

16from .default import ( 

17 TextChunkProcessor, 

18 TextDocumentParser, 

19 TextMetadataExtractor, 

20 TextSectionSplitter, 

21) 

22 

23if TYPE_CHECKING: 

24 from qdrant_loader.config import Settings 

25 

26logger = structlog.get_logger(__name__) 

27 

28 

29class DefaultChunkingStrategy(BaseChunkingStrategy): 

30 """Modern default text chunking strategy using modular architecture. 

31 

32 This strategy intelligently splits text documents into chunks while preserving 

33 semantic meaning and structure. Each chunk includes: 

34 - Intelligent text analysis and boundaries 

35 - Enhanced metadata with text-specific features 

36 - Content quality metrics and readability analysis 

37 - Semantic analysis results when appropriate 

38 

39 The strategy uses a modular architecture with focused components: 

40 - TextDocumentParser: Handles text structure analysis 

41 - TextSectionSplitter: Manages intelligent text splitting strategies 

42 - TextMetadataExtractor: Enriches chunks with comprehensive text metadata 

43 - TextChunkProcessor: Coordinates processing and semantic analysis 

44 """ 

45 

46 def __init__(self, settings: "Settings"): 

47 """Initialize the default chunking strategy. 

48 

49 Args: 

50 settings: Configuration settings 

51 """ 

52 super().__init__(settings) 

53 self.progress_tracker = ChunkingProgressTracker(logger) 

54 

55 # Initialize modular components 

56 self.document_parser = TextDocumentParser() 

57 self.section_splitter = TextSectionSplitter(settings) 

58 self.metadata_extractor = TextMetadataExtractor() 

59 self.chunk_processor = TextChunkProcessor(settings) 

60 

61 # Give section splitter access to tokenizer 

62 self.section_splitter._parent_strategy = self 

63 

64 # Apply any chunk overlap that was set before components were initialized 

65 if hasattr(self, "_chunk_overlap"): 

66 self.chunk_overlap = self._chunk_overlap 

67 

68 # Log configuration for debugging 

69 logger.info( 

70 "DefaultChunkingStrategy initialized with modular architecture", 

71 chunk_size=self.chunk_size, 

72 chunk_overlap=self.chunk_overlap, 

73 tokenizer=self.tokenizer, 

74 has_encoding=self.encoding is not None, 

75 chunking_method="intelligent_text_processing", 

76 ) 

77 

78 # Warn about suspiciously small chunk sizes 

79 if self.chunk_size < 100: 

80 logger.warning( 

81 f"Very small chunk_size detected: {self.chunk_size} characters. " 

82 f"This may cause performance issues and excessive chunking. " 

83 f"Consider using a larger value (e.g., 1000-1500 characters)." 

84 ) 

85 
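    # Configuration note: chunk_size, chunk_overlap, tokenizer, and encoding are
    # provided by the BaseChunkingStrategy initializer (via super().__init__ above),
    # while the max_chunks_per_document cap read in chunk_document() below comes
    # from settings.global_config.chunking.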

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a text document into intelligent semantic sections.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents with enhanced metadata
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        # Provide user guidance on expected chunk count
        estimated_chunks = self.chunk_processor.estimate_chunk_count(document.content)
        logger.info(
            f"Processing document: {document.title} ({len(document.content):,} chars)",
            extra={
                "estimated_chunks": estimated_chunks,
                "chunk_size": self.settings.global_config.chunking.chunk_size,
                "max_chunks_allowed": self.settings.global_config.chunking.max_chunks_per_document,
            },
        )

        try:
            # Parse document structure and split into sections
            logger.debug("Analyzing document structure and splitting into sections")
            document_structure = self.document_parser.parse_document_structure(
                document.content
            )
            chunks_metadata = self.section_splitter.split_sections(
                document.content, document
            )

            if not chunks_metadata:
                self.progress_tracker.finish_chunking(document.id, 0, "default")
                return []

            # Apply configuration-driven safety limit
            max_chunks = self.settings.global_config.chunking.max_chunks_per_document
            if len(chunks_metadata) > max_chunks:
                logger.warning(
                    f"Document generated {len(chunks_metadata)} chunks, limiting to {max_chunks} per config. "
                    f"Consider increasing max_chunks_per_document in config or using larger chunk_size. "
                    f"Document: {document.title}"
                )
                chunks_metadata = chunks_metadata[:max_chunks]

            # Create chunk documents
            chunked_docs = []
            for i, chunk_meta in enumerate(chunks_metadata):
                chunk_content = chunk_meta["content"]
                logger.debug(
                    f"Processing chunk {i+1}/{len(chunks_metadata)}",
                    extra={
                        "chunk_size": len(chunk_content),
                        "section_type": chunk_meta.get("section_type", "text"),
                        "word_count": chunk_meta.get("word_count", 0),
                    },
                )

                # Add document structure info to chunk metadata
                chunk_meta.update(
                    {
                        "document_structure": document_structure,
                        "chunking_strategy": "default_modular",
                    }
                )

                # Enhanced: Use hierarchical metadata extraction
                enriched_metadata = (
                    self.metadata_extractor.extract_hierarchical_metadata(
                        chunk_content, chunk_meta, document
                    )
                )

                # Create chunk document using the chunk processor.
                # Skip NLP for small documents or documents that might cause LDA issues.
                skip_nlp = self.chunk_processor.should_skip_semantic_analysis(
                    chunk_content, enriched_metadata
                )

                chunk_doc = self.chunk_processor.create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_index=i,
                    total_chunks=len(chunks_metadata),
                    chunk_metadata=enriched_metadata,
                    skip_nlp=skip_nlp,
                )

                chunked_docs.append(chunk_doc)

                # Update progress
                self.progress_tracker.update_progress(document.id, i + 1)

            # Finish progress tracking
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), "default"
            )

            logger.info(
                "Successfully chunked document with modular architecture",
                document_id=document.id,
                num_chunks=len(chunked_docs),
                strategy="default_modular",
                avg_chunk_size=(
                    sum(len(doc.content) for doc in chunked_docs) // len(chunked_docs)
                    if chunked_docs
                    else 0
                ),
            )

            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            logger.error(
                "Error chunking document with modular architecture",
                document_id=document.id,
                error=str(e),
                exc_info=True,
            )
            raise

    def shutdown(self):
        """Clean up resources used by the strategy."""
        logger.debug("Shutting down DefaultChunkingStrategy")
        try:
            # Clean up modular components
            if hasattr(self, "chunk_processor") and hasattr(
                self.chunk_processor, "shutdown"
            ):
                self.chunk_processor.shutdown()
        except Exception as e:
            logger.warning(f"Error during DefaultChunkingStrategy shutdown: {e}")

    def __del__(self):
        """Ensure cleanup on deletion."""
        try:
            self.shutdown()
        except Exception:
            # Ignore errors during cleanup in destructor
            pass
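
A minimal usage sketch (illustrative, not part of the listed module): assuming settings is a fully loaded qdrant_loader Settings instance and document is a qdrant_loader Document obtained elsewhere in the pipeline, the strategy is driven like this:

    strategy = DefaultChunkingStrategy(settings)
    try:
        # Returns a list[Document], one per chunk, with enriched metadata
        chunks = strategy.chunk_document(document)
    finally:
        # Releases resources held by the chunk processor
        strategy.shutdown()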