Coverage for src/qdrant_loader/core/chunking/strategy/default_strategy.py: 86%
66 statements
1"""Default chunking strategy for text documents using modular architecture.
3This strategy uses intelligent text-based chunking with enhanced metadata extraction.
4It follows the modern modular architecture pattern established in MarkdownChunkingStrategy,
5using specialized text-processing components for optimal text document handling.
6"""
8from typing import TYPE_CHECKING
10import structlog
12from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker
13from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy
14from qdrant_loader.core.document import Document
16from .default import (
17 TextChunkProcessor,
18 TextDocumentParser,
19 TextMetadataExtractor,
20 TextSectionSplitter,
21)
23if TYPE_CHECKING:
24 from qdrant_loader.config import Settings
26logger = structlog.get_logger(__name__)
29class DefaultChunkingStrategy(BaseChunkingStrategy):
30 """Modern default text chunking strategy using modular architecture.
32 This strategy intelligently splits text documents into chunks while preserving
33 semantic meaning and structure. Each chunk includes:
34 - Intelligent text analysis and boundaries
35 - Enhanced metadata with text-specific features
36 - Content quality metrics and readability analysis
37 - Semantic analysis results when appropriate
39 The strategy uses a modular architecture with focused components:
40 - TextDocumentParser: Handles text structure analysis
41 - TextSectionSplitter: Manages intelligent text splitting strategies
42 - TextMetadataExtractor: Enriches chunks with comprehensive text metadata
43 - TextChunkProcessor: Coordinates processing and semantic analysis
44 """
46 def __init__(self, settings: "Settings"):
47 """Initialize the default chunking strategy.
49 Args:
50 settings: Configuration settings
51 """
52 super().__init__(settings)
53 self.progress_tracker = ChunkingProgressTracker(logger)
55 # Initialize modular components
56 self.document_parser = TextDocumentParser()
57 self.section_splitter = TextSectionSplitter(settings)
58 self.metadata_extractor = TextMetadataExtractor()
59 self.chunk_processor = TextChunkProcessor(settings)
61 # Give section splitter access to tokenizer
62 self.section_splitter._parent_strategy = self
64 # Apply any chunk overlap that was set before components were initialized
65 if hasattr(self, "_chunk_overlap"):
66 self.chunk_overlap = self._chunk_overlap
68 # Log configuration for debugging
69 logger.info(
70 "DefaultChunkingStrategy initialized with modular architecture",
71 chunk_size=self.chunk_size,
72 chunk_overlap=self.chunk_overlap,
73 tokenizer=self.tokenizer,
74 has_encoding=self.encoding is not None,
75 chunking_method="intelligent_text_processing",
76 )
78 # Warn about suspiciously small chunk sizes
79 if self.chunk_size < 100:
80 logger.warning(
81 f"Very small chunk_size detected: {self.chunk_size} characters. "
82 f"This may cause performance issues and excessive chunking. "
83 f"Consider using a larger value (e.g., 1000-1500 characters)."
84 )
86 def chunk_document(self, document: Document) -> list[Document]:
87 """Chunk a text document into intelligent semantic sections.
89 Args:
90 document: The document to chunk
92 Returns:
93 List of chunked documents with enhanced metadata
94 """
95 file_name = (
96 document.metadata.get("file_name")
97 or document.metadata.get("original_filename")
98 or document.title
99 or f"{document.source_type}:{document.source}"
100 )
102 # Start progress tracking
103 self.progress_tracker.start_chunking(
104 document.id,
105 document.source,
106 document.source_type,
107 len(document.content),
108 file_name,
109 )
111 # Provide user guidance on expected chunk count
112 estimated_chunks = self.chunk_processor.estimate_chunk_count(document.content)
113 logger.info(
114 f"Processing document: {document.title} ({len(document.content):,} chars)",
115 extra={
116 "estimated_chunks": estimated_chunks,
117 "chunk_size": self.settings.global_config.chunking.chunk_size,
118 "max_chunks_allowed": self.settings.global_config.chunking.max_chunks_per_document,
119 },
120 )
122 try:
123 # Parse document structure and split into sections
124 logger.debug("Analyzing document structure and splitting into sections")
125 document_structure = self.document_parser.parse_document_structure(
126 document.content
127 )
128 chunks_metadata = self.section_splitter.split_sections(
129 document.content, document
130 )
132 if not chunks_metadata:
133 self.progress_tracker.finish_chunking(document.id, 0, "default")
134 return []
136 # Apply configuration-driven safety limit
137 max_chunks = self.settings.global_config.chunking.max_chunks_per_document
138 if len(chunks_metadata) > max_chunks:
139 logger.warning(
140 f"Document generated {len(chunks_metadata)} chunks, limiting to {max_chunks} per config. "
141 f"Consider increasing max_chunks_per_document in config or using larger chunk_size. "
142 f"Document: {document.title}"
143 )
144 chunks_metadata = chunks_metadata[:max_chunks]
146 # Create chunk documents
147 chunked_docs = []
148 for i, chunk_meta in enumerate(chunks_metadata):
149 chunk_content = chunk_meta["content"]
150 logger.debug(
151 f"Processing chunk {i+1}/{len(chunks_metadata)}",
152 extra={
153 "chunk_size": len(chunk_content),
154 "section_type": chunk_meta.get("section_type", "text"),
155 "word_count": chunk_meta.get("word_count", 0),
156 },
157 )
159 # Add document structure info to chunk metadata
160 chunk_meta.update(
161 {
162 "document_structure": document_structure,
163 "chunking_strategy": "default_modular",
164 }
165 )
167 # Enhanced: Use hierarchical metadata extraction
168 enriched_metadata = (
169 self.metadata_extractor.extract_hierarchical_metadata(
170 chunk_content, chunk_meta, document
171 )
172 )
174 # Create chunk document using the chunk processor
175 # Skip NLP for small documents or documents that might cause LDA issues
176 skip_nlp = self.chunk_processor.should_skip_semantic_analysis(
177 chunk_content, enriched_metadata
178 )
180 chunk_doc = self.chunk_processor.create_chunk_document(
181 original_doc=document,
182 chunk_content=chunk_content,
183 chunk_index=i,
184 total_chunks=len(chunks_metadata),
185 chunk_metadata=enriched_metadata,
186 skip_nlp=skip_nlp,
187 )
189 chunked_docs.append(chunk_doc)
191 # Update progress
192 self.progress_tracker.update_progress(document.id, i + 1)
194 # Finish progress tracking
195 self.progress_tracker.finish_chunking(
196 document.id, len(chunked_docs), "default"
197 )
199 logger.info(
200 "Successfully chunked document with modular architecture",
201 document_id=document.id,
202 num_chunks=len(chunked_docs),
203 strategy="default_modular",
204 avg_chunk_size=(
205 sum(len(doc.content) for doc in chunked_docs) // len(chunked_docs)
206 if chunked_docs
207 else 0
208 ),
209 )
211 return chunked_docs
213 except Exception as e:
214 self.progress_tracker.log_error(document.id, str(e))
215 logger.error(
216 "Error chunking document with modular architecture",
217 document_id=document.id,
218 error=str(e),
219 exc_info=True,
220 )
221 raise
223 def shutdown(self):
224 """Clean up resources used by the strategy."""
225 logger.debug("Shutting down DefaultChunkingStrategy")
226 try:
227 # Clean up modular components
228 if hasattr(self, "chunk_processor") and hasattr(
229 self.chunk_processor, "shutdown"
230 ):
231 self.chunk_processor.shutdown()
232 except Exception as e:
233 logger.warning(f"Error during DefaultChunkingStrategy shutdown: {e}")
235 def __del__(self):
236 """Ensure cleanup on deletion."""
237 try:
238 self.shutdown()
239 except Exception:
240 # Ignore errors during cleanup in destructor
241 pass
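

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal sketch of how this strategy might be driven, assuming a configured
# `Settings` instance and a populated `Document` are created elsewhere; their
# construction is not shown in this module, so it is not reproduced here. The
# helper name below is hypothetical and uses only the methods defined above.
def _example_chunking_run(settings: "Settings", document: Document) -> list[Document]:
    """Hypothetical helper: chunk one document and release strategy resources."""
    strategy = DefaultChunkingStrategy(settings)
    try:
        # chunk_document() returns the enriched chunk documents described above
        return strategy.chunk_document(document)
    finally:
        # shutdown() cleans up the chunk processor, mirroring shutdown() above
        strategy.shutdown()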