Coverage for src/qdrant_loader/core/chunking/strategy/default_strategy.py: 79%
156 statements
1"""Default chunking strategy for text documents.
3This strategy uses character-based chunking for consistency with other strategies.
4When a tokenizer is available, it's used for better boundary detection to avoid
5splitting in the middle of tokens, but the chunk size limits are still based on
6character count.
7"""
9import re
11import structlog
13from qdrant_loader.config import Settings
14from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker
15from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy
16from qdrant_loader.core.document import Document
18logger = structlog.get_logger(__name__)
20# Maximum number of chunks to process to prevent performance issues
21MAX_CHUNKS_TO_PROCESS = 1000


class DefaultChunkingStrategy(BaseChunkingStrategy):
    """Default text chunking strategy using character-based splitting.

    This strategy splits text into chunks based on character count, ensuring
    consistency with other chunking strategies like the markdown strategy.
    When a tokenizer is available, it's used to find better split boundaries
    (avoiding splits in the middle of tokens/words), but the size limits
    are always based on character count.
    """

    def __init__(self, settings: Settings):
        super().__init__(settings)
        self.progress_tracker = ChunkingProgressTracker(logger)

        # Log configuration for debugging
        logger.info(
            "DefaultChunkingStrategy initialized",
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            tokenizer=self.tokenizer,
            has_encoding=self.encoding is not None,
            chunking_method="character-based"
            + (" with token boundary detection" if self.encoding is not None else ""),
        )

        # Warn about suspiciously small chunk sizes
        if self.chunk_size < 100:
            logger.warning(
                f"Very small chunk_size detected: {self.chunk_size} characters. "
                f"This may cause performance issues and excessive chunking. "
                f"Consider using a larger value (e.g., 1000-1500 characters)."
            )

    def _split_text(self, text: str) -> list[str]:
        """Split text into chunks using sentence boundaries and size limits.

        Args:
            text: The text to split

        Returns:
            List of text chunks
        """
        if not text.strip():
            return [""]

        # Use tokenizer-based chunking if available
        if self.encoding is not None:
            return self._split_text_with_tokenizer(text)
        else:
            return self._split_text_without_tokenizer(text)

    def _split_text_with_tokenizer(self, text: str) -> list[str]:
        """Split text using the tokenizer for token boundary detection while keeping character-based sizing.

        Args:
            text: The text to split

        Returns:
            List of text chunks
        """
        if self.encoding is None:
            # Fallback to character-based chunking
            return self._split_text_without_tokenizer(text)

        # Use character-based size limit (consistent with markdown strategy)
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        start_char = 0

        while start_char < len(text) and len(chunks) < MAX_CHUNKS_TO_PROCESS:
            # Calculate end position based on character count
            end_char = min(start_char + self.chunk_size, len(text))

            # Get the chunk text
            chunk_text = text[start_char:end_char]

            # If we're not at the end of the text, try to find a good token boundary
            # to avoid splitting in the middle of words/tokens
            if end_char < len(text):
                # Try to encode/decode to find a clean token boundary
                try:
                    # Get tokens for the chunk
                    tokens = self.encoding.encode(chunk_text)

                    # If the chunk ends mid-token, back up to the last complete token
                    # by decoding and checking if we get the same text
                    decoded_text = self.encoding.decode(tokens)
                    if len(decoded_text) < len(chunk_text):
                        # The last token was incomplete, use the decoded text
                        chunk_text = decoded_text
                        end_char = start_char + len(chunk_text)

                    # Alternatively, try to find a word boundary near the target end
                    remaining_chars = self.chunk_size - len(chunk_text)
                    if remaining_chars > 0 and end_char < len(text):
                        # Look ahead a bit to find a word boundary
                        lookahead_end = min(end_char + min(100, remaining_chars), len(text))
                        lookahead_text = text[end_char:lookahead_end]

                        # Find the first word boundary (space, newline, punctuation)
                        import re
                        word_boundary_match = re.search(r"[\s\n\.\!\?\;]", lookahead_text)
                        if word_boundary_match:
                            boundary_pos = word_boundary_match.start()
                            end_char = end_char + boundary_pos + 1
                            chunk_text = text[start_char:end_char]

                except Exception:
                    # If tokenizer operations fail, stick with character-based splitting
                    pass

            chunks.append(chunk_text)

            # Calculate overlap in characters and move start position forward
            if self.chunk_overlap > 0 and end_char < len(text):
                # Calculate how much to advance (chunk_size - overlap)
                advance = max(1, self.chunk_size - self.chunk_overlap)
                start_char += advance
            else:
                # No overlap, advance by full chunk size
                start_char = end_char

            # If we're near the end and the remaining text is small, include it in the last chunk
            if start_char < len(text) and (len(text) - start_char) <= self.chunk_overlap:
                # Create final chunk with remaining text
                final_chunk_text = text[start_char:]
                if final_chunk_text.strip():  # Only add if there's meaningful content
                    chunks.append(final_chunk_text)
                break

        # Log warning if we hit the chunk limit
        if len(chunks) >= MAX_CHUNKS_TO_PROCESS and start_char < len(text):
            logger.warning(
                f"Reached maximum chunk limit of {MAX_CHUNKS_TO_PROCESS}. "
                f"Document may be truncated."
            )

        return chunks

    def _split_text_without_tokenizer(self, text: str) -> list[str]:
        """Split text without a tokenizer using character-based chunking.

        Args:
            text: The text to split

        Returns:
            List of text chunks
        """
        # Safety check: if chunk_size is invalid, use a reasonable default
        if self.chunk_size <= 0:
            logger.warning(f"Invalid chunk_size {self.chunk_size}, using default 1000")
            effective_chunk_size = 1000
        else:
            effective_chunk_size = self.chunk_size

        if len(text) <= effective_chunk_size:
            return [text]

        # First, try to split by paragraphs (double newlines)
        paragraphs = re.split(r"\n\s*\n", text.strip())
        chunks = []
        current_chunk = ""

        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if not paragraph:
                continue

            # If adding this paragraph would exceed chunk size, finalize current chunk
            if (
                current_chunk
                and len(current_chunk) + len(paragraph) + 2 > effective_chunk_size
            ):
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                if current_chunk:
                    current_chunk += "\n\n" + paragraph
                else:
                    current_chunk = paragraph

        # Add the last chunk if it exists
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # If we still have chunks that are too large, split them further
        final_chunks = []
        for chunk in chunks:
            if len(chunk) <= effective_chunk_size:
                final_chunks.append(chunk)
            else:
                # Split large chunks by sentences
                sentences = re.split(r"(?<=[.!?])\s+", chunk)
                current_subchunk = ""

                for sentence in sentences:
                    if (
                        current_subchunk
                        and len(current_subchunk) + len(sentence) + 1
                        > effective_chunk_size
                    ):
                        if current_subchunk.strip():
                            final_chunks.append(current_subchunk.strip())
                        current_subchunk = sentence
                    else:
                        if current_subchunk:
                            current_subchunk += " " + sentence
                        else:
                            current_subchunk = sentence

                if current_subchunk.strip():
                    final_chunks.append(current_subchunk.strip())

        # Final fallback: if chunks are still too large, split by character count
        result_chunks = []
        for chunk in final_chunks:
            if len(chunk) <= effective_chunk_size:
                result_chunks.append(chunk)
            else:
                # Split by character count with word boundaries
                words = chunk.split()
                current_word_chunk = ""

                for word in words:
                    if (
                        current_word_chunk
                        and len(current_word_chunk) + len(word) + 1
                        > effective_chunk_size
                    ):
                        if current_word_chunk.strip():
                            result_chunks.append(current_word_chunk.strip())
                        current_word_chunk = word
                    else:
                        if current_word_chunk:
                            current_word_chunk += " " + word
                        else:
                            current_word_chunk = word

                if current_word_chunk.strip():
                    result_chunks.append(current_word_chunk.strip())

        # Ultimate fallback: if chunks are still too large (no word boundaries), split by character count
        final_result_chunks = []
        for chunk in result_chunks:
            if len(chunk) <= effective_chunk_size:
                final_result_chunks.append(chunk)
            else:
                # Split by pure character count as last resort
                for i in range(0, len(chunk), effective_chunk_size):
                    char_chunk = chunk[i : i + effective_chunk_size]
                    if char_chunk.strip():
                        final_result_chunks.append(char_chunk)

        # Safety check: if we somehow generated too many chunks from a small document, something is wrong
        if len(text) < 1000 and len(final_result_chunks) > 100:
            logger.error(
                f"Suspicious chunking result: {len(text)} chars generated {len(final_result_chunks)} chunks. "
                f"Chunk size: {effective_chunk_size}. Returning single chunk as fallback."
            )
            return [text]

        # Apply chunk limit
        if len(final_result_chunks) > MAX_CHUNKS_TO_PROCESS:
            logger.warning(
                f"Reached maximum chunk limit of {MAX_CHUNKS_TO_PROCESS}. "
                f"Document may be truncated. Text length: {len(text)}, Chunk size: {effective_chunk_size}"
            )
            final_result_chunks = final_result_chunks[:MAX_CHUNKS_TO_PROCESS]

        return [chunk for chunk in final_result_chunks if chunk.strip()]

    def chunk_document(self, document: Document) -> list[Document]:
        """Split a document into chunks while preserving metadata.

        Args:
            document: The document to chunk

        Returns:
            List of chunked documents with preserved metadata
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        logger.debug(
            "Starting default chunking",
            document_id=document.id,
            content_length=len(document.content),
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        try:
            # Split the text into chunks
            text_chunks = self._split_text(document.content)

            if not text_chunks:
                self.progress_tracker.finish_chunking(document.id, 0, "default")
                return []

            # Apply chunk limit at document level too
            if len(text_chunks) > MAX_CHUNKS_TO_PROCESS:
                logger.warning(
                    f"Document {document.id} generated {len(text_chunks)} chunks, "
                    f"limiting to {MAX_CHUNKS_TO_PROCESS}"
                )
                text_chunks = text_chunks[:MAX_CHUNKS_TO_PROCESS]

            # Create Document objects for each chunk using base class method
            chunk_documents = []
            for i, chunk_text in enumerate(text_chunks):
                chunk_doc = self._create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_text,
                    chunk_index=i,
                    total_chunks=len(text_chunks),
                    skip_nlp=False,
                )

                # Generate unique chunk ID
                chunk_doc.id = Document.generate_chunk_id(document.id, i)

                # Add strategy-specific metadata
                chunk_doc.metadata.update(
                    {
                        "chunking_strategy": "default",
                        "parent_document_id": document.id,
                    }
                )

                chunk_documents.append(chunk_doc)

            # Finish progress tracking
            self.progress_tracker.finish_chunking(
                document.id, len(chunk_documents), "default"
            )

            logger.debug(
                "Successfully chunked document",
                document_id=document.id,
                num_chunks=len(chunk_documents),
                strategy="default",
            )

            return chunk_documents

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            raise
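
Usage sketch (editor's addition, not part of the module above): a minimal example of how this strategy might be invoked. The Settings construction and the Document constructor arguments shown here are assumptions for illustration only; consult qdrant_loader.config and qdrant_loader.core.document for the real signatures.

from qdrant_loader.config import Settings
from qdrant_loader.core.document import Document
from qdrant_loader.core.chunking.strategy.default_strategy import DefaultChunkingStrategy

# Assumption: in practice, Settings is loaded from the project's configuration,
# which supplies chunk_size and chunk_overlap to the base strategy.
settings = Settings()

strategy = DefaultChunkingStrategy(settings)

# Assumption: Document accepts these keyword arguments; the real constructor may differ.
doc = Document(
    content="First paragraph.\n\nSecond paragraph. " * 200,
    source="example.txt",
    source_type="localfile",
    title="Example document",
    metadata={},
)

chunks = strategy.chunk_document(doc)
print(f"{len(chunks)} chunks; first chunk has {len(chunks[0].content)} characters")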