Coverage for src/qdrant_loader/core/chunking/strategy/code_strategy.py: 73%
70 statements
coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Code-specific chunking strategy for programming languages."""
3import structlog
5from qdrant_loader.config import Settings
6from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker
7from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy
8from qdrant_loader.core.document import Document
10from .code import (
11 CodeChunkProcessor,
12 CodeDocumentParser,
13 CodeMetadataExtractor,
14 CodeSectionSplitter,
15)
17logger = structlog.get_logger(__name__)
20class CodeChunkingStrategy(BaseChunkingStrategy):
21 """Modern code chunking strategy using modular architecture.
23 This strategy uses AST parsing (primarily tree-sitter) to split code files into
24 chunks based on semantic code elements, preserving the code structure and hierarchy.
25 Uses modular components for parsing, splitting, metadata extraction, and chunk processing.
26 """

    def __init__(self, settings: Settings):
        """Initialize the code chunking strategy.

        Args:
            settings: Configuration settings
        """
        super().__init__(settings)
        self.logger = logger
        self.progress_tracker = ChunkingProgressTracker(logger)

        # Initialize modular components
        self.document_parser = CodeDocumentParser(settings)
        self.section_splitter = CodeSectionSplitter(settings)
        self.metadata_extractor = CodeMetadataExtractor(settings)
        self.chunk_processor = CodeChunkProcessor(settings)

        # Code-specific configuration
        self.code_config = settings.global_config.chunking.strategies.code
        self.chunk_size_threshold = getattr(
            self.code_config, "max_file_size_for_ast", 40000
        )
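        # The getattr default keeps configs that lack max_file_size_for_ast
        # working; 40000 is then used as the AST size threshold.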

        logger.info(
            "CodeChunkingStrategy initialized with modular architecture",
            extra={
                "chunk_size": settings.global_config.chunking.chunk_size,
                "chunk_overlap": settings.global_config.chunking.chunk_overlap,
                # Use the guarded value so a config without the attribute
                # cannot crash this log call
                "max_file_size_for_ast": self.chunk_size_threshold,
                "enable_ast_parsing": self.code_config.enable_ast_parsing,
                "enable_dependency_analysis": self.code_config.enable_dependency_analysis,
                "chunking_method": "intelligent_ast_parsing",
            },
        )

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a code document using the modular approach.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        # Provide user guidance on the expected chunk count
        estimated_chunks = self.chunk_processor.estimate_chunk_count(document.content)
        logger.info(
            f"Processing code document: {document.title} ({len(document.content):,} chars)",
            extra={
                "estimated_chunks": estimated_chunks,
                "chunk_size": self.settings.global_config.chunking.chunk_size,
                "max_chunks_allowed": self.settings.global_config.chunking.max_chunks_per_document,
                "file_type": "code",
            },
        )

        try:
            # Parse the document structure first
            logger.debug("Analyzing code document structure")
            document_structure = self.document_parser.parse_document_structure(
                document.content
            )

            # Split content into intelligent sections using the section splitter
            logger.debug("Splitting code into semantic sections")
            chunks_metadata = self.section_splitter.split_sections(
                document.content, document
            )

            if not chunks_metadata:
                self.progress_tracker.finish_chunking(document.id, 0, "code")
                return []

            # Apply the configuration-driven safety limit
            max_chunks = self.settings.global_config.chunking.max_chunks_per_document
            if len(chunks_metadata) > max_chunks:
                logger.warning(
                    f"Code document generated {len(chunks_metadata)} chunks, limiting to {max_chunks} per config. "
                    f"Consider increasing max_chunks_per_document in the config or using a larger chunk_size. "
                    f"Document: {document.title}"
                )
                chunks_metadata = chunks_metadata[:max_chunks]

            # Create chunk documents
            chunked_docs = []
            for i, chunk_meta in enumerate(chunks_metadata):
                chunk_content = chunk_meta["content"]
                logger.debug(
                    f"Processing code chunk {i+1}/{len(chunks_metadata)}",
                    extra={
                        "chunk_size": len(chunk_content),
                        "element_type": chunk_meta.get("element_type", "unknown"),
                        "language": chunk_meta.get("language", "unknown"),
                    },
                )

                # Add document structure info to the chunk metadata
                chunk_meta.update(
                    {
                        "document_structure": document_structure,
                        "chunking_strategy": "code_modular",
                    }
                )

                # Use hierarchical metadata extraction
                enriched_metadata = (
                    self.metadata_extractor.extract_hierarchical_metadata(
                        chunk_content, chunk_meta, document
                    )
                )

                # Create the chunk document using the chunk processor.
                # Skip NLP for large code chunks or generated code.
                skip_nlp, skip_reason = (
                    self.chunk_processor.should_skip_semantic_analysis(
                        chunk_content, enriched_metadata
                    )
                )

                chunk_doc = self.chunk_processor.create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_index=i,
                    total_chunks=len(chunks_metadata),
                    chunk_metadata=enriched_metadata,
                    skip_nlp=skip_nlp,
                )

                chunked_docs.append(chunk_doc)

            # Finish progress tracking
            avg_chunk_size = (
                sum(len(doc.content) for doc in chunked_docs) // len(chunked_docs)
                if chunked_docs
                else 0
            )
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), "code_modular"
            )

            logger.info(
                "Successfully chunked code document with modular architecture",
                extra={
                    "document_id": document.id,
                    "num_chunks": len(chunked_docs),
                    "avg_chunk_size": avg_chunk_size,
                    "strategy": "code_modular",
                },
            )

            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            logger.error(f"Code chunking failed: {e}", exc_info=True)
            # Fall back to simple text-based chunking
            self.progress_tracker.log_fallback(
                document.id, f"Code parsing failed: {str(e)}"
            )
            return self._fallback_chunking(document)

    def _fallback_chunking(self, document: Document) -> list[Document]:
        """Fallback chunking using a simple text-based approach.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents using the fallback approach
        """
        logger.info(
            f"Using fallback text-based chunking for code document: {document.title}"
        )

        # Use the section splitter's fallback method
        fallback_sections = self.section_splitter._fallback_text_split(document.content)

        # Create chunk documents
        chunked_docs = []
        for i, section in enumerate(fallback_sections):
            chunk_content = section["content"]
            chunk_metadata = section["metadata"]

            # Add fallback-specific metadata
            chunk_metadata.update(
                {
                    "chunking_strategy": "code_fallback",
                    "fallback_reason": "file_too_large",
                }
            )

            # Create the chunk document
            chunk_doc = self.chunk_processor.create_chunk_document(
                original_doc=document,
                chunk_content=chunk_content,
                chunk_index=i,
                total_chunks=len(fallback_sections),
                chunk_metadata=chunk_metadata,
                skip_nlp=True,  # Skip NLP for fallback chunks
            )

            chunked_docs.append(chunk_doc)

        return chunked_docs

    def shutdown(self):
        """Clean up resources used by the code chunking strategy."""
        logger.debug("Shutting down CodeChunkingStrategy")

        # Clean up document parser resources
        if hasattr(self.document_parser, "_parsers"):
            self.document_parser._parsers.clear()

        # No additional cleanup is needed for the other components
        logger.debug("CodeChunkingStrategy shutdown complete")
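

# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hypothetical example of driving this strategy end to end. It
# assumes `settings` is a fully populated qdrant_loader Settings object and
# `document` is a Document whose content is source code; neither is built here.
#
#     strategy = CodeChunkingStrategy(settings)
#     try:
#         chunks = strategy.chunk_document(document)  # -> list[Document]
#         for chunk in chunks:
#             print(chunk.metadata.get("chunking_strategy"), len(chunk.content))
#     finally:
#         strategy.shutdown()  # releases any cached parsers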