Coverage for src/qdrant_loader/core/chunking/strategy/code_strategy.py: 73%

70 statements  

coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Code-specific chunking strategy for programming languages.""" 

2 

3import structlog 

4 

5from qdrant_loader.config import Settings 

6from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker 

7from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy 

8from qdrant_loader.core.document import Document 

9 

10from .code import ( 

11 CodeChunkProcessor, 

12 CodeDocumentParser, 

13 CodeMetadataExtractor, 

14 CodeSectionSplitter, 

15) 

16 

17logger = structlog.get_logger(__name__) 

18 

19 

20class CodeChunkingStrategy(BaseChunkingStrategy): 

21 """Modern code chunking strategy using modular architecture. 

22 

23 This strategy uses AST parsing (primarily tree-sitter) to split code files into 

24 chunks based on semantic code elements, preserving the code structure and hierarchy. 

25 Uses modular components for parsing, splitting, metadata extraction, and chunk processing. 

26 """ 

27 

28 def __init__(self, settings: Settings): 

29 """Initialize the code chunking strategy. 

30 

31 Args: 

32 settings: Configuration settings 

33 """ 

34 super().__init__(settings) 

35 self.logger = logger 

36 self.progress_tracker = ChunkingProgressTracker(logger) 

37 

38 # Initialize modular components 

39 self.document_parser = CodeDocumentParser(settings) 

40 self.section_splitter = CodeSectionSplitter(settings) 

41 self.metadata_extractor = CodeMetadataExtractor(settings) 

42 self.chunk_processor = CodeChunkProcessor(settings) 

43 

44 # Code-specific configuration 

45 self.code_config = settings.global_config.chunking.strategies.code 

46 self.chunk_size_threshold = getattr( 

47 self.code_config, "max_file_size_for_ast", 40000 

48 ) 
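        # NOTE: getattr() falls back to 40,000 characters when the strategy
        # config does not define max_file_size_for_ast.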


        logger.info(
            "CodeChunkingStrategy initialized with modular architecture",
            extra={
                "chunk_size": settings.global_config.chunking.chunk_size,
                "chunk_overlap": settings.global_config.chunking.chunk_overlap,
                "max_file_size_for_ast": self.code_config.max_file_size_for_ast,
                "enable_ast_parsing": self.code_config.enable_ast_parsing,
                "enable_dependency_analysis": self.code_config.enable_dependency_analysis,
                "chunking_method": "intelligent_ast_parsing",
            },
        )

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a code document using the modular approach.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        # Give the user guidance on the expected chunk count
        estimated_chunks = self.chunk_processor.estimate_chunk_count(document.content)
        logger.info(
            f"Processing code document: {document.title} ({len(document.content):,} chars)",
            extra={
                "estimated_chunks": estimated_chunks,
                "chunk_size": self.settings.global_config.chunking.chunk_size,
                "max_chunks_allowed": self.settings.global_config.chunking.max_chunks_per_document,
                "file_type": "code",
            },
        )

        try:
            # Parse the document structure first
            logger.debug("Analyzing code document structure")
            document_structure = self.document_parser.parse_document_structure(
                document.content
            )

            # Split the content into semantic sections using the section splitter
            logger.debug("Splitting code into semantic sections")
            chunks_metadata = self.section_splitter.split_sections(
                document.content, document
            )

            if not chunks_metadata:
                self.progress_tracker.finish_chunking(document.id, 0, "code")
                return []

            # Apply the configuration-driven safety limit
            max_chunks = self.settings.global_config.chunking.max_chunks_per_document
            if len(chunks_metadata) > max_chunks:
                logger.warning(
                    f"Code document generated {len(chunks_metadata)} chunks, limiting to {max_chunks} per config. "
                    f"Consider increasing max_chunks_per_document in config or using a larger chunk_size. "
                    f"Document: {document.title}"
                )
                chunks_metadata = chunks_metadata[:max_chunks]
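                # Sections beyond the limit are dropped rather than merged into
                # earlier chunks.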


            # Create chunk documents
            chunked_docs = []
            for i, chunk_meta in enumerate(chunks_metadata):
                chunk_content = chunk_meta["content"]
                logger.debug(
                    f"Processing code chunk {i + 1}/{len(chunks_metadata)}",
                    extra={
                        "chunk_size": len(chunk_content),
                        "element_type": chunk_meta.get("element_type", "unknown"),
                        "language": chunk_meta.get("language", "unknown"),
                    },
                )

                # Add document structure info to the chunk metadata
                chunk_meta.update(
                    {
                        "document_structure": document_structure,
                        "chunking_strategy": "code_modular",
                    }
                )

                # Enhanced: use hierarchical metadata extraction
                enriched_metadata = (
                    self.metadata_extractor.extract_hierarchical_metadata(
                        chunk_content, chunk_meta, document
                    )
                )

                # Create the chunk document using the chunk processor.
                # Skip NLP for large code chunks or generated code.
                skip_nlp, skip_reason = (
                    self.chunk_processor.should_skip_semantic_analysis(
                        chunk_content, enriched_metadata
                    )
                )
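                # skip_reason is currently informational only; it is not attached
                # to the resulting chunk document.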


                chunk_doc = self.chunk_processor.create_chunk_document(
                    original_doc=document,
                    chunk_content=chunk_content,
                    chunk_index=i,
                    total_chunks=len(chunks_metadata),
                    chunk_metadata=enriched_metadata,
                    skip_nlp=skip_nlp,
                )

                chunked_docs.append(chunk_doc)

            # Finish progress tracking
            avg_chunk_size = (
                sum(len(doc.content) for doc in chunked_docs) // len(chunked_docs)
                if chunked_docs
                else 0
            )
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), "code_modular"
            )

            logger.info(
                "Successfully chunked code document with modular architecture",
                extra={
                    "document_id": document.id,
                    "num_chunks": len(chunked_docs),
                    "avg_chunk_size": avg_chunk_size,
                    "strategy": "code_modular",
                },
            )

            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            logger.error(f"Code chunking failed: {e}", exc_info=True)
            # Fall back to default chunking
            self.progress_tracker.log_fallback(
                document.id, f"Code parsing failed: {e}"
            )
            return self._fallback_chunking(document)


    def _fallback_chunking(self, document: Document) -> list[Document]:
        """Fallback chunking using a simple text-based approach.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents produced by the fallback approach
        """
        logger.info(f"Using fallback chunking for code document: {document.title}")

        # Use the section splitter's fallback method
        fallback_sections = self.section_splitter._fallback_text_split(document.content)

        # Create chunk documents
        chunked_docs = []
        for i, section in enumerate(fallback_sections):
            chunk_content = section["content"]
            chunk_metadata = section["metadata"]

            # Add fallback-specific metadata; this path is only reached when
            # AST-based parsing raised, so record that as the reason.
            chunk_metadata.update(
                {
                    "chunking_strategy": "code_fallback",
                    "fallback_reason": "code_parsing_failed",
                }
            )

            # Create the chunk document
            chunk_doc = self.chunk_processor.create_chunk_document(
                original_doc=document,
                chunk_content=chunk_content,
                chunk_index=i,
                total_chunks=len(fallback_sections),
                chunk_metadata=chunk_metadata,
                skip_nlp=True,  # Skip NLP for fallback chunks
            )

            chunked_docs.append(chunk_doc)

        return chunked_docs

    def shutdown(self):
        """Clean up resources used by the code chunking strategy."""
        logger.debug("Shutting down CodeChunkingStrategy")

        # Clean up document parser resources
        if hasattr(self.document_parser, "_parsers"):
            self.document_parser._parsers.clear()

        # No additional cleanup is needed for the other components
        logger.debug("CodeChunkingStrategy shutdown complete")
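
For orientation, a minimal usage sketch follows. It is not part of the covered module, and the helper name chunk_code_file is hypothetical; it assumes an already-populated Settings object and a Document instance, both provided elsewhere by qdrant_loader:

from qdrant_loader.config import Settings
from qdrant_loader.core.chunking.strategy.code_strategy import CodeChunkingStrategy
from qdrant_loader.core.document import Document


def chunk_code_file(settings: Settings, document: Document) -> list[Document]:
    """Run the code strategy over a single document, then release parser resources."""
    strategy = CodeChunkingStrategy(settings)
    try:
        # chunk_document returns [] when no sections are found, and falls back
        # to plain text splitting internally if AST parsing raises.
        return strategy.chunk_document(document)
    finally:
        strategy.shutdown()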