Coverage for src/qdrant_loader/core/chunking/strategy/code_strategy.py: 81%

343 statements  

coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""Code-specific chunking strategy for programming languages.""" 

2 

3import ast 

4from dataclasses import dataclass, field 

5from enum import Enum 

6from typing import Any, Optional 

7 

8import structlog 

9 

10# Tree-sitter imports with error handling 

11try: 

12 from tree_sitter_languages import get_language, get_parser 

13 

14 TREE_SITTER_AVAILABLE = True 

15except ImportError: 

16 TREE_SITTER_AVAILABLE = False 

17 get_language = None 

18 get_parser = None 

19 

20from qdrant_loader.config import Settings 

21from qdrant_loader.core.chunking.progress_tracker import ChunkingProgressTracker 

22from qdrant_loader.core.chunking.strategy.base_strategy import BaseChunkingStrategy 

23from qdrant_loader.core.document import Document 

24 

25logger = structlog.get_logger(__name__) 

# Performance constants - Universal limits for all code files
MAX_FILE_SIZE_FOR_AST = (
    75_000  # 75KB limit for AST parsing (balanced for all languages)
)
MAX_ELEMENTS_TO_PROCESS = 800  # Limit number of elements to prevent timeouts
CHUNK_SIZE_THRESHOLD = 40_000  # Files larger than this use simple chunking
MAX_RECURSION_DEPTH = 8  # Limit AST recursion depth
MAX_ELEMENT_SIZE = 20_000  # Skip individual elements larger than this
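
# Worked example of how these limits interact (figures are illustrative, not from
# the original source): a 50_000-byte Python file is under MAX_FILE_SIZE_FOR_AST
# (75_000) but over CHUNK_SIZE_THRESHOLD (40_000), so chunk_document() below falls
# back to line-based chunking before any AST parsing is attempted; a 30_000-byte
# file is parsed, and any single element whose text exceeds MAX_ELEMENT_SIZE
# (20_000 bytes) is skipped during extraction.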


class CodeElementType(Enum):
    """Types of code elements."""

    MODULE = "module"
    CLASS = "class"
    FUNCTION = "function"
    METHOD = "method"
    PROPERTY = "property"
    VARIABLE = "variable"
    IMPORT = "import"
    COMMENT = "comment"
    DOCSTRING = "docstring"
    DECORATOR = "decorator"
    CONSTANT = "constant"
    INTERFACE = "interface"
    ENUM = "enum"
    STRUCT = "struct"
    NAMESPACE = "namespace"
    PACKAGE = "package"


@dataclass
class CodeElement:
    """Represents a code element with its metadata."""

    name: str
    element_type: CodeElementType
    content: str
    start_line: int
    end_line: int
    level: int = 0
    parent: Optional["CodeElement"] = None
    children: list["CodeElement"] = field(default_factory=list)
    docstring: str | None = None
    decorators: list[str] = field(default_factory=list)
    parameters: list[str] = field(default_factory=list)
    return_type: str | None = None
    visibility: str = "public"  # public, private, protected
    is_async: bool = False
    is_static: bool = False
    is_abstract: bool = False
    complexity: int = 0  # Cyclomatic complexity
    dependencies: list[str] = field(default_factory=list)

    def add_child(self, child: "CodeElement"):
        """Add a child element."""
        self.children.append(child)
        child.parent = self
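
    # Illustrative sketch (not part of the original source): add_child() wires the
    # parent/child hierarchy that the metadata extraction relies on later.
    #
    #   cls = CodeElement(
    #       name="Greeter",
    #       element_type=CodeElementType.CLASS,
    #       content="class Greeter: ...",
    #       start_line=1,
    #       end_line=4,
    #   )
    #   method = CodeElement(
    #       name="greet",
    #       element_type=CodeElementType.METHOD,
    #       content="def greet(self): ...",
    #       start_line=2,
    #       end_line=3,
    #       level=1,
    #   )
    #   cls.add_child(method)
    #   assert method.parent is cls and cls.children == [method]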


class CodeChunkingStrategy(BaseChunkingStrategy):
    """Strategy for chunking code files based on programming language structure.

    This strategy uses AST parsing (primarily tree-sitter) to split code files into
    chunks based on semantic code elements, preserving the code structure and hierarchy.
    """

    def __init__(self, settings: Settings):
        """Initialize the code chunking strategy.

        Args:
            settings: Configuration settings
        """
        super().__init__(settings)
        self.logger = logger
        self.progress_tracker = ChunkingProgressTracker(logger)

        # Language detection patterns
        self.language_patterns = {
            ".py": "python",
            ".pyx": "python",
            ".pyi": "python",
            ".java": "java",
            ".js": "javascript",
            ".jsx": "javascript",
            ".mjs": "javascript",
            ".ts": "typescript",
            ".tsx": "typescript",
            ".go": "go",
            ".rs": "rust",
            ".cpp": "cpp",
            ".cc": "cpp",
            ".cxx": "cpp",
            ".c": "c",
            ".h": "c",
            ".cs": "c_sharp",
            ".php": "php",
            ".rb": "ruby",
            ".kt": "kotlin",
            ".scala": "scala",
            ".swift": "swift",
            ".dart": "dart",
        }

        # Cache for Tree-sitter parsers
        self._parsers = {}

        # Check tree-sitter availability
        if not TREE_SITTER_AVAILABLE:
            self.logger.warning("Tree-sitter not available, will use fallback parsing")

    def _detect_language(self, file_path: str, content: str) -> str:
        """Detect programming language from file extension.

        Args:
            file_path: Path to the file
            content: File content (for future content-based detection)

        Returns:
            Detected language name or "unknown"
        """
        # Get file extension
        ext = f".{file_path.lower().split('.')[-1]}" if "." in file_path else ""

        return self.language_patterns.get(ext, "unknown")
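
        # Examples (illustrative): the lookup is case-insensitive and keyed on the
        # text after the last dot, so:
        #   "src/Main.PY"       -> ".py" -> "python"
        #   "lib/utils.test.ts" -> ".ts" -> "typescript"
        #   "Makefile"          -> ""    -> "unknown"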

    def _get_tree_sitter_parser(self, language: str):
        """Get or create a Tree-sitter parser for the given language.

        Args:
            language: Tree-sitter language name

        Returns:
            Tree-sitter parser or None if not available
        """
        if not TREE_SITTER_AVAILABLE or get_parser is None:
            return None

        if language in self._parsers:
            return self._parsers[language]

        try:
            parser = get_parser(language)
            self._parsers[language] = parser
            return parser
        except Exception as e:
            self.logger.warning(f"Failed to get Tree-sitter parser for {language}: {e}")
            return None

    def _parse_with_tree_sitter(self, content: str, language: str) -> list[CodeElement]:
        """Parse code using Tree-sitter AST.

        Args:
            content: Source code content
            language: Programming language

        Returns:
            List of code elements
        """
        # Performance check: universal size limit for all languages
        if len(content) > MAX_FILE_SIZE_FOR_AST:
            self.logger.info(
                f"{language.title()} file too large for AST parsing ({len(content)} bytes), using fallback"
            )
            return []

        parser = self._get_tree_sitter_parser(language)
        if not parser:
            return []

        try:
            tree = parser.parse(content.encode("utf-8"))
            root_node = tree.root_node

            elements = []
            self._extract_ast_elements(root_node, content, elements, language)

            # Limit number of elements to prevent timeouts (universal limit)
            if len(elements) > MAX_ELEMENTS_TO_PROCESS:
                self.logger.warning(
                    f"Too many {language} elements ({len(elements)}), truncating to {MAX_ELEMENTS_TO_PROCESS}"
                )
                elements = elements[:MAX_ELEMENTS_TO_PROCESS]

            return elements

        except Exception as e:
            self.logger.warning(f"Failed to parse with Tree-sitter for {language}: {e}")
            return []

    def _extract_ast_elements(
        self,
        node,
        content: str,
        elements: list[CodeElement],
        language: str,
        level: int = 0,
    ):
        """Extract code elements from Tree-sitter AST node.

        Args:
            node: Tree-sitter AST node
            content: Source code content
            elements: List to append elements to
            language: Programming language
            level: Nesting level
        """
        # Performance check: limit recursion depth
        if level > MAX_RECURSION_DEPTH:  # Prevent deep recursion
            return

        # Performance check: limit total elements (universal limit)
        if len(elements) >= MAX_ELEMENTS_TO_PROCESS:
            return

        # Define node types that represent code elements for different languages
        element_mappings = {
            "python": {
                "function_definition": CodeElementType.FUNCTION,
                "async_function_definition": CodeElementType.FUNCTION,
                "class_definition": CodeElementType.CLASS,
                "import_statement": CodeElementType.IMPORT,
                "import_from_statement": CodeElementType.IMPORT,
            },
            "java": {
                "method_declaration": CodeElementType.METHOD,
                "constructor_declaration": CodeElementType.METHOD,
                "class_declaration": CodeElementType.CLASS,
                "interface_declaration": CodeElementType.INTERFACE,
                "import_declaration": CodeElementType.IMPORT,
            },
            "javascript": {
                "function_declaration": CodeElementType.FUNCTION,
                "method_definition": CodeElementType.METHOD,
                "class_declaration": CodeElementType.CLASS,
                "import_statement": CodeElementType.IMPORT,
                "variable_declaration": CodeElementType.VARIABLE,
            },
            "typescript": {
                "function_declaration": CodeElementType.FUNCTION,
                "method_definition": CodeElementType.METHOD,
                "class_declaration": CodeElementType.CLASS,
                "interface_declaration": CodeElementType.INTERFACE,
                "import_statement": CodeElementType.IMPORT,
            },
            "go": {
                "function_declaration": CodeElementType.FUNCTION,
                "method_declaration": CodeElementType.METHOD,
                "type_declaration": CodeElementType.STRUCT,
                "import_declaration": CodeElementType.IMPORT,
            },
            "rust": {
                "function_item": CodeElementType.FUNCTION,
                "impl_item": CodeElementType.CLASS,
                "struct_item": CodeElementType.STRUCT,
                "enum_item": CodeElementType.ENUM,
                "trait_item": CodeElementType.INTERFACE,
                "use_declaration": CodeElementType.IMPORT,
            },
            "cpp": {
                "function_definition": CodeElementType.FUNCTION,
                "class_specifier": CodeElementType.CLASS,
                "struct_specifier": CodeElementType.STRUCT,
                "namespace_definition": CodeElementType.NAMESPACE,
                "preproc_include": CodeElementType.IMPORT,
            },
            "c": {
                "function_definition": CodeElementType.FUNCTION,
                "struct_specifier": CodeElementType.STRUCT,
                "preproc_include": CodeElementType.IMPORT,
            },
        }

        # Get element types for this language
        lang_elements = element_mappings.get(language, {})

        # Check if this node represents a code element
        if node.type in lang_elements:
            element_type = lang_elements[node.type]

            # Extract element name
            name = self._extract_element_name(node, language)

            # Get node text
            start_byte = node.start_byte
            end_byte = node.end_byte
            element_content = content[start_byte:end_byte]

            # Skip very large elements to prevent timeouts (universal limit)
            if len(element_content) > MAX_ELEMENT_SIZE:
                self.logger.debug(
                    f"Skipping large {language} element {name} ({len(element_content)} bytes)"
                )
                return

            # Create code element
            element = CodeElement(
                name=name,
                element_type=element_type,
                content=element_content,
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                level=level,
            )

            # Extract additional metadata (simplified for performance)
            if element.element_type in [
                CodeElementType.FUNCTION,
                CodeElementType.METHOD,
            ]:
                params_node = node.child_by_field_name("parameters")
                if params_node:
                    element.parameters = self._extract_parameters_from_node(params_node)

            elements.append(element)

            # Process children with increased level (limited depth)
            if level < MAX_RECURSION_DEPTH - 3:  # Leave room for deeper nesting
                for child in node.children:
                    self._extract_ast_elements(
                        child, content, elements, language, level + 1
                    )
        else:
            # Process children at same level (limited depth)
            if level < MAX_RECURSION_DEPTH:  # Use full depth limit
                for child in node.children:
                    self._extract_ast_elements(
                        child, content, elements, language, level
                    )
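
        # Illustrative example based on the mappings above: a small JavaScript file
        # such as
        #
        #   import { x } from "./x";
        #   class Foo {
        #       bar() { return x; }
        #   }
        #
        # would yield an IMPORT element (import_statement), a CLASS element
        # (class_declaration) and a METHOD element (method_definition), each
        # carrying its own source slice and 1-based start/end lines.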

    def _extract_element_name(self, node, language: str) -> str:
        """Extract the name of a code element from Tree-sitter node.

        Args:
            node: Tree-sitter AST node
            language: Programming language

        Returns:
            Element name or "unknown"
        """
        # Common patterns for finding names in different node types
        name_fields = ["name", "identifier", "field_identifier"]

        for field_name in name_fields:
            name_node = node.child_by_field_name(field_name)
            if name_node:
                return name_node.text.decode("utf-8")

        # Fallback: look for identifier children (limited search)
        for i, child in enumerate(node.children):
            if i > 5:  # Limit search to first few children
                break
            if child.type == "identifier":
                return child.text.decode("utf-8")

        return "unknown"

    def _extract_parameters_from_node(self, params_node) -> list[str]:
        """Extract parameter names from a parameters node.

        Args:
            params_node: Tree-sitter parameters node

        Returns:
            List of parameter names
        """
        parameters = []
        for i, child in enumerate(params_node.children):
            if i > 20:  # Limit to prevent timeouts
                break
            if child.type in ["identifier", "parameter", "typed_parameter"]:
                if child.type == "identifier":
                    parameters.append(child.text.decode("utf-8"))
                else:
                    # Look for identifier within parameter
                    for subchild in child.children:
                        if subchild.type == "identifier":
                            parameters.append(subchild.text.decode("utf-8"))
                            break
        return parameters

    def _parse_python_ast(self, content: str) -> list[CodeElement]:
        """Parse Python code using Python's built-in AST as fallback.

        Args:
            content: Python source code

        Returns:
            List of code elements
        """
        # Performance check: skip AST parsing for very large files
        if len(content) > MAX_FILE_SIZE_FOR_AST:
            self.logger.info(
                f"Python file too large for AST parsing ({len(content)} bytes)"
            )
            return []

        elements = []

        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            self.logger.warning(f"Failed to parse Python AST: {e}")
            return []

        def extract_docstring(node) -> str | None:
            """Extract docstring from a node."""
            if (
                isinstance(node, ast.FunctionDef | ast.ClassDef | ast.AsyncFunctionDef)
                and node.body
                and isinstance(node.body[0], ast.Expr)
                and isinstance(node.body[0].value, ast.Constant)
                and isinstance(node.body[0].value.value, str)
            ):
                return node.body[0].value.value
            return None

        def get_decorators(node) -> list[str]:
            """Extract decorator names from a node."""
            decorators = []
            if hasattr(node, "decorator_list"):
                for decorator in node.decorator_list[:5]:  # Limit decorators
                    if isinstance(decorator, ast.Name):
                        decorators.append(decorator.id)
                    elif isinstance(decorator, ast.Attribute):
                        decorators.append(f"{decorator.attr}")
            return decorators

        def get_parameters(node) -> list[str]:
            """Extract parameter names from a function node."""
            if not isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef):
                return []

            params = []
            for arg in node.args.args[:20]:  # Limit parameters
                params.append(arg.arg)
            return params

        def visit_node(node, level=0, parent_element=None):
            """Recursively visit AST nodes."""
            # Performance checks
            if level > MAX_RECURSION_DEPTH:  # Limit recursion depth
                return
            if len(elements) >= MAX_ELEMENTS_TO_PROCESS:
                return

            element = None

            if isinstance(node, ast.ClassDef):
                element = CodeElement(
                    name=node.name,
                    element_type=CodeElementType.CLASS,
                    content=ast.get_source_segment(content, node) or "",
                    start_line=node.lineno,
                    end_line=node.end_lineno or node.lineno,
                    level=level,
                    docstring=extract_docstring(node),
                    decorators=get_decorators(node),
                )

            elif isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef):
                element_type = (
                    CodeElementType.METHOD if level > 0 else CodeElementType.FUNCTION
                )
                element = CodeElement(
                    name=node.name,
                    element_type=element_type,
                    content=ast.get_source_segment(content, node) or "",
                    start_line=node.lineno,
                    end_line=node.end_lineno or node.lineno,
                    level=level,
                    docstring=extract_docstring(node),
                    decorators=get_decorators(node),
                    parameters=get_parameters(node),
                    is_async=isinstance(node, ast.AsyncFunctionDef),
                )

            elif isinstance(node, ast.Import | ast.ImportFrom):
                import_names = []
                if isinstance(node, ast.Import):
                    import_names = [
                        alias.name for alias in node.names[:10]
                    ]  # Limit imports
                else:
                    module = node.module or ""
                    import_names = [
                        f"{module}.{alias.name}" for alias in node.names[:10]
                    ]

                element = CodeElement(
                    name=", ".join(import_names),
                    element_type=CodeElementType.IMPORT,
                    content=ast.get_source_segment(content, node) or "",
                    start_line=node.lineno,
                    end_line=node.end_lineno or node.lineno,
                    level=level,
                    dependencies=import_names,
                )

            if element:
                # Skip very large elements
                if len(element.content) > MAX_ELEMENT_SIZE:
                    return

                if parent_element:
                    parent_element.add_child(element)
                else:
                    elements.append(element)

                # Recursively process children (limited depth)
                if level < MAX_RECURSION_DEPTH - 3:  # Leave room for deeper nesting
                    for child in ast.iter_child_nodes(node):
                        visit_node(child, level + 1, element)
            else:
                # For nodes we don't handle, still process their children (limited depth)
                if level < MAX_RECURSION_DEPTH:  # Use full depth limit
                    for child in ast.iter_child_nodes(node):
                        visit_node(child, level, parent_element)

        # Start processing from the root
        for node in ast.iter_child_nodes(tree):
            visit_node(node)

        return elements
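
        # Illustrative example: for a module containing
        #
        #   import os
        #
        #   class Greeter:
        #       """Say hello."""
        #
        #       def greet(self, name):
        #           return f"hello {name}"
        #
        # the returned top-level list is [IMPORT("os"), CLASS("Greeter")]; greet is
        # attached to the class via add_child() as a METHOD (it is visited at
        # level > 0) rather than appearing as a separate top-level element.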

    def _extract_code_metadata(
        self, element: CodeElement, language: str
    ) -> dict[str, Any]:
        """Extract metadata from a code element.

        Args:
            element: The code element to analyze
            language: Programming language

        Returns:
            Dictionary containing element metadata
        """
        metadata = {
            "element_type": element.element_type.value,
            "name": element.name,
            "language": language,
            "start_line": element.start_line,
            "end_line": element.end_line,
            "line_count": element.end_line - element.start_line + 1,
            "level": element.level,
            "visibility": element.visibility,
            "is_async": element.is_async,
            "is_static": element.is_static,
            "is_abstract": element.is_abstract,
            "complexity": element.complexity,
            "has_docstring": element.docstring is not None,
            "docstring_length": len(element.docstring) if element.docstring else 0,
            "parameter_count": len(element.parameters),
            "decorator_count": len(element.decorators),
            "child_count": len(element.children),
            "dependency_count": len(element.dependencies),
        }

        # Add specific metadata based on element type
        if element.element_type in [CodeElementType.FUNCTION, CodeElementType.METHOD]:
            metadata.update(
                {
                    "parameters": element.parameters,
                    "return_type": element.return_type,
                    "decorators": element.decorators,
                }
            )

        if element.element_type == CodeElementType.CLASS:
            metadata.update(
                {
                    "method_count": len(
                        [
                            c
                            for c in element.children
                            if c.element_type == CodeElementType.METHOD
                        ]
                    ),
                    "property_count": len(
                        [
                            c
                            for c in element.children
                            if c.element_type == CodeElementType.PROPERTY
                        ]
                    ),
                }
            )

        if element.element_type == CodeElementType.IMPORT:
            metadata.update({"dependencies": element.dependencies})

        # Add parent context
        if element.parent:
            metadata.update(
                {
                    "parent_name": element.parent.name,
                    "parent_type": element.parent.element_type.value,
                    "parent_level": element.parent.level,
                }
            )

        return metadata
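
        # Illustrative example: for a FUNCTION element named "parse_config" spanning
        # lines 10-14, with parameters ["path", "strict"] and a docstring, the
        # returned dict includes (among other keys):
        #   {"element_type": "function", "name": "parse_config",
        #    "language": "python", "start_line": 10, "end_line": 14,
        #    "line_count": 5, "has_docstring": True, "parameter_count": 2,
        #    "parameters": ["path", "strict"], "return_type": None,
        #    "decorators": []}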

    def _merge_small_elements(
        self, elements: list[CodeElement], min_size: int = 200
    ) -> list[CodeElement]:
        """Merge small code elements to create more meaningful chunks.

        Args:
            elements: List of code elements
            min_size: Minimum size for standalone elements

        Returns:
            List of merged elements
        """
        if not elements:
            return []

        merged = []
        current_group = []
        current_size = 0

        for element in elements:
            element_size = len(element.content)

            # If element is large enough or is a significant code structure, keep it separate
            if (
                element_size >= min_size
                or element.element_type
                in [CodeElementType.CLASS, CodeElementType.FUNCTION]
                or (
                    element.element_type == CodeElementType.METHOD
                    and element_size > 100
                )
            ):
                # First, add any accumulated small elements
                if current_group:
                    merged_element = self._create_merged_element(current_group)
                    merged.append(merged_element)
                    current_group = []
                    current_size = 0

                # Add the large element
                merged.append(element)
            else:
                # Accumulate small elements
                current_group.append(element)
                current_size += element_size

                # If accumulated size is large enough, create a merged element
                if current_size >= min_size:
                    merged_element = self._create_merged_element(current_group)
                    merged.append(merged_element)
                    current_group = []
                    current_size = 0

        # Handle remaining small elements
        if current_group:
            merged_element = self._create_merged_element(current_group)
            merged.append(merged_element)

        return merged
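
        # Illustrative example: with the default min_size of 200, three consecutive
        # IMPORT elements of ~80 bytes each accumulate until the running total
        # reaches 240 >= 200 and are emitted as a single merged MODULE element,
        # whereas a 50-byte FUNCTION element is always kept as its own chunk
        # because functions count as significant structures regardless of size.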

    def _create_merged_element(self, elements: list[CodeElement]) -> CodeElement:
        """Create a merged element from a list of small elements.

        Args:
            elements: List of elements to merge

        Returns:
            Merged code element
        """
        if not elements:
            raise ValueError("Cannot merge empty list of elements")

        if len(elements) == 1:
            return elements[0]

        # Create merged element
        merged_content = "\n\n".join(element.content for element in elements)
        merged_names = [element.name for element in elements]

        merged_element = CodeElement(
            name=f"merged_({', '.join(merged_names[:3])}{'...' if len(merged_names) > 3 else ''})",
            element_type=CodeElementType.MODULE,  # Use module as generic container
            content=merged_content,
            start_line=elements[0].start_line,
            end_line=elements[-1].end_line,
            level=min(element.level for element in elements),
        )

        # Merge dependencies
        all_dependencies = []
        for element in elements:
            all_dependencies.extend(element.dependencies)
        merged_element.dependencies = list(set(all_dependencies))

        return merged_element

    def _split_text(self, content: str) -> list[dict[str, Any]]:
        """Split code content into chunks based on programming language structure.

        Args:
            content: The code content to split

        Returns:
            List of dictionaries with chunk content and metadata
        """
        # This method is required by the base class but not used in our implementation
        # We override chunk_document instead
        return [{"content": content, "metadata": {"element_type": "unknown"}}]

    def chunk_document(self, document: Document) -> list[Document]:
        """Chunk a code document using AST parsing.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        file_name = (
            document.metadata.get("file_name")
            or document.metadata.get("original_filename")
            or document.title
            or f"{document.source_type}:{document.source}"
        )

        # Start progress tracking
        self.progress_tracker.start_chunking(
            document.id,
            document.source,
            document.source_type,
            len(document.content),
            file_name,
        )

        try:
            # Detect language from file path first for language-specific optimizations
            file_path = document.metadata.get("file_name", "") or document.source
            language = self._detect_language(file_path, document.content)

            # Performance check: universal threshold for all code files
            if len(document.content) > CHUNK_SIZE_THRESHOLD:
                self.progress_tracker.log_fallback(
                    document.id,
                    f"Large {language} file ({len(document.content)} bytes)",
                )
                return self._fallback_chunking(document)

            self.logger.debug(f"Detected language: {language}")

            # Parse code structure using AST
            elements = []
            parsing_method = "unknown"

            if language == "python":
                # Try Python AST first for Python files
                self.logger.debug("Parsing Python with built-in AST")
                elements = self._parse_python_ast(document.content)
                parsing_method = "python_ast"

                # Fallback to tree-sitter if Python AST fails
                if not elements and TREE_SITTER_AVAILABLE:
                    self.logger.debug("Falling back to Tree-sitter for Python")
                    elements = self._parse_with_tree_sitter(document.content, language)
                    parsing_method = "tree_sitter"
            elif language != "unknown" and TREE_SITTER_AVAILABLE:
                # Use tree-sitter for other supported languages
                self.logger.debug(f"Parsing {language} with Tree-sitter")
                elements = self._parse_with_tree_sitter(document.content, language)
                parsing_method = "tree_sitter"

            if not elements:
                self.progress_tracker.log_fallback(
                    document.id, f"No {language} elements found"
                )
                return self._fallback_chunking(document)

            # Merge small elements to optimize chunk size
            final_elements = self._merge_small_elements(elements)
            if len(final_elements) > 100:  # Limit total chunks
                final_elements = final_elements[:100]

            # Create chunked documents
            chunked_docs = []
            for i, element in enumerate(final_elements):
                self.logger.debug(
                    f"Processing element {i+1}/{len(final_elements)}",
                    extra={
                        "element_name": element.name,
                        "element_type": element.element_type.value,
                        "content_size": len(element.content),
                    },
                )

                # Create chunk document with optimized metadata processing
                chunk_doc = self._create_chunk_document(
                    original_doc=document,
                    chunk_content=element.content,
                    chunk_index=i,
                    total_chunks=len(final_elements),
                    skip_nlp=False,
                )

                # Add code-specific metadata
                code_metadata = self._extract_code_metadata(element, language)
                code_metadata["parsing_method"] = parsing_method
                code_metadata["chunking_strategy"] = "code"
                code_metadata["parent_document_id"] = document.id
                chunk_doc.metadata.update(code_metadata)

                chunked_docs.append(chunk_doc)

            # Finish progress tracking
            self.progress_tracker.finish_chunking(
                document.id, len(chunked_docs), f"code ({language})"
            )
            return chunked_docs

        except Exception as e:
            self.progress_tracker.log_error(document.id, str(e))
            # Fallback to default chunking
            self.progress_tracker.log_fallback(
                document.id, f"Code parsing failed: {str(e)}"
            )
            return self._fallback_chunking(document)
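
    # Illustrative usage sketch (how the settings and documents are constructed is
    # application-specific and assumed here, not defined by this module):
    #
    #   strategy = CodeChunkingStrategy(settings)
    #   chunks = strategy.chunk_document(document)
    #   for chunk in chunks:
    #       print(chunk.metadata["element_type"], chunk.metadata["name"])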

    def _fallback_chunking(self, document: Document) -> list[Document]:
        """Fallback to simple text-based chunking when AST parsing fails.

        Args:
            document: Document to chunk

        Returns:
            List of chunked documents
        """
        self.logger.warning("Falling back to simple text chunking for code document")

        # Use simple line-based splitting for code (optimized)
        lines = document.content.split("\n")
        chunks = []
        current_chunk = []
        current_size = 0

        for line in lines:
            line_size = len(line) + 1  # +1 for newline

            if current_size + line_size > self.chunk_size and current_chunk:
                chunks.append("\n".join(current_chunk))
                current_chunk = [line]
                current_size = line_size
            else:
                current_chunk.append(line)
                current_size += line_size

        # Add remaining lines
        if current_chunk:
            chunks.append("\n".join(current_chunk))

        # Create chunk documents (limited)
        chunked_docs = []
        for i, chunk_content in enumerate(chunks[:50]):  # Limit chunks
            chunk_doc = self._create_chunk_document(
                original_doc=document,
                chunk_content=chunk_content,
                chunk_index=i,
                total_chunks=len(chunks),
            )

            chunk_doc.id = Document.generate_chunk_id(document.id, i)
            chunk_doc.metadata["parent_document_id"] = document.id
            chunk_doc.metadata["chunking_method"] = "fallback_text"

            chunked_docs.append(chunk_doc)

        return chunked_docs
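
# Illustrative arithmetic for the fallback path above (the real chunk_size comes
# from the configured settings; 1_000 is used here only as an example figure): a
# 2_500-character file made of ~100-character lines is packed greedily, so each
# fallback chunk holds about 9 lines and the document ends up as 3 chunks, each
# tagged with chunking_method="fallback_text".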