Coverage for src/qdrant_loader/core/chunking/strategy/code/code_chunk_processor.py: 66% (96 statements)


1"""Code chunk processor for creating enhanced code chunk documents.""" 

2 

3from typing import Any 

4 

5import structlog 

6 

7from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor 

8from qdrant_loader.core.chunking.strategy.code.processor.analysis import ( 

9 analyze_code_content, 

10 extract_language_context, 

11) 

12from qdrant_loader.core.chunking.strategy.code.processor.quality import ( 

13 assess_code_quality, 

14 assess_educational_value, 

15 calculate_reusability_score, 

16) 

17from qdrant_loader.core.document import Document 

18 

19logger = structlog.get_logger(__name__) 

20 

21 

class CodeChunkProcessor(BaseChunkProcessor):
    """Chunk processor for code documents with programming language context."""

    def __init__(self, settings):
        super().__init__(settings)
        self.logger = logger
        self.code_config = getattr(
            settings.global_config.chunking.strategies, "code", None
        )
        self.max_chunk_size_for_nlp = getattr(
            self.code_config, "max_chunk_size_for_nlp", 20000
        )
        # Thresholds and patterns that mark a chunk as a poor candidate for
        # semantic (NLP) analysis; see should_skip_semantic_analysis below.
        self.skip_conditions = {
            "large_content": self.max_chunk_size_for_nlp,
            "binary_patterns": ["\x00", "\xff", "\xfe"],
            "minified_code_threshold": 0.1,
            "generated_code_patterns": [
                "auto-generated",
                "do not edit",
                "generated by",
            ],
        }

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        chunk_metadata: dict[str, Any],
        skip_nlp: bool = False,
    ) -> Document:
        """Create a chunk Document enriched with code-specific metadata."""
        chunk_id = self.generate_chunk_id(original_doc, chunk_index)
        base_metadata = self.create_base_chunk_metadata(
            original_doc, chunk_index, total_chunks, chunk_metadata
        )
        code_metadata = self._create_code_specific_metadata(
            chunk_content, chunk_metadata, original_doc
        )
        base_metadata.update(code_metadata)
        if not skip_nlp:
            skip_nlp, skip_reason = self.should_skip_semantic_analysis(
                chunk_content, chunk_metadata
            )
            if skip_nlp:
                base_metadata["nlp_skip_reason"] = skip_reason
        return Document(
            id=chunk_id,
            content=chunk_content,
            metadata=base_metadata,
            source=original_doc.source,
            source_type=original_doc.source_type,
            url=original_doc.url,
            content_type=original_doc.content_type,
            title=self._generate_chunk_title(original_doc, chunk_metadata, chunk_index),
        )

    def should_skip_semantic_analysis(
        self, chunk_content: str, chunk_metadata: dict[str, Any]
    ) -> tuple[bool, str]:
        """Decide whether a chunk should bypass semantic analysis.

        Returns (skip, reason); the checks cover oversized, binary, minified,
        generated, comment-heavy, trivial test, and configuration content.
        """

        content_length = len(chunk_content)
        if content_length > self.skip_conditions["large_content"]:
            return True, "content_too_large"
        if any(
            pattern in chunk_content
            for pattern in self.skip_conditions["binary_patterns"]
        ):
            return True, "binary_content"
        if self._is_minified_code(chunk_content):
            return True, "minified_code"
        if self._is_generated_code(chunk_content):
            return True, "generated_code"
        if self._is_mostly_comments(chunk_content):
            return True, "mostly_comments"
        if chunk_metadata.get("element_type") == "test" and content_length < 500:
            return True, "simple_test_code"
        if chunk_metadata.get("language") in ["json", "yaml", "xml", "ini"]:
            return True, "configuration_file"
        return False, "suitable_for_nlp"
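
    # Example: with the default 20000-character limit, a 25000-character
    # chunk is rejected with (True, "content_too_large") before any other
    # heuristic runs; the checks short-circuit in the order listed above.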

    def _create_code_specific_metadata(
        self, content: str, chunk_metadata: dict[str, Any], original_doc: Document
    ) -> dict[str, Any]:
        """Assemble analysis, quality, and context metadata for a code chunk."""

        return {
            "content_analysis": analyze_code_content(content),
            "language_context": extract_language_context(content, chunk_metadata),
            "code_quality": assess_code_quality(content, chunk_metadata),
            "educational_value": assess_educational_value(content, chunk_metadata),
            "reusability_score": calculate_reusability_score(content, chunk_metadata),
            "chunking_strategy": "code_modular",
            "element_context": (
                self._extract_element_context(
                    content, chunk_metadata.get("element_type", "unknown")
                )
                if chunk_metadata.get("element_type", "unknown") != "unknown"
                else None
            ),
        }

    def _generate_chunk_title(
        self, original_doc: Document, chunk_metadata: dict[str, Any], chunk_index: int
    ) -> str:
        base_title = original_doc.title
        element_name = chunk_metadata.get("element_name")
        element_type = chunk_metadata.get("element_type", "code")
        if element_name:
            return f"{base_title} — {element_type}: {element_name} (Part {chunk_index + 1})"
        if element_type and element_type != "code":
            return f"{base_title} — {element_type.title()} Part {chunk_index + 1}"
        return f"{base_title} — Code Chunk {chunk_index + 1}"
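
    # Example: a chunk with element_name="parse" and element_type="function"
    # at index 0 is titled "<base title> — function: parse (Part 1)".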

    # Local helpers (unchanged logic)
    def _is_minified_code(self, content: str) -> bool:
        lines = content.split("\n")
        non_empty = [line for line in lines if line.strip()]
        if not non_empty:
            return False
        avg_len = sum(len(line) for line in non_empty) / len(non_empty)
        specials = sum(
            1 for line in non_empty if any(ch in line for ch in ["{", "}", ";"])
        )
        ratio = specials / len(non_empty)
        return avg_len > 200 and ratio > self.skip_conditions["minified_code_threshold"]
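
    # Example: a single 300-character line containing ";" has an average line
    # length of 300 and a special-character ratio of 1.0, so it is treated as
    # minified (avg_len > 200 and ratio > 0.1).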

    def _is_generated_code(self, content: str) -> bool:
        lower = content.lower()
        return any(
            pat in lower for pat in self.skip_conditions["generated_code_patterns"]
        )

    def _is_mostly_comments(self, content: str) -> bool:
        lines = content.split("\n")
        if not lines:
            return False
        comment_lines = [
            line for line in lines if line.strip().startswith(("#", "//", "/*", "--"))
        ]
        return len(comment_lines) / len(lines) > 0.6
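
    # Example: a license header where four of five lines start with "#" has a
    # comment ratio of 0.8 > 0.6 and is skipped as "mostly_comments".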

    def _has_meaningful_names(self, content: str) -> bool:
        bad_names = ["tmp", "foo", "bar", "baz", "var", "data", "x", "y", "z"]
        return not any(f" {n} " in content for n in bad_names)
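
    # Example: "total = tmp + 1" contains " tmp " and fails the check, while
    # "total = count + 1" passes; only space-delimited occurrences of the
    # placeholder names are caught.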

    def _determine_learning_level(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> str:
        complexity = chunk_metadata.get("complexity", 0)
        if complexity < 2:
            return "beginner"
        if complexity < 6:
            return "intermediate"
        return "advanced"

    def _identify_programming_concepts(self, content: str) -> list[str]:
        concepts: list[str] = []
        lower = content.lower()
        for k in [
            "recursion",
            "memoization",
            "concurrency",
            "polymorphism",
            "inheritance",
        ]:
            if k in lower:
                concepts.append(k)
        return concepts

    def _extract_element_context(
        self, content: str, element_type: str
    ) -> dict[str, Any]:
        context = {"element_type": element_type}
        if element_type in ["function", "method"]:
            context["has_return_statement"] = "return" in content
            context["param_count_estimate"] = (
                content.split("(", 1)[-1].split(")")[0].count(",") + 1
                if "(" in content and ")" in content
                else 0
            )
        elif element_type == "class":
            context["has_init"] = "__init__" in content
            context["method_count_estimate"] = content.count("def ")
        return context
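

# Usage sketch (illustrative): exercises the skip heuristics and element
# context extraction with a hypothetical settings stub. Assumes
# BaseChunkProcessor.__init__ needs nothing beyond the settings object;
# a real run uses the project's actual Settings type instead of the
# SimpleNamespace stand-in below.
if __name__ == "__main__":
    from types import SimpleNamespace

    code_cfg = SimpleNamespace(max_chunk_size_for_nlp=20000)
    settings = SimpleNamespace(
        global_config=SimpleNamespace(
            chunking=SimpleNamespace(strategies=SimpleNamespace(code=code_cfg))
        )
    )
    processor = CodeChunkProcessor(settings)

    # A JSON chunk is skipped because of the configuration-language check.
    skip, reason = processor.should_skip_semantic_analysis(
        '{"key": "value"}', {"language": "json"}
    )
    print(skip, reason)  # -> True configuration_file

    # Element context for a small function: one comma in the parameter list
    # yields a param_count_estimate of 2.
    snippet = "def add(a, b):\n    return a + b"
    print(processor._extract_element_context(snippet, "function"))
    # -> {'element_type': 'function', 'has_return_statement': True,
    #     'param_count_estimate': 2}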