Coverage for src/qdrant_loader/core/chunking/strategy/code/code_metadata_extractor.py: 44%

126 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Code metadata extractor for enhanced programming language analysis.""" 

2 

3from typing import Any 

4 

5import structlog 

6 

7from qdrant_loader.core.chunking.strategy.base.metadata_extractor import ( 

8 BaseMetadataExtractor, 

9) 

10from qdrant_loader.core.document import Document 

11 

12logger = structlog.get_logger(__name__) 

13 

14 

15class CodeMetadataExtractor(BaseMetadataExtractor): 

16 """Enhanced metadata extractor for code documents.""" 

17 

def __init__(self, settings):
    """Set up the extractor from the application settings.

    Args:
        settings: Configuration settings. The object is stored as-is, and
            ``settings.global_config.chunking.strategies`` is read to look
            up an optional ``code`` entry.
    """
    self.settings = settings
    self.logger = logger

    # Code-specific configuration; stays None when the strategies object
    # has no "code" attribute.
    strategies = settings.global_config.chunking.strategies
    self.code_config = getattr(strategies, "code", None)

31 

def extract_hierarchical_metadata(
    self, content: str, chunk_metadata: dict[str, Any], document: Document
) -> dict[str, Any]:
    """Build an enriched copy of the chunk metadata for a code chunk.

    The incoming ``chunk_metadata`` is copied (shallow) and never mutated.
    Each analyzer from the ``code.metadata`` helpers is run on ``content``
    and stored under its own key, ``content_type`` is set to ``"code"``,
    and language-specific metadata is merged in when the chunk declares a
    language other than ``"unknown"``.

    Args:
        content: Code chunk content
        chunk_metadata: Existing chunk metadata
        document: Original document (not read by this method)

    Returns:
        Enhanced metadata dictionary
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        analyze_performance_patterns,
        analyze_security_patterns,
        build_dependency_graph,
        calculate_complexity_metrics,
        calculate_doc_coverage,
        calculate_maintainability_metrics,
        extract_language_specific_metadata,
        identify_code_patterns,
        identify_test_code,
    )

    # (metadata key, analyzer) pairs, applied in this order.
    analyzers = (
        ("dependency_graph", build_dependency_graph),
        ("complexity_metrics", calculate_complexity_metrics),
        ("code_patterns", identify_code_patterns),
        ("documentation_coverage", calculate_doc_coverage),
        ("test_indicators", identify_test_code),
        ("security_indicators", analyze_security_patterns),
        ("performance_indicators", analyze_performance_patterns),
        ("maintainability_metrics", calculate_maintainability_metrics),
    )

    enriched = chunk_metadata.copy()
    for key, analyze in analyzers:
        enriched[key] = analyze(content)
    enriched["content_type"] = "code"

    declared_language = chunk_metadata.get("language", "unknown")
    if declared_language == "unknown":
        return enriched

    # Language-specific keys are merged last, so they win on key collisions.
    enriched.update(extract_language_specific_metadata(content, declared_language))
    return enriched

78 

def extract_entities(self, text: str) -> list[str]:
    """Return the code entities found in ``text``.

    Delegates to ``extract_entities`` from the ``code.metadata`` helpers.

    Args:
        text: Code text to analyze

    Returns:
        List of identified code entities
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        extract_entities as _extract,
    )

    entities = _extract(text)
    return entities

91 

def _build_dependency_graph(self, content: str) -> dict[str, list[str]]:
    """Return the dependency graph for ``content``.

    Delegates to ``build_dependency_graph`` from the ``code.metadata``
    helpers.

    Args:
        content: Code content

    Returns:
        Dictionary mapping modules/classes to their dependencies
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        build_dependency_graph,
    )

    graph = build_dependency_graph(content)
    return graph

106 

def _is_third_party_import(self, import_name: str) -> bool:
    """Report whether ``import_name`` looks like a third-party library.

    Delegates to ``is_third_party_import`` from the ``code.metadata``
    helpers.

    Args:
        import_name: The import name to check

    Returns:
        True if it's likely a third-party import
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        is_third_party_import,
    )

    third_party = is_third_party_import(import_name)
    return third_party

121 

def _calculate_complexity_metrics(self, content: str) -> dict[str, Any]:
    """Return complexity metrics for ``content``.

    Delegates to ``calculate_complexity_metrics`` from the
    ``code.metadata`` helpers.

    Args:
        content: Code content

    Returns:
        Dictionary of complexity metrics
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        calculate_complexity_metrics,
    )

    complexity = calculate_complexity_metrics(content)
    return complexity

136 

def _calculate_maintainability_index(self, content: str) -> float:
    """Return the maintainability index (0-100 scale) for ``content``.

    Delegates to ``calculate_maintainability_index`` from the
    ``code.metadata`` helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        calculate_maintainability_index,
    )

    index = calculate_maintainability_index(content)
    return index

144 

def _identify_code_patterns(self, content: str) -> dict[str, Any]:
    """Return the common code patterns and design elements in ``content``.

    Delegates to ``identify_code_patterns`` from the ``code.metadata``
    helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        identify_code_patterns,
    )

    patterns = identify_code_patterns(content)
    return patterns

152 

def _calculate_doc_coverage(self, content: str) -> dict[str, Any]:
    """Return documentation coverage metrics for ``content``.

    Delegates to ``calculate_doc_coverage`` from the ``code.metadata``
    helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        calculate_doc_coverage,
    )

    coverage = calculate_doc_coverage(content)
    return coverage

160 

def _identify_test_code(self, content: str) -> dict[str, Any]:
    """Return test-related code indicators for ``content``.

    Delegates to ``identify_test_code`` from the ``code.metadata`` helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        identify_test_code,
    )

    indicators = identify_test_code(content)
    return indicators

168 

def _analyze_security_patterns(self, content: str) -> dict[str, Any]:
    """Return security-related pattern findings for ``content``.

    Delegates to ``analyze_security_patterns`` from the ``code.metadata``
    helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        analyze_security_patterns,
    )

    findings = analyze_security_patterns(content)
    return findings

176 

def _analyze_performance_patterns(self, content: str) -> dict[str, Any]:
    """Return performance-related pattern findings for ``content``.

    Delegates to ``analyze_performance_patterns`` from the
    ``code.metadata`` helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        analyze_performance_patterns,
    )

    findings = analyze_performance_patterns(content)
    return findings

184 

def _calculate_maintainability_metrics(self, content: str) -> dict[str, Any]:
    """Return maintainability-related metrics for ``content``.

    Delegates to ``calculate_maintainability_metrics`` from the
    ``code.metadata`` helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        calculate_maintainability_metrics,
    )

    maintainability = calculate_maintainability_metrics(content)
    return maintainability

192 

def _extract_language_specific_metadata(
    self, content: str, language: str
) -> dict[str, Any]:
    """Return metadata specific to the given programming language.

    Delegates to ``extract_language_specific_metadata`` from the
    ``code.metadata`` helpers.

    Args:
        content: Code content
        language: Programming language

    Returns:
        Language-specific metadata
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        extract_language_specific_metadata,
    )

    language_metadata = extract_language_specific_metadata(content, language)
    return language_metadata

210 

def _extract_python_metadata(self, content: str) -> dict[str, Any]:
    """Extract Python-specific metadata.

    Features are detected with textual heuristics rather than by parsing,
    so matches inside strings or comments still count. Keywords are matched
    as whole words, and decorators, type hints and dunder names are
    recognised by their syntax instead of by the mere presence of ``@``,
    ``:`` or ``__`` (a bare ``:`` occurs in nearly every Python snippet).

    Args:
        content: Python source text to inspect

    Returns:
        Dictionary with ``python_features`` (detected feature names, in a
        fixed order) and ``python_version_indicators`` (the result of
        ``detect_python_version_features`` for the same content)
    """
    import re

    from qdrant_loader.core.chunking.strategy.code.metadata import (
        detect_python_version_features as _ver,
    )

    def has(pattern: str) -> bool:
        return re.search(pattern, content, re.MULTILINE) is not None

    # ``name: Type`` inside the parameter list of a ``def``.
    param_annotation = r"\bdef\s+\w+\s*\([^)]*?\b[A-Za-z_]\w*\s*:\s*[A-Za-z_\"']"
    # ``name: Type`` or ``name: Type = value`` on its own line. The type may
    # contain spaces only after "," or around "|", which keeps docstring
    # lines such as "content: Code content" from matching. One-line
    # compound statements ("else: pass") are excluded by keyword.
    variable_annotation = (
        r"^[ \t]*(?!(?:else|try|finally|except)\b)[A-Za-z_][\w.]*[ \t]*:[ \t]*"
        r"(?=[A-Za-z_\"'])[^\s,|=#:]+(?:(?:,[ \t]*|[ \t]*\|[ \t]*)[^\s,|=#:]+)*"
        r"[ \t]*(?:=(?!=)|#|$)"
    )

    checks = (
        (
            "async_await",
            has(r"\basync\s+def\b") or (has(r"\basync\b") and has(r"\bawait\b")),
        ),
        # A decorator is an "@name" at the start of a line, which excludes
        # the matrix-multiplication operator and e-mail addresses.
        ("decorators", has(r"^[ \t]*@[A-Za-z_]")),
        (
            "type_hints",
            has(r"\btyping\b")
            or has(r"\bType\b")
            or has(r"\)\s*->")
            or has(param_annotation)
            or has(variable_annotation),
        ),
        ("generators", has(r"\byield\b")),
        ("context_managers", "__enter__" in content and "__exit__" in content),
        # Requires both leading and trailing double underscores, so
        # name-mangled "__private" attributes do not count.
        ("dunder_methods", has(r"\b__[A-Za-z]\w*__\b")),
        ("lambda_functions", has(r"\blambda\b")),
        ("dataclasses", has(r"\bdataclass(?:es)?\b")),
    )
    features = [name for name, present in checks if present]

    return {"python_features": features, "python_version_indicators": _ver(content)}

236 

237 def _extract_javascript_metadata(self, content: str) -> dict[str, Any]: 

238 """Extract JavaScript/TypeScript-specific metadata.""" 

239 features = [] 

240 if "async" in content and "await" in content: 

241 features.append("async_await") 

242 if "=>" in content: 

243 features.append("arrow_functions") 

244 if "const" in content or "let" in content: 

245 features.append("es6_variables") 

246 if "class" in content: 

247 features.append("es6_classes") 

248 if "import" in content and "from" in content: 

249 features.append("es6_modules") 

250 if "${" in content: 

251 features.append("template_literals") 

252 if "{" in content and "}" in content and ("=" in content or "const" in content): 

253 features.append("destructuring") 

254 if "function*" in content or "yield" in content: 

255 features.append("generators") 

256 return {"javascript_features": features} 

257 

258 def _extract_java_metadata(self, content: str) -> dict[str, Any]: 

259 """Extract Java-specific metadata.""" 

260 features = [] 

261 if "interface" in content: 

262 features.append("interfaces") 

263 if "extends" in content: 

264 features.append("inheritance") 

265 if "implements" in content: 

266 features.append("interface_implementation") 

267 if "synchronized" in content: 

268 features.append("thread_synchronization") 

269 if "generic" in content or "<" in content and ">" in content: 

270 features.append("generics") 

271 if "@Override" in content or "@" in content: 

272 features.append("annotations") 

273 return {"language_features": features} 

274 

275 def _extract_c_cpp_metadata(self, content: str) -> dict[str, Any]: 

276 """Extract C/C++-specific metadata.""" 

277 features = [] 

278 if "#include" in content: 

279 features.append("header_includes") 

280 if "malloc" in content or "free" in content: 

281 features.append("manual_memory_management") 

282 if "pointer" in content or "->" in content: 

283 features.append("pointer_usage") 

284 if "template" in content: 

285 features.append("templates") 

286 if "namespace" in content: 

287 features.append("namespaces") 

288 if "inline" in content: 

289 features.append("inline_functions") 

290 return {"language_features": features} 

291 

def _detect_python_version_features(self, content: str) -> list[str]:
    """Return the Python version-specific features found in ``content``.

    Delegates to ``detect_python_version_features`` from the
    ``code.metadata`` helpers.
    """
    from qdrant_loader.core.chunking.strategy.code.metadata import (
        detect_python_version_features,
    )

    version_features = detect_python_version_features(content)
    return version_features