Coverage for src/qdrant_loader/core/chunking/strategy/code/code_metadata_extractor.py: 44%
126 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Code metadata extractor for enhanced programming language analysis."""
3from typing import Any
5import structlog
7from qdrant_loader.core.chunking.strategy.base.metadata_extractor import (
8 BaseMetadataExtractor,
9)
10from qdrant_loader.core.document import Document
12logger = structlog.get_logger(__name__)
class CodeMetadataExtractor(BaseMetadataExtractor):
    """Metadata extractor specialised for source-code documents.

    Most of the analysis is delegated to the helper functions exposed by
    ``qdrant_loader.core.chunking.strategy.code.metadata``. Those helpers are
    imported inside each method rather than at module level, exactly as the
    methods have always done.
    """

    def __init__(self, settings):
        """Store the settings and look up the optional code-strategy config.

        Args:
            settings: Configuration settings. Must expose
                ``global_config.chunking.strategies``.
        """
        self.settings = settings
        self.logger = logger

        # Code-specific configuration; ``None`` when the strategies object
        # has no ``code`` attribute.
        strategies = settings.global_config.chunking.strategies
        self.code_config = getattr(strategies, "code", None)

    def extract_hierarchical_metadata(
        self, content: str, chunk_metadata: dict[str, Any], document: Document
    ) -> dict[str, Any]:
        """Return a copy of ``chunk_metadata`` enriched with code analysis.

        Args:
            content: Code chunk content
            chunk_metadata: Existing chunk metadata (not modified)
            document: Original document (not read by this method)

        Returns:
            Enhanced metadata dictionary
        """
        enriched = chunk_metadata.copy()

        from qdrant_loader.core.chunking.strategy.code.metadata import (
            analyze_performance_patterns,
            analyze_security_patterns,
            build_dependency_graph,
            calculate_complexity_metrics,
            calculate_doc_coverage,
            calculate_maintainability_metrics,
            extract_language_specific_metadata,
            identify_code_patterns,
            identify_test_code,
        )

        # The analyzers run in this fixed order, one result per key.
        enriched["dependency_graph"] = build_dependency_graph(content)
        enriched["complexity_metrics"] = calculate_complexity_metrics(content)
        enriched["code_patterns"] = identify_code_patterns(content)
        enriched["documentation_coverage"] = calculate_doc_coverage(content)
        enriched["test_indicators"] = identify_test_code(content)
        enriched["security_indicators"] = analyze_security_patterns(content)
        enriched["performance_indicators"] = analyze_performance_patterns(content)
        enriched["maintainability_metrics"] = calculate_maintainability_metrics(
            content
        )
        enriched["content_type"] = "code"

        # Language-specific details are merged only for a known language.
        language = chunk_metadata.get("language", "unknown")
        if language == "unknown":
            return enriched

        enriched.update(extract_language_specific_metadata(content, language))
        return enriched

    def extract_entities(self, text: str) -> list[str]:
        """Extract code entities like class names, function names, variables.

        Delegates to ``extract_entities`` from the code metadata helpers.

        Args:
            text: Code text to analyze

        Returns:
            List of identified code entities
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            extract_entities as find_entities,
        )

        entities = find_entities(text)
        return entities

    def _build_dependency_graph(self, content: str) -> dict[str, list[str]]:
        """Build the dependency graph via ``build_dependency_graph``.

        Args:
            content: Code content

        Returns:
            Dictionary mapping modules/classes to their dependencies
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            build_dependency_graph,
        )

        graph = build_dependency_graph(content)
        return graph

    def _is_third_party_import(self, import_name: str) -> bool:
        """Report whether an import looks like a third-party library.

        Delegates to ``is_third_party_import``.

        Args:
            import_name: The import name to check

        Returns:
            True if it's likely a third-party import
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            is_third_party_import,
        )

        verdict = is_third_party_import(import_name)
        return verdict

    def _calculate_complexity_metrics(self, content: str) -> dict[str, Any]:
        """Compute complexity metrics via ``calculate_complexity_metrics``.

        Args:
            content: Code content

        Returns:
            Dictionary of complexity metrics
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            calculate_complexity_metrics,
        )

        metrics = calculate_complexity_metrics(content)
        return metrics

    def _calculate_maintainability_index(self, content: str) -> float:
        """Return the maintainability index (0-100 scale) for ``content``.

        Delegates to ``calculate_maintainability_index``.
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            calculate_maintainability_index,
        )

        index = calculate_maintainability_index(content)
        return index

    def _identify_code_patterns(self, content: str) -> dict[str, Any]:
        """Find common code patterns via ``identify_code_patterns``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            identify_code_patterns,
        )

        patterns = identify_code_patterns(content)
        return patterns

    def _calculate_doc_coverage(self, content: str) -> dict[str, Any]:
        """Compute documentation coverage via ``calculate_doc_coverage``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            calculate_doc_coverage,
        )

        coverage = calculate_doc_coverage(content)
        return coverage

    def _identify_test_code(self, content: str) -> dict[str, Any]:
        """Collect test-related indicators via ``identify_test_code``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            identify_test_code,
        )

        indicators = identify_test_code(content)
        return indicators

    def _analyze_security_patterns(self, content: str) -> dict[str, Any]:
        """Collect security indicators via ``analyze_security_patterns``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            analyze_security_patterns,
        )

        findings = analyze_security_patterns(content)
        return findings

    def _analyze_performance_patterns(self, content: str) -> dict[str, Any]:
        """Collect performance indicators via ``analyze_performance_patterns``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            analyze_performance_patterns,
        )

        findings = analyze_performance_patterns(content)
        return findings

    def _calculate_maintainability_metrics(self, content: str) -> dict[str, Any]:
        """Compute metrics via ``calculate_maintainability_metrics``."""
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            calculate_maintainability_metrics,
        )

        metrics = calculate_maintainability_metrics(content)
        return metrics

    def _extract_language_specific_metadata(
        self, content: str, language: str
    ) -> dict[str, Any]:
        """Extract metadata for one language.

        Delegates to ``extract_language_specific_metadata``.

        Args:
            content: Code content
            language: Programming language

        Returns:
            Language-specific metadata
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            extract_language_specific_metadata,
        )

        details = extract_language_specific_metadata(content, language)
        return details

    def _extract_python_metadata(self, content: str) -> dict[str, Any]:
        """Detect Python features in ``content`` with plain substring checks.

        Returns:
            Dictionary with ``python_features`` (feature names in a fixed
            order) and ``python_version_indicators`` (the result of
            ``detect_python_version_features``).
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            detect_python_version_features,
        )

        # (feature name, detected?) pairs, in reporting order. These are
        # purely textual heuristics; for example any ":" in the content counts
        # towards "type_hints".
        checks = (
            (
                "async_await",
                "async def" in content
                or ("async" in content and "await" in content),
            ),
            ("decorators", "@" in content),
            (
                "type_hints",
                any(token in content for token in ("typing", "Type", ":")),
            ),
            ("generators", "yield" in content),
            (
                "context_managers",
                "__enter__" in content and "__exit__" in content,
            ),
            ("dunder_methods", "__" in content),
            ("lambda_functions", "lambda" in content),
            (
                "dataclasses",
                any(token in content for token in ("dataclass", "@dataclass")),
            ),
        )
        features = [name for name, detected in checks if detected]

        return {
            "python_features": features,
            "python_version_indicators": detect_python_version_features(content),
        }

    def _extract_javascript_metadata(self, content: str) -> dict[str, Any]:
        """Detect JavaScript/TypeScript features with substring checks.

        Returns:
            Dictionary with a single ``javascript_features`` list.
        """
        # (feature name, detected?) pairs, in reporting order.
        checks = (
            ("async_await", "async" in content and "await" in content),
            ("arrow_functions", "=>" in content),
            ("es6_variables", "const" in content or "let" in content),
            ("es6_classes", "class" in content),
            ("es6_modules", "import" in content and "from" in content),
            ("template_literals", "${" in content),
            (
                "destructuring",
                "{" in content
                and "}" in content
                and ("=" in content or "const" in content),
            ),
            ("generators", "function*" in content or "yield" in content),
        )
        return {
            "javascript_features": [name for name, detected in checks if detected]
        }

    def _extract_java_metadata(self, content: str) -> dict[str, Any]:
        """Detect Java features with substring checks.

        Returns:
            Dictionary with a single ``language_features`` list.
        """
        # (feature name, detected?) pairs, in reporting order.
        checks = (
            ("interfaces", "interface" in content),
            ("inheritance", "extends" in content),
            ("interface_implementation", "implements" in content),
            ("thread_synchronization", "synchronized" in content),
            # ``and`` binds tighter than ``or``; the parentheses only make
            # the existing grouping explicit.
            (
                "generics",
                "generic" in content or ("<" in content and ">" in content),
            ),
            (
                "annotations",
                any(token in content for token in ("@Override", "@")),
            ),
        )
        return {"language_features": [name for name, detected in checks if detected]}

    def _extract_c_cpp_metadata(self, content: str) -> dict[str, Any]:
        """Detect C/C++ features with substring checks.

        Returns:
            Dictionary with a single ``language_features`` list.
        """
        # (feature name, detected?) pairs, in reporting order.
        checks = (
            ("header_includes", "#include" in content),
            (
                "manual_memory_management",
                "malloc" in content or "free" in content,
            ),
            ("pointer_usage", "pointer" in content or "->" in content),
            ("templates", "template" in content),
            ("namespaces", "namespace" in content),
            ("inline_functions", "inline" in content),
        )
        return {"language_features": [name for name, detected in checks if detected]}

    def _detect_python_version_features(self, content: str) -> list[str]:
        """Detect Python version-specific features.

        Delegates to ``detect_python_version_features``.
        """
        from qdrant_loader.core.chunking.strategy.code.metadata import (
            detect_python_version_features,
        )

        indicators = detect_python_version_features(content)
        return indicators