Coverage for src/qdrant_loader/core/chunking/strategy/code/code_metadata_extractor.py: 83%
318 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
1"""Code metadata extractor for enhanced programming language analysis."""
3import re
4from typing import Any
6import structlog
8from qdrant_loader.core.chunking.strategy.base.metadata_extractor import (
9 BaseMetadataExtractor,
10)
11from qdrant_loader.core.document import Document
13logger = structlog.get_logger(__name__)
16class CodeMetadataExtractor(BaseMetadataExtractor):
17 """Enhanced metadata extractor for code documents."""
19 def __init__(self, settings):
20 """Initialize the code metadata extractor.
22 Args:
23 settings: Configuration settings
24 """
25 self.settings = settings
26 self.logger = logger
28 # Code-specific configuration
29 self.code_config = getattr(
30 settings.global_config.chunking.strategies, "code", None
31 )
33 def extract_hierarchical_metadata(
34 self, content: str, chunk_metadata: dict[str, Any], document: Document
35 ) -> dict[str, Any]:
36 """Extract comprehensive code metadata from chunk content.
38 Args:
39 content: Code chunk content
40 chunk_metadata: Existing chunk metadata
41 document: Original document
43 Returns:
44 Enhanced metadata dictionary
45 """
46 metadata = chunk_metadata.copy()
48 # Add enhanced code analysis
49 metadata.update(
50 {
51 "dependency_graph": self._build_dependency_graph(content),
52 "complexity_metrics": self._calculate_complexity_metrics(content),
53 "code_patterns": self._identify_code_patterns(content),
54 "documentation_coverage": self._calculate_doc_coverage(content),
55 "test_indicators": self._identify_test_code(content),
56 "security_indicators": self._analyze_security_patterns(content),
57 "performance_indicators": self._analyze_performance_patterns(content),
58 "maintainability_metrics": self._calculate_maintainability_metrics(
59 content
60 ),
61 "content_type": "code",
62 }
63 )
65 # Language-specific analysis
66 language = chunk_metadata.get("language", "unknown")
67 if language != "unknown":
68 metadata.update(self._extract_language_specific_metadata(content, language))
70 return metadata
72 def extract_entities(self, text: str) -> list[str]:
73 """Extract code entities like class names, function names, variables.
75 Args:
76 text: Code text to analyze
78 Returns:
79 List of identified code entities
80 """
81 entities = []
83 # Extract class names
84 class_pattern = r"\b(?:class|interface|struct|enum)\s+([A-Z][a-zA-Z0-9_]*)"
85 entities.extend(re.findall(class_pattern, text))
87 # Extract function/method names
88 function_patterns = [
89 r"\bdef\s+([a-zA-Z_][a-zA-Z0-9_]*)", # Python
90 r"\bfunction\s+([a-zA-Z_][a-zA-Z0-9_]*)", # JavaScript
91 r"\b(?:public|private|protected)?\s*(?:static\s+)?[a-zA-Z_][a-zA-Z0-9_<>]*\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", # Java/C#
92 ]
94 for pattern in function_patterns:
95 entities.extend(re.findall(pattern, text))
97 # Extract constant names (usually uppercase)
98 constant_pattern = r"\b([A-Z][A-Z0-9_]{2,})\b"
99 entities.extend(re.findall(constant_pattern, text))
101 # Remove duplicates and return
102 return list(set(entities))
104 def _build_dependency_graph(self, content: str) -> dict[str, list[str]]:
105 """Build dependency graph for code.
107 Args:
108 content: Code content
110 Returns:
111 Dictionary mapping modules/classes to their dependencies
112 """
113 dependencies = {
114 "imports": [],
115 "internal_references": [],
116 "third_party_imports": [],
117 "stdlib_imports": [],
118 }
120 # Extract import statements
121 import_patterns = [
122 r"import\s+([a-zA-Z_][a-zA-Z0-9_.]*)", # Python: import module
123 r"from\s+([a-zA-Z_][a-zA-Z0-9_.]*)\s+import", # Python: from module import
124 r'#include\s*[<"]([^>"]+)[>"]', # C/C++
125 r'require\s*\([\'"]([^\'"]+)[\'"]\)', # Node.js
126 r'import\s+.*\s+from\s+[\'"]([^\'"]+)[\'"]', # ES6
127 ]
129 for pattern in import_patterns:
130 imports = re.findall(pattern, content)
131 dependencies["imports"].extend(imports)
133 # Python standard library modules (common ones)
134 python_stdlib = {
135 "os",
136 "sys",
137 "json",
138 "math",
139 "random",
140 "datetime",
141 "collections",
142 "itertools",
143 "functools",
144 "operator",
145 "re",
146 "urllib",
147 "http",
148 "pathlib",
149 "typing",
150 "dataclasses",
151 "abc",
152 "enum",
153 "logging",
154 "threading",
155 "multiprocessing",
156 "subprocess",
157 "socket",
158 "sqlite3",
159 "csv",
160 "pickle",
161 "gzip",
162 "zipfile",
163 "tarfile",
164 "shutil",
165 "tempfile",
166 }
168 # Identify external vs internal vs stdlib dependencies
169 for imp in dependencies["imports"]:
170 base_module = imp.split(".")[0]
171 if base_module in python_stdlib:
172 dependencies["stdlib_imports"].append(imp)
173 elif self._is_third_party_import(imp):
174 dependencies["third_party_imports"].append(imp)
175 else:
176 dependencies["internal_references"].append(imp)
178 return dependencies
180 def _is_third_party_import(self, import_name: str) -> bool:
181 """Determine if an import is a third-party library.
183 Args:
184 import_name: The import name to check
186 Returns:
187 True if it's likely a third-party import
188 """
189 base_module = import_name.split(".")[0].lower()
191 # Known third-party libraries
192 known_third_party = {
193 "requests",
194 "numpy",
195 "pandas",
196 "flask",
197 "django",
198 "fastapi",
199 "tensorflow",
200 "torch",
201 "pytorch",
202 "sklearn",
203 "scipy",
204 "matplotlib",
205 "seaborn",
206 "plotly",
207 "streamlit",
208 "dash",
209 "celery",
210 "redis",
211 "sqlalchemy",
212 "alembic",
213 "pydantic",
214 "marshmallow",
215 "click",
216 "typer",
217 "pytest",
218 "unittest2",
219 "mock",
220 "httpx",
221 "aiohttp",
222 "websockets",
223 "uvicorn",
224 "gunicorn",
225 "jinja2",
226 "mako",
227 "babel",
228 "pillow",
229 "opencv",
230 "cv2",
231 "boto3",
232 "azure",
233 "google",
234 }
236 if base_module in known_third_party:
237 return True
239 # Heuristics for third-party libraries:
240 # 1. Contains common third-party patterns
241 if any(pattern in base_module for pattern in ["lib", "client", "sdk", "api"]):
242 return True
244 # 2. Looks like a package name (contains underscores but not starting with _)
245 if "_" in base_module and not base_module.startswith("_"):
246 return True
248 # 3. Single lowercase word that's not obviously internal
249 if (
250 base_module.islower()
251 and not base_module.startswith("test")
252 and base_module not in ["main", "app", "config", "utils", "helpers"]
253 ):
254 return True
256 return False
258 def _calculate_complexity_metrics(self, content: str) -> dict[str, Any]:
259 """Calculate code complexity metrics.
261 Args:
262 content: Code content
264 Returns:
265 Dictionary of complexity metrics
266 """
267 lines = content.split("\n")
268 non_empty_lines = [line for line in lines if line.strip()]
270 # Cyclomatic complexity indicators
271 complexity_indicators = [
272 "if ",
273 "elif ",
274 "else:",
275 "while ",
276 "for ",
277 "try:",
278 "except:",
279 "case ",
280 "&&",
281 "||",
282 "?",
283 "and ",
284 "or ",
285 "switch",
286 ]
288 cyclomatic_complexity = 1 # Base complexity
289 for indicator in complexity_indicators:
290 cyclomatic_complexity += content.lower().count(indicator.lower())
292 # Nesting depth
293 max_nesting = 0
294 current_nesting = 0
296 for line in lines:
297 stripped = line.strip()
298 if any(
299 keyword in stripped
300 for keyword in ["if", "for", "while", "try", "def", "class"]
301 ):
302 current_nesting += 1
303 max_nesting = max(max_nesting, current_nesting)
304 elif stripped in ["end", "}"] or (
305 stripped.startswith("except") or stripped.startswith("finally")
306 ):
307 current_nesting = max(0, current_nesting - 1)
309 return {
310 "cyclomatic_complexity": cyclomatic_complexity,
311 "lines_of_code": len(non_empty_lines),
312 "total_lines": len(lines),
313 "nesting_depth": max_nesting,
314 "complexity_density": cyclomatic_complexity / max(len(non_empty_lines), 1),
315 "maintainability_index": self._calculate_maintainability_index(content),
316 }
318 def _calculate_maintainability_index(self, content: str) -> float:
319 """Calculate maintainability index (0-100 scale).
321 Args:
322 content: Code content
324 Returns:
325 Maintainability index score
326 """
327 import math
329 if not content.strip():
330 return 50 # Default for empty content
332 # Calculate lines of code and complexity
333 lines = content.split("\n")
334 non_empty_lines = [line for line in lines if line.strip()]
335 loc = len(non_empty_lines)
337 # Simple cyclomatic complexity
338 complexity_indicators = [
339 "if ",
340 "elif ",
341 "else:",
342 "while ",
343 "for ",
344 "try:",
345 "except:",
346 "case ",
347 ]
348 complexity = 1 # Base complexity
349 for indicator in complexity_indicators:
350 complexity += content.lower().count(indicator.lower())
352 # Simplified maintainability index calculation
353 # Based on Halstead metrics and cyclomatic complexity
355 # Count operators and operands (simplified)
356 operators = len(re.findall(r"[+\-*/=<>!&|%^~]", content))
357 operands = len(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", content))
359 # Avoid division by zero
360 if operands == 0:
361 halstead_volume = 0
362 else:
363 vocabulary = operators + operands
364 length = operators + operands
365 halstead_volume = length * math.log2(vocabulary) if vocabulary > 1 else 0
367 # Maintainability index formula (simplified)
368 if loc > 0 and halstead_volume > 0:
369 mi = (
370 171
371 - 5.2 * math.log(halstead_volume)
372 - 0.23 * complexity
373 - 16.2 * math.log(loc)
374 )
375 return max(0, min(100, mi))
377 return 50 # Default moderate maintainability
379 def _identify_code_patterns(self, content: str) -> dict[str, Any]:
380 """Identify common code patterns and design elements.
382 Args:
383 content: Code content
385 Returns:
386 Dictionary of identified patterns
387 """
388 patterns = {
389 "design_patterns": [],
390 "anti_patterns": [],
391 "best_practices": [],
392 "code_smells": [],
393 }
395 content_lower = content.lower()
397 # Design patterns
398 if "singleton" in content_lower or "__new__" in content:
399 patterns["design_patterns"].append("singleton")
400 if "factory" in content_lower and (
401 "create" in content_lower or "build" in content_lower
402 ):
403 patterns["design_patterns"].append("factory")
404 if "observer" in content_lower or "notify" in content_lower:
405 patterns["design_patterns"].append("observer")
406 if "strategy" in content_lower and "algorithm" in content_lower:
407 patterns["design_patterns"].append("strategy")
409 # Anti-patterns and code smells
410 if content.count("if ") > 5:
411 patterns["code_smells"].append("too_many_conditionals")
412 if len(content.split("\n")) > 100:
413 patterns["code_smells"].append("long_method")
414 if content.count("def ") > 20 or content.count("function ") > 20:
415 patterns["code_smells"].append("large_class")
416 if "global " in content:
417 patterns["anti_patterns"].append("global_variables")
419 # Best practices
420 if '"""' in content or "'''" in content:
421 patterns["best_practices"].append("documentation")
422 if "test_" in content or "Test" in content:
423 patterns["best_practices"].append("testing")
424 if any(keyword in content for keyword in ["typing", "Type", "Optional"]):
425 patterns["best_practices"].append("type_hints")
427 return patterns
429 def _calculate_doc_coverage(self, content: str) -> dict[str, Any]:
430 """Calculate documentation coverage metrics.
432 Args:
433 content: Code content
435 Returns:
436 Dictionary of documentation metrics
437 """
438 # Count functions and classes using more precise regex patterns
439 import re
441 # Match function definitions (def at start of line with optional whitespace)
442 function_count = len(re.findall(r"^\s*def\s+\w+", content, re.MULTILINE))
443 function_count += len(re.findall(r"^\s*function\s+\w+", content, re.MULTILINE))
445 # Match class definitions (class at start of line with optional whitespace)
446 class_count = len(re.findall(r"^\s*class\s+\w+", content, re.MULTILINE))
448 # Count docstrings
449 docstring_count = content.count('"""') // 2 + content.count("'''") // 2
451 # Count comments
452 comment_lines = len(
453 [
454 line
455 for line in content.split("\n")
456 if line.strip().startswith(("#", "//", "/*"))
457 ]
458 )
460 total_elements = function_count + class_count
461 doc_coverage = (
462 (docstring_count / total_elements * 100) if total_elements > 0 else 0
463 )
465 return {
466 "total_functions": function_count,
467 "total_classes": class_count,
468 "documented_elements": docstring_count,
469 "comment_lines": comment_lines,
470 "documentation_coverage_percent": doc_coverage,
471 "has_module_docstring": content.strip().startswith('"""')
472 or content.strip().startswith("'''"),
473 "avg_comment_density": (
474 comment_lines / len(content.split("\n")) if content else 0
475 ),
476 }
478 def _identify_test_code(self, content: str) -> dict[str, Any]:
479 """Identify test-related code indicators.
481 Args:
482 content: Code content
484 Returns:
485 Dictionary of test indicators
486 """
487 test_indicators = {
488 "is_test_file": False,
489 "test_framework": "none",
490 "test_count": 0,
491 "assertion_count": 0,
492 "mock_usage": False,
493 "fixture_usage": False,
494 }
496 content_lower = content.lower()
498 # Check if it's a test file
499 test_keywords = ["test_", "test", "spec", "unittest", "pytest"]
500 test_indicators["is_test_file"] = any(
501 keyword in content_lower for keyword in test_keywords
502 )
504 # Identify test framework
505 if "pytest" in content_lower or "@pytest" in content:
506 test_indicators["test_framework"] = "pytest"
507 elif "unittest" in content_lower:
508 test_indicators["test_framework"] = "unittest"
509 elif "jest" in content_lower or "describe(" in content:
510 test_indicators["test_framework"] = "jest"
511 elif "mocha" in content_lower:
512 test_indicators["test_framework"] = "mocha"
514 # Count tests and assertions
515 test_indicators["test_count"] = content.count("def test_") + content.count(
516 "it("
517 )
519 assertion_patterns = [
520 "assert ",
521 "assert(",
522 "expect(",
523 "should",
524 "assertEqual",
525 "assertTrue",
526 "pytest.raises",
527 "self.assert",
528 "with pytest.raises",
529 "raises(",
530 ]
531 test_indicators["assertion_count"] = sum(
532 content.count(pattern) for pattern in assertion_patterns
533 )
535 # Check for mocking and fixtures
536 test_indicators["mock_usage"] = any(
537 keyword in content_lower for keyword in ["mock", "stub", "spy", "patch"]
538 )
539 test_indicators["fixture_usage"] = any(
540 keyword in content_lower for keyword in ["fixture", "setup", "teardown"]
541 )
543 return test_indicators
545 def _analyze_security_patterns(self, content: str) -> dict[str, Any]:
546 """Analyze security-related patterns in code.
548 Args:
549 content: Code content
551 Returns:
552 Dictionary of security indicators
553 """
554 security_indicators = {
555 "potential_vulnerabilities": [],
556 "security_practices": [],
557 "sensitive_data_handling": [],
558 }
560 content_lower = content.lower()
562 # Potential vulnerabilities
563 if "eval(" in content_lower:
564 security_indicators["potential_vulnerabilities"].append("eval_usage")
565 if "exec(" in content_lower:
566 security_indicators["potential_vulnerabilities"].append("exec_usage")
567 if "sql" in content_lower and any(
568 keyword in content_lower for keyword in ["select", "insert", "update"]
569 ):
570 security_indicators["potential_vulnerabilities"].append("sql_queries")
571 if "password" in content_lower and "plain" in content_lower:
572 security_indicators["potential_vulnerabilities"].append(
573 "plaintext_password"
574 )
576 # Security practices
577 if any(
578 keyword in content_lower
579 for keyword in ["hash", "encrypt", "bcrypt", "pbkdf2"]
580 ):
581 security_indicators["security_practices"].append("password_hashing")
582 if any(keyword in content_lower for keyword in ["csrf", "xss", "sanitize"]):
583 security_indicators["security_practices"].append("web_security")
584 if "https" in content_lower:
585 security_indicators["security_practices"].append("secure_transport")
587 # Sensitive data patterns
588 if any(
589 keyword in content_lower
590 for keyword in ["api_key", "secret", "token", "credential"]
591 ):
592 security_indicators["sensitive_data_handling"].append("credentials")
593 if any(
594 keyword in content_lower
595 for keyword in ["email", "phone", "ssn", "credit_card"]
596 ):
597 security_indicators["sensitive_data_handling"].append("pii_data")
599 return security_indicators
601 def _analyze_performance_patterns(self, content: str) -> dict[str, Any]:
602 """Analyze performance-related patterns in code.
604 Args:
605 content: Code content
607 Returns:
608 Dictionary of performance indicators
609 """
610 performance_indicators = {
611 "optimization_patterns": [],
612 "potential_bottlenecks": [],
613 "resource_usage": [],
614 }
616 content_lower = content.lower()
618 # Optimization patterns
619 if any(keyword in content_lower for keyword in ["cache", "memoize", "lazy"]):
620 performance_indicators["optimization_patterns"].append("caching")
621 if "async" in content_lower or "await" in content_lower:
622 performance_indicators["optimization_patterns"].append("async_programming")
623 if any(
624 keyword in content_lower
625 for keyword in ["parallel", "concurrent", "threading"]
626 ):
627 performance_indicators["optimization_patterns"].append("concurrency")
629 # Potential bottlenecks
630 # Detect nested loops by looking for multiple for/while statements
631 total_loops = content.count("for ") + content.count("while ")
632 if total_loops >= 3: # 3+ loops likely indicates nesting
633 performance_indicators["potential_bottlenecks"].append("nested_loops")
635 # Detect recursion patterns (exclude the definition line itself)
636 lines = content.split("\n")
637 def_pattern = re.compile(
638 r"^\s*(?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\("
639 )
640 for idx, line in enumerate(lines):
641 match = def_pattern.match(line)
642 if not match:
643 continue
644 func_name = match.group(1)
645 # Count calls to the function name on lines other than the definition line
646 bare_call_regex = re.compile(r"\b" + re.escape(func_name) + r"\s*\(")
647 method_call_regex = re.compile(r"\." + re.escape(func_name) + r"\s*\(")
648 call_count = 0
649 for j, other_line in enumerate(lines):
650 if j == idx:
651 continue
652 if bare_call_regex.search(other_line) or method_call_regex.search(
653 other_line
654 ):
655 call_count += 1
656 if call_count > 0:
657 performance_indicators["potential_bottlenecks"].append("recursion")
658 break
660 if content.count("database") > 5 or content.count("query") > 5:
661 performance_indicators["potential_bottlenecks"].append("database_heavy")
662 if content.count("file") > 10 or content.count("read") > 10:
663 performance_indicators["potential_bottlenecks"].append("io_heavy")
665 # Resource usage
666 if any(
667 keyword in content_lower for keyword in ["memory", "buffer", "allocation"]
668 ):
669 performance_indicators["resource_usage"].append("memory_allocation")
670 if any(
671 keyword in content_lower for keyword in ["connection", "pool", "socket"]
672 ):
673 performance_indicators["resource_usage"].append("connection_management")
675 return performance_indicators
677 def _calculate_maintainability_metrics(self, content: str) -> dict[str, Any]:
678 """Calculate maintainability-related metrics.
680 Args:
681 content: Code content
683 Returns:
684 Dictionary of maintainability metrics
685 """
686 lines = content.split("\n")
687 non_empty_lines = [line for line in lines if line.strip()]
689 # Calculate various metrics
690 avg_line_length = sum(len(line) for line in lines) / len(lines) if lines else 0
691 max_line_length = max(len(line) for line in lines) if lines else 0
693 # Count long lines (> 120 characters)
694 long_lines = len([line for line in lines if len(line) > 120])
696 # Calculate code density (non-empty lines / total lines)
697 code_density = len(non_empty_lines) / len(lines) if lines else 0
699 # Estimate readability score based on various factors
700 readability_score = 100
701 if avg_line_length > 100:
702 readability_score -= 20
703 if max_line_length > 200:
704 readability_score -= 15
705 if long_lines > len(lines) * 0.3:
706 readability_score -= 25
707 if code_density < 0.5:
708 readability_score -= 10
710 return {
711 "avg_line_length": avg_line_length,
712 "max_line_length": max_line_length,
713 "long_lines_count": long_lines,
714 "code_density": code_density,
715 "readability_score": max(0, readability_score),
716 "estimated_read_time_minutes": len(non_empty_lines)
717 / 50, # ~50 lines per minute
718 }
720 def _extract_language_specific_metadata(
721 self, content: str, language: str
722 ) -> dict[str, Any]:
723 """Extract language-specific metadata.
725 Args:
726 content: Code content
727 language: Programming language
729 Returns:
730 Language-specific metadata
731 """
732 if language == "python":
733 return self._extract_python_metadata(content)
734 elif language in ["javascript", "typescript"]:
735 return self._extract_javascript_metadata(content)
736 elif language == "java":
737 return self._extract_java_metadata(content)
738 elif language in ["cpp", "c"]:
739 return self._extract_c_cpp_metadata(content)
740 else:
741 # Return empty dict for unknown languages
742 return {}
744 def _extract_python_metadata(self, content: str) -> dict[str, Any]:
745 """Extract Python-specific metadata."""
746 features = []
748 if "async def" in content or ("async" in content and "await" in content):
749 features.append("async_await")
750 if "@" in content:
751 features.append("decorators")
752 if "typing" in content or "Type" in content or ":" in content:
753 features.append("type_hints")
754 if "yield" in content:
755 features.append("generators")
756 if "__enter__" in content and "__exit__" in content:
757 features.append("context_managers")
758 if "__" in content:
759 features.append("dunder_methods")
760 if "lambda" in content:
761 features.append("lambda_functions")
762 if "dataclass" in content or "@dataclass" in content:
763 features.append("dataclasses")
765 return {
766 "python_features": features,
767 "python_version_indicators": self._detect_python_version_features(content),
768 }
770 def _extract_javascript_metadata(self, content: str) -> dict[str, Any]:
771 """Extract JavaScript/TypeScript-specific metadata."""
772 features = []
774 if "async" in content and "await" in content:
775 features.append("async_await")
776 if "=>" in content:
777 features.append("arrow_functions")
778 if "const" in content or "let" in content:
779 features.append("es6_variables")
780 if "class" in content:
781 features.append("es6_classes")
782 if "import" in content and "from" in content:
783 features.append("es6_modules")
784 if "${" in content:
785 features.append("template_literals")
786 if "{" in content and "}" in content and ("=" in content or "const" in content):
787 features.append("destructuring")
788 if "function*" in content or "yield" in content:
789 features.append("generators")
791 return {"javascript_features": features}
793 def _extract_java_metadata(self, content: str) -> dict[str, Any]:
794 """Extract Java-specific metadata."""
795 features = []
797 if "interface" in content:
798 features.append("interfaces")
799 if "extends" in content:
800 features.append("inheritance")
801 if "implements" in content:
802 features.append("interface_implementation")
803 if "synchronized" in content:
804 features.append("thread_synchronization")
805 if "generic" in content or "<" in content and ">" in content:
806 features.append("generics")
807 if "@Override" in content or "@" in content:
808 features.append("annotations")
810 return {"language_features": features}
812 def _extract_c_cpp_metadata(self, content: str) -> dict[str, Any]:
813 """Extract C/C++-specific metadata."""
814 features = []
816 if "#include" in content:
817 features.append("header_includes")
818 if "malloc" in content or "free" in content:
819 features.append("manual_memory_management")
820 if "pointer" in content or "->" in content:
821 features.append("pointer_usage")
822 if "template" in content:
823 features.append("templates")
824 if "namespace" in content:
825 features.append("namespaces")
826 if "inline" in content:
827 features.append("inline_functions")
829 return {"language_features": features}
831 def _detect_python_version_features(self, content: str) -> list[str]:
832 """Detect Python version-specific features."""
833 features = []
835 if ":=" in content:
836 features.append("walrus_operator_py38")
837 if "match " in content and "case " in content:
838 features.append("pattern_matching_py310")
839 if 'f"' in content or "f'" in content:
840 features.append("f_strings_py36")
841 if "pathlib" in content:
842 features.append("pathlib_py34")
843 if "dataclass" in content:
844 features.append("dataclasses_py37")
846 return features