Coverage for src/qdrant_loader/core/chunking/strategy/code/code_metadata_extractor.py: 83%

318 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""Code metadata extractor for enhanced programming language analysis.""" 

2 

3import re 

4from typing import Any 

5 

6import structlog 

7 

8from qdrant_loader.core.chunking.strategy.base.metadata_extractor import ( 

9 BaseMetadataExtractor, 

10) 

11from qdrant_loader.core.document import Document 

12 

13logger = structlog.get_logger(__name__) 

14 

15 

16class CodeMetadataExtractor(BaseMetadataExtractor): 

17 """Enhanced metadata extractor for code documents.""" 

18 

def __init__(self, settings):
    """Store the settings and resolve the optional code-chunking config.

    Args:
        settings: Configuration settings; must expose
            ``global_config.chunking.strategies``
    """
    self.settings = settings
    self.logger = logger

    # The "code" strategy section is optional; fall back to None when absent.
    strategies = settings.global_config.chunking.strategies
    self.code_config = getattr(strategies, "code", None)

32 

def extract_hierarchical_metadata(
    self, content: str, chunk_metadata: dict[str, Any], document: Document
) -> dict[str, Any]:
    """Return a copy of the chunk metadata enriched with code analysis.

    Each analyzer below receives the raw chunk text and contributes one
    top-level key. When the chunk metadata names a language other than
    "unknown", the language-specific keys are merged on top.

    Args:
        content: Code chunk content
        chunk_metadata: Existing chunk metadata (copied, not mutated)
        document: Original document (not read by this method)

    Returns:
        Enhanced metadata dictionary
    """
    enriched = chunk_metadata.copy()

    # (metadata key, analyzer) pairs, applied in this order.
    analyzers = (
        ("dependency_graph", self._build_dependency_graph),
        ("complexity_metrics", self._calculate_complexity_metrics),
        ("code_patterns", self._identify_code_patterns),
        ("documentation_coverage", self._calculate_doc_coverage),
        ("test_indicators", self._identify_test_code),
        ("security_indicators", self._analyze_security_patterns),
        ("performance_indicators", self._analyze_performance_patterns),
        ("maintainability_metrics", self._calculate_maintainability_metrics),
    )
    for key, analyze in analyzers:
        enriched[key] = analyze(content)
    enriched["content_type"] = "code"

    # Language-specific analysis only runs when a language was detected.
    language = chunk_metadata.get("language", "unknown")
    if language != "unknown":
        extra = self._extract_language_specific_metadata(content, language)
        enriched.update(extra)

    return enriched

71 

def extract_entities(self, text: str) -> list[str]:
    """Extract code entities like class names, function names, constants.

    Entities are returned without duplicates in first-seen order (pattern
    order first, then position in the text), so the result is the same on
    every run regardless of string hash randomization.

    Args:
        text: Code text to analyze

    Returns:
        List of unique identified code entities
    """
    entity_patterns = (
        # Type declarations (capitalized names only)
        r"\b(?:class|interface|struct|enum)\s+([A-Z][a-zA-Z0-9_]*)",
        # Python functions
        r"\bdef\s+([a-zA-Z_][a-zA-Z0-9_]*)",
        # JavaScript functions
        r"\bfunction\s+([a-zA-Z_][a-zA-Z0-9_]*)",
        # Java/C#-style "<type> name(" declarations
        r"\b(?:public|private|protected)?\s*(?:static\s+)?[a-zA-Z_][a-zA-Z0-9_<>]*\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(",
        # Constants (upper-case identifiers of three or more characters)
        r"\b([A-Z][A-Z0-9_]{2,})\b",
    )

    found: list[str] = []
    for pattern in entity_patterns:
        found.extend(re.findall(pattern, text))

    # dict preserves insertion order, giving a stable de-duplication.
    return list(dict.fromkeys(found))

103 

104 def _build_dependency_graph(self, content: str) -> dict[str, list[str]]: 

105 """Build dependency graph for code. 

106 

107 Args: 

108 content: Code content 

109 

110 Returns: 

111 Dictionary mapping modules/classes to their dependencies 

112 """ 

113 dependencies = { 

114 "imports": [], 

115 "internal_references": [], 

116 "third_party_imports": [], 

117 "stdlib_imports": [], 

118 } 

119 

120 # Extract import statements 

121 import_patterns = [ 

122 r"import\s+([a-zA-Z_][a-zA-Z0-9_.]*)", # Python: import module 

123 r"from\s+([a-zA-Z_][a-zA-Z0-9_.]*)\s+import", # Python: from module import 

124 r'#include\s*[<"]([^>"]+)[>"]', # C/C++ 

125 r'require\s*\([\'"]([^\'"]+)[\'"]\)', # Node.js 

126 r'import\s+.*\s+from\s+[\'"]([^\'"]+)[\'"]', # ES6 

127 ] 

128 

129 for pattern in import_patterns: 

130 imports = re.findall(pattern, content) 

131 dependencies["imports"].extend(imports) 

132 

133 # Python standard library modules (common ones) 

134 python_stdlib = { 

135 "os", 

136 "sys", 

137 "json", 

138 "math", 

139 "random", 

140 "datetime", 

141 "collections", 

142 "itertools", 

143 "functools", 

144 "operator", 

145 "re", 

146 "urllib", 

147 "http", 

148 "pathlib", 

149 "typing", 

150 "dataclasses", 

151 "abc", 

152 "enum", 

153 "logging", 

154 "threading", 

155 "multiprocessing", 

156 "subprocess", 

157 "socket", 

158 "sqlite3", 

159 "csv", 

160 "pickle", 

161 "gzip", 

162 "zipfile", 

163 "tarfile", 

164 "shutil", 

165 "tempfile", 

166 } 

167 

168 # Identify external vs internal vs stdlib dependencies 

169 for imp in dependencies["imports"]: 

170 base_module = imp.split(".")[0] 

171 if base_module in python_stdlib: 

172 dependencies["stdlib_imports"].append(imp) 

173 elif self._is_third_party_import(imp): 

174 dependencies["third_party_imports"].append(imp) 

175 else: 

176 dependencies["internal_references"].append(imp) 

177 

178 return dependencies 

179 

180 def _is_third_party_import(self, import_name: str) -> bool: 

181 """Determine if an import is a third-party library. 

182 

183 Args: 

184 import_name: The import name to check 

185 

186 Returns: 

187 True if it's likely a third-party import 

188 """ 

189 base_module = import_name.split(".")[0].lower() 

190 

191 # Known third-party libraries 

192 known_third_party = { 

193 "requests", 

194 "numpy", 

195 "pandas", 

196 "flask", 

197 "django", 

198 "fastapi", 

199 "tensorflow", 

200 "torch", 

201 "pytorch", 

202 "sklearn", 

203 "scipy", 

204 "matplotlib", 

205 "seaborn", 

206 "plotly", 

207 "streamlit", 

208 "dash", 

209 "celery", 

210 "redis", 

211 "sqlalchemy", 

212 "alembic", 

213 "pydantic", 

214 "marshmallow", 

215 "click", 

216 "typer", 

217 "pytest", 

218 "unittest2", 

219 "mock", 

220 "httpx", 

221 "aiohttp", 

222 "websockets", 

223 "uvicorn", 

224 "gunicorn", 

225 "jinja2", 

226 "mako", 

227 "babel", 

228 "pillow", 

229 "opencv", 

230 "cv2", 

231 "boto3", 

232 "azure", 

233 "google", 

234 } 

235 

236 if base_module in known_third_party: 

237 return True 

238 

239 # Heuristics for third-party libraries: 

240 # 1. Contains common third-party patterns 

241 if any(pattern in base_module for pattern in ["lib", "client", "sdk", "api"]): 

242 return True 

243 

244 # 2. Looks like a package name (contains underscores but not starting with _) 

245 if "_" in base_module and not base_module.startswith("_"): 

246 return True 

247 

248 # 3. Single lowercase word that's not obviously internal 

249 if ( 

250 base_module.islower() 

251 and not base_module.startswith("test") 

252 and base_module not in ["main", "app", "config", "utils", "helpers"] 

253 ): 

254 return True 

255 

256 return False 

257 

def _calculate_complexity_metrics(self, content: str) -> dict[str, Any]:
    """Estimate complexity metrics from plain-text heuristics.

    Cyclomatic complexity is approximated as one plus the number of
    case-insensitive occurrences of branching tokens. Nesting depth is a
    running counter bumped by lines containing block keywords and lowered
    by closing lines.

    Args:
        content: Code content

    Returns:
        Dictionary of complexity metrics
    """
    all_lines = content.split("\n")
    code_line_count = sum(1 for line in all_lines if line.strip())

    # Branching tokens (all lower-case; matched against lower-cased content).
    branch_tokens = (
        "if ",
        "elif ",
        "else:",
        "while ",
        "for ",
        "try:",
        "except:",
        "case ",
        "&&",
        "||",
        "?",
        "and ",
        "or ",
        "switch",
    )
    lowered = content.lower()
    cyclomatic = 1 + sum(lowered.count(token) for token in branch_tokens)

    # Nesting depth: substring match on block keywords, as a rough proxy.
    block_keywords = ("if", "for", "while", "try", "def", "class")
    depth = 0
    deepest = 0
    for raw_line in all_lines:
        text = raw_line.strip()
        if any(keyword in text for keyword in block_keywords):
            depth += 1
            if depth > deepest:
                deepest = depth
        elif text in ("end", "}") or text.startswith(("except", "finally")):
            depth = max(0, depth - 1)

    return {
        "cyclomatic_complexity": cyclomatic,
        "lines_of_code": code_line_count,
        "total_lines": len(all_lines),
        "nesting_depth": deepest,
        "complexity_density": cyclomatic / max(code_line_count, 1),
        "maintainability_index": self._calculate_maintainability_index(content),
    }

317 

318 def _calculate_maintainability_index(self, content: str) -> float: 

319 """Calculate maintainability index (0-100 scale). 

320 

321 Args: 

322 content: Code content 

323 

324 Returns: 

325 Maintainability index score 

326 """ 

327 import math 

328 

329 if not content.strip(): 

330 return 50 # Default for empty content 

331 

332 # Calculate lines of code and complexity 

333 lines = content.split("\n") 

334 non_empty_lines = [line for line in lines if line.strip()] 

335 loc = len(non_empty_lines) 

336 

337 # Simple cyclomatic complexity 

338 complexity_indicators = [ 

339 "if ", 

340 "elif ", 

341 "else:", 

342 "while ", 

343 "for ", 

344 "try:", 

345 "except:", 

346 "case ", 

347 ] 

348 complexity = 1 # Base complexity 

349 for indicator in complexity_indicators: 

350 complexity += content.lower().count(indicator.lower()) 

351 

352 # Simplified maintainability index calculation 

353 # Based on Halstead metrics and cyclomatic complexity 

354 

355 # Count operators and operands (simplified) 

356 operators = len(re.findall(r"[+\-*/=<>!&|%^~]", content)) 

357 operands = len(re.findall(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b", content)) 

358 

359 # Avoid division by zero 

360 if operands == 0: 

361 halstead_volume = 0 

362 else: 

363 vocabulary = operators + operands 

364 length = operators + operands 

365 halstead_volume = length * math.log2(vocabulary) if vocabulary > 1 else 0 

366 

367 # Maintainability index formula (simplified) 

368 if loc > 0 and halstead_volume > 0: 

369 mi = ( 

370 171 

371 - 5.2 * math.log(halstead_volume) 

372 - 0.23 * complexity 

373 - 16.2 * math.log(loc) 

374 ) 

375 return max(0, min(100, mi)) 

376 

377 return 50 # Default moderate maintainability 

378 

379 def _identify_code_patterns(self, content: str) -> dict[str, Any]: 

380 """Identify common code patterns and design elements. 

381 

382 Args: 

383 content: Code content 

384 

385 Returns: 

386 Dictionary of identified patterns 

387 """ 

388 patterns = { 

389 "design_patterns": [], 

390 "anti_patterns": [], 

391 "best_practices": [], 

392 "code_smells": [], 

393 } 

394 

395 content_lower = content.lower() 

396 

397 # Design patterns 

398 if "singleton" in content_lower or "__new__" in content: 

399 patterns["design_patterns"].append("singleton") 

400 if "factory" in content_lower and ( 

401 "create" in content_lower or "build" in content_lower 

402 ): 

403 patterns["design_patterns"].append("factory") 

404 if "observer" in content_lower or "notify" in content_lower: 

405 patterns["design_patterns"].append("observer") 

406 if "strategy" in content_lower and "algorithm" in content_lower: 

407 patterns["design_patterns"].append("strategy") 

408 

409 # Anti-patterns and code smells 

410 if content.count("if ") > 5: 

411 patterns["code_smells"].append("too_many_conditionals") 

412 if len(content.split("\n")) > 100: 

413 patterns["code_smells"].append("long_method") 

414 if content.count("def ") > 20 or content.count("function ") > 20: 

415 patterns["code_smells"].append("large_class") 

416 if "global " in content: 

417 patterns["anti_patterns"].append("global_variables") 

418 

419 # Best practices 

420 if '"""' in content or "'''" in content: 

421 patterns["best_practices"].append("documentation") 

422 if "test_" in content or "Test" in content: 

423 patterns["best_practices"].append("testing") 

424 if any(keyword in content for keyword in ["typing", "Type", "Optional"]): 

425 patterns["best_practices"].append("type_hints") 

426 

427 return patterns 

428 

429 def _calculate_doc_coverage(self, content: str) -> dict[str, Any]: 

430 """Calculate documentation coverage metrics. 

431 

432 Args: 

433 content: Code content 

434 

435 Returns: 

436 Dictionary of documentation metrics 

437 """ 

438 # Count functions and classes using more precise regex patterns 

439 import re 

440 

441 # Match function definitions (def at start of line with optional whitespace) 

442 function_count = len(re.findall(r"^\s*def\s+\w+", content, re.MULTILINE)) 

443 function_count += len(re.findall(r"^\s*function\s+\w+", content, re.MULTILINE)) 

444 

445 # Match class definitions (class at start of line with optional whitespace) 

446 class_count = len(re.findall(r"^\s*class\s+\w+", content, re.MULTILINE)) 

447 

448 # Count docstrings 

449 docstring_count = content.count('"""') // 2 + content.count("'''") // 2 

450 

451 # Count comments 

452 comment_lines = len( 

453 [ 

454 line 

455 for line in content.split("\n") 

456 if line.strip().startswith(("#", "//", "/*")) 

457 ] 

458 ) 

459 

460 total_elements = function_count + class_count 

461 doc_coverage = ( 

462 (docstring_count / total_elements * 100) if total_elements > 0 else 0 

463 ) 

464 

465 return { 

466 "total_functions": function_count, 

467 "total_classes": class_count, 

468 "documented_elements": docstring_count, 

469 "comment_lines": comment_lines, 

470 "documentation_coverage_percent": doc_coverage, 

471 "has_module_docstring": content.strip().startswith('"""') 

472 or content.strip().startswith("'''"), 

473 "avg_comment_density": ( 

474 comment_lines / len(content.split("\n")) if content else 0 

475 ), 

476 } 

477 

478 def _identify_test_code(self, content: str) -> dict[str, Any]: 

479 """Identify test-related code indicators. 

480 

481 Args: 

482 content: Code content 

483 

484 Returns: 

485 Dictionary of test indicators 

486 """ 

487 test_indicators = { 

488 "is_test_file": False, 

489 "test_framework": "none", 

490 "test_count": 0, 

491 "assertion_count": 0, 

492 "mock_usage": False, 

493 "fixture_usage": False, 

494 } 

495 

496 content_lower = content.lower() 

497 

498 # Check if it's a test file 

499 test_keywords = ["test_", "test", "spec", "unittest", "pytest"] 

500 test_indicators["is_test_file"] = any( 

501 keyword in content_lower for keyword in test_keywords 

502 ) 

503 

504 # Identify test framework 

505 if "pytest" in content_lower or "@pytest" in content: 

506 test_indicators["test_framework"] = "pytest" 

507 elif "unittest" in content_lower: 

508 test_indicators["test_framework"] = "unittest" 

509 elif "jest" in content_lower or "describe(" in content: 

510 test_indicators["test_framework"] = "jest" 

511 elif "mocha" in content_lower: 

512 test_indicators["test_framework"] = "mocha" 

513 

514 # Count tests and assertions 

515 test_indicators["test_count"] = content.count("def test_") + content.count( 

516 "it(" 

517 ) 

518 

519 assertion_patterns = [ 

520 "assert ", 

521 "assert(", 

522 "expect(", 

523 "should", 

524 "assertEqual", 

525 "assertTrue", 

526 "pytest.raises", 

527 "self.assert", 

528 "with pytest.raises", 

529 "raises(", 

530 ] 

531 test_indicators["assertion_count"] = sum( 

532 content.count(pattern) for pattern in assertion_patterns 

533 ) 

534 

535 # Check for mocking and fixtures 

536 test_indicators["mock_usage"] = any( 

537 keyword in content_lower for keyword in ["mock", "stub", "spy", "patch"] 

538 ) 

539 test_indicators["fixture_usage"] = any( 

540 keyword in content_lower for keyword in ["fixture", "setup", "teardown"] 

541 ) 

542 

543 return test_indicators 

544 

545 def _analyze_security_patterns(self, content: str) -> dict[str, Any]: 

546 """Analyze security-related patterns in code. 

547 

548 Args: 

549 content: Code content 

550 

551 Returns: 

552 Dictionary of security indicators 

553 """ 

554 security_indicators = { 

555 "potential_vulnerabilities": [], 

556 "security_practices": [], 

557 "sensitive_data_handling": [], 

558 } 

559 

560 content_lower = content.lower() 

561 

562 # Potential vulnerabilities 

563 if "eval(" in content_lower: 

564 security_indicators["potential_vulnerabilities"].append("eval_usage") 

565 if "exec(" in content_lower: 

566 security_indicators["potential_vulnerabilities"].append("exec_usage") 

567 if "sql" in content_lower and any( 

568 keyword in content_lower for keyword in ["select", "insert", "update"] 

569 ): 

570 security_indicators["potential_vulnerabilities"].append("sql_queries") 

571 if "password" in content_lower and "plain" in content_lower: 

572 security_indicators["potential_vulnerabilities"].append( 

573 "plaintext_password" 

574 ) 

575 

576 # Security practices 

577 if any( 

578 keyword in content_lower 

579 for keyword in ["hash", "encrypt", "bcrypt", "pbkdf2"] 

580 ): 

581 security_indicators["security_practices"].append("password_hashing") 

582 if any(keyword in content_lower for keyword in ["csrf", "xss", "sanitize"]): 

583 security_indicators["security_practices"].append("web_security") 

584 if "https" in content_lower: 

585 security_indicators["security_practices"].append("secure_transport") 

586 

587 # Sensitive data patterns 

588 if any( 

589 keyword in content_lower 

590 for keyword in ["api_key", "secret", "token", "credential"] 

591 ): 

592 security_indicators["sensitive_data_handling"].append("credentials") 

593 if any( 

594 keyword in content_lower 

595 for keyword in ["email", "phone", "ssn", "credit_card"] 

596 ): 

597 security_indicators["sensitive_data_handling"].append("pii_data") 

598 

599 return security_indicators 

600 

601 def _analyze_performance_patterns(self, content: str) -> dict[str, Any]: 

602 """Analyze performance-related patterns in code. 

603 

604 Args: 

605 content: Code content 

606 

607 Returns: 

608 Dictionary of performance indicators 

609 """ 

610 performance_indicators = { 

611 "optimization_patterns": [], 

612 "potential_bottlenecks": [], 

613 "resource_usage": [], 

614 } 

615 

616 content_lower = content.lower() 

617 

618 # Optimization patterns 

619 if any(keyword in content_lower for keyword in ["cache", "memoize", "lazy"]): 

620 performance_indicators["optimization_patterns"].append("caching") 

621 if "async" in content_lower or "await" in content_lower: 

622 performance_indicators["optimization_patterns"].append("async_programming") 

623 if any( 

624 keyword in content_lower 

625 for keyword in ["parallel", "concurrent", "threading"] 

626 ): 

627 performance_indicators["optimization_patterns"].append("concurrency") 

628 

629 # Potential bottlenecks 

630 # Detect nested loops by looking for multiple for/while statements 

631 total_loops = content.count("for ") + content.count("while ") 

632 if total_loops >= 3: # 3+ loops likely indicates nesting 

633 performance_indicators["potential_bottlenecks"].append("nested_loops") 

634 

635 # Detect recursion patterns (exclude the definition line itself) 

636 lines = content.split("\n") 

637 def_pattern = re.compile( 

638 r"^\s*(?:async\s+)?def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(" 

639 ) 

640 for idx, line in enumerate(lines): 

641 match = def_pattern.match(line) 

642 if not match: 

643 continue 

644 func_name = match.group(1) 

645 # Count calls to the function name on lines other than the definition line 

646 bare_call_regex = re.compile(r"\b" + re.escape(func_name) + r"\s*\(") 

647 method_call_regex = re.compile(r"\." + re.escape(func_name) + r"\s*\(") 

648 call_count = 0 

649 for j, other_line in enumerate(lines): 

650 if j == idx: 

651 continue 

652 if bare_call_regex.search(other_line) or method_call_regex.search( 

653 other_line 

654 ): 

655 call_count += 1 

656 if call_count > 0: 

657 performance_indicators["potential_bottlenecks"].append("recursion") 

658 break 

659 

660 if content.count("database") > 5 or content.count("query") > 5: 

661 performance_indicators["potential_bottlenecks"].append("database_heavy") 

662 if content.count("file") > 10 or content.count("read") > 10: 

663 performance_indicators["potential_bottlenecks"].append("io_heavy") 

664 

665 # Resource usage 

666 if any( 

667 keyword in content_lower for keyword in ["memory", "buffer", "allocation"] 

668 ): 

669 performance_indicators["resource_usage"].append("memory_allocation") 

670 if any( 

671 keyword in content_lower for keyword in ["connection", "pool", "socket"] 

672 ): 

673 performance_indicators["resource_usage"].append("connection_management") 

674 

675 return performance_indicators 

676 

677 def _calculate_maintainability_metrics(self, content: str) -> dict[str, Any]: 

678 """Calculate maintainability-related metrics. 

679 

680 Args: 

681 content: Code content 

682 

683 Returns: 

684 Dictionary of maintainability metrics 

685 """ 

686 lines = content.split("\n") 

687 non_empty_lines = [line for line in lines if line.strip()] 

688 

689 # Calculate various metrics 

690 avg_line_length = sum(len(line) for line in lines) / len(lines) if lines else 0 

691 max_line_length = max(len(line) for line in lines) if lines else 0 

692 

693 # Count long lines (> 120 characters) 

694 long_lines = len([line for line in lines if len(line) > 120]) 

695 

696 # Calculate code density (non-empty lines / total lines) 

697 code_density = len(non_empty_lines) / len(lines) if lines else 0 

698 

699 # Estimate readability score based on various factors 

700 readability_score = 100 

701 if avg_line_length > 100: 

702 readability_score -= 20 

703 if max_line_length > 200: 

704 readability_score -= 15 

705 if long_lines > len(lines) * 0.3: 

706 readability_score -= 25 

707 if code_density < 0.5: 

708 readability_score -= 10 

709 

710 return { 

711 "avg_line_length": avg_line_length, 

712 "max_line_length": max_line_length, 

713 "long_lines_count": long_lines, 

714 "code_density": code_density, 

715 "readability_score": max(0, readability_score), 

716 "estimated_read_time_minutes": len(non_empty_lines) 

717 / 50, # ~50 lines per minute 

718 } 

719 

720 def _extract_language_specific_metadata( 

721 self, content: str, language: str 

722 ) -> dict[str, Any]: 

723 """Extract language-specific metadata. 

724 

725 Args: 

726 content: Code content 

727 language: Programming language 

728 

729 Returns: 

730 Language-specific metadata 

731 """ 

732 if language == "python": 

733 return self._extract_python_metadata(content) 

734 elif language in ["javascript", "typescript"]: 

735 return self._extract_javascript_metadata(content) 

736 elif language == "java": 

737 return self._extract_java_metadata(content) 

738 elif language in ["cpp", "c"]: 

739 return self._extract_c_cpp_metadata(content) 

740 else: 

741 # Return empty dict for unknown languages 

742 return {} 

743 

def _extract_python_metadata(self, content: str) -> dict[str, Any]:
    """Collect Python feature labels detected by substring checks.

    Returns:
        Dictionary with "python_features" (labels in a fixed order) and
        "python_version_indicators" (from the version-feature detector)
    """
    # (label, detected) pairs; order defines the order of the output list.
    checks = (
        (
            "async_await",
            "async def" in content or ("async" in content and "await" in content),
        ),
        ("decorators", "@" in content),
        (
            "type_hints",
            "typing" in content or "Type" in content or ":" in content,
        ),
        ("generators", "yield" in content),
        ("context_managers", "__enter__" in content and "__exit__" in content),
        ("dunder_methods", "__" in content),
        ("lambda_functions", "lambda" in content),
        ("dataclasses", "dataclass" in content or "@dataclass" in content),
    )
    detected = [label for label, present in checks if present]

    return {
        "python_features": detected,
        "python_version_indicators": self._detect_python_version_features(content),
    }

769 

770 def _extract_javascript_metadata(self, content: str) -> dict[str, Any]: 

771 """Extract JavaScript/TypeScript-specific metadata.""" 

772 features = [] 

773 

774 if "async" in content and "await" in content: 

775 features.append("async_await") 

776 if "=>" in content: 

777 features.append("arrow_functions") 

778 if "const" in content or "let" in content: 

779 features.append("es6_variables") 

780 if "class" in content: 

781 features.append("es6_classes") 

782 if "import" in content and "from" in content: 

783 features.append("es6_modules") 

784 if "${" in content: 

785 features.append("template_literals") 

786 if "{" in content and "}" in content and ("=" in content or "const" in content): 

787 features.append("destructuring") 

788 if "function*" in content or "yield" in content: 

789 features.append("generators") 

790 

791 return {"javascript_features": features} 

792 

793 def _extract_java_metadata(self, content: str) -> dict[str, Any]: 

794 """Extract Java-specific metadata.""" 

795 features = [] 

796 

797 if "interface" in content: 

798 features.append("interfaces") 

799 if "extends" in content: 

800 features.append("inheritance") 

801 if "implements" in content: 

802 features.append("interface_implementation") 

803 if "synchronized" in content: 

804 features.append("thread_synchronization") 

805 if "generic" in content or "<" in content and ">" in content: 

806 features.append("generics") 

807 if "@Override" in content or "@" in content: 

808 features.append("annotations") 

809 

810 return {"language_features": features} 

811 

812 def _extract_c_cpp_metadata(self, content: str) -> dict[str, Any]: 

813 """Extract C/C++-specific metadata.""" 

814 features = [] 

815 

816 if "#include" in content: 

817 features.append("header_includes") 

818 if "malloc" in content or "free" in content: 

819 features.append("manual_memory_management") 

820 if "pointer" in content or "->" in content: 

821 features.append("pointer_usage") 

822 if "template" in content: 

823 features.append("templates") 

824 if "namespace" in content: 

825 features.append("namespaces") 

826 if "inline" in content: 

827 features.append("inline_functions") 

828 

829 return {"language_features": features} 

830 

831 def _detect_python_version_features(self, content: str) -> list[str]: 

832 """Detect Python version-specific features.""" 

833 features = [] 

834 

835 if ":=" in content: 

836 features.append("walrus_operator_py38") 

837 if "match " in content and "case " in content: 

838 features.append("pattern_matching_py310") 

839 if 'f"' in content or "f'" in content: 

840 features.append("f_strings_py36") 

841 if "pathlib" in content: 

842 features.append("pathlib_py34") 

843 if "dataclass" in content: 

844 features.append("dataclasses_py37") 

845 

846 return features