Coverage for src/qdrant_loader/core/chunking/strategy/code/metadata/language_specific.py: 63%

93 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5 

def extract_language_specific_metadata(content: str, language: str) -> dict[str, Any]:
    """Route *content* to the metadata extractor registered for *language*.

    Args:
        content: Source text of the code to inspect.
        language: Language identifier. It is compared exactly (case-sensitive)
            against ``"python"``, ``"javascript"``, ``"typescript"``,
            ``"java"``, ``"cpp"`` and ``"c"``.

    Returns:
        Whatever the matching extractor returns, or an empty dictionary when
        no extractor is registered for *language*.
    """
    extractors = (
        (("python",), extract_python_metadata),
        (("javascript", "typescript"), extract_javascript_metadata),
        (("java",), extract_java_metadata),
        (("cpp", "c"), extract_c_cpp_metadata),
    )
    for language_names, extractor in extractors:
        if language in language_names:
            return extractor(content)
    # Unrecognised languages contribute no language-specific metadata.
    return {}

17 

18 

def extract_python_metadata(content: str) -> dict[str, Any]:
    """Detect Python language features in *content* using text heuristics.

    The checks are substring and regular-expression probes over the raw
    text. Nothing is parsed, so a match inside a comment or a string literal
    still counts.

    Args:
        content: Python source text to inspect.

    Returns:
        A dictionary with two keys:

        * ``"python_features"``: detected feature labels, always in this
          order: ``async_await``, ``decorators``, ``type_hints``,
          ``generators``, ``context_managers``, ``dunder_methods``,
          ``lambda_functions``, ``dataclasses``.
        * ``"python_version_indicators"``: the list returned by
          ``detect_python_version_features(content)``.
    """
    import re  # Local import so the block does not rely on the module's import list.

    # One annotation operand, e.g. ``int``, ``typing.Any`` or ``dict[str, Any]``.
    type_expr = r"[A-Za-z_][\w.]*(?:\[[^\n]*\])?"
    type_hint_patterns = (
        # Return annotation: ``) -> T``.
        r"\)\s*->",
        # Annotated parameter inside a ``def`` signature: ``def f(x: int``.
        r"\bdef\s+\w+\s*\([^)]*?[A-Za-z_]\w*\s*:\s*[^\s)]",
        # Annotated variable or field on its own line: ``x: int`` or
        # ``x: int | None = None``. Keywords that may be followed directly by
        # a colon are excluded so ``else: pass`` is not mistaken for one.
        r"^[ \t]*(?!(?:else|try|finally|except|lambda)\b)[A-Za-z_][\w.]*[ \t]*:[ \t]*"
        rf"{type_expr}(?:[ \t]*\|[ \t]*{type_expr})*[ \t]*(?:=(?!=).*|#.*)?$",
    )
    # A bare ``":" in content`` test is true for any ``def``, ``if`` or dict
    # literal, so type hints are recognised by annotation shape instead.
    has_type_hints = "typing" in content or any(
        re.search(pattern, content, re.MULTILINE) for pattern in type_hint_patterns
    )
    # A decorator is an ``@name`` that opens a line; a bare ``"@"`` test also
    # fires on the matrix-multiplication operator and on e-mail addresses.
    has_decorators = (
        re.search(r"^[ \t]*@[A-Za-z_]", content, re.MULTILINE) is not None
    )

    features: list[str] = []
    if "async def" in content or ("async" in content and "await" in content):
        features.append("async_await")
    if has_decorators:
        features.append("decorators")
    if has_type_hints:
        features.append("type_hints")
    if "yield" in content:
        features.append("generators")
    if "__enter__" in content and "__exit__" in content:
        features.append("context_managers")
    if "__" in content:
        features.append("dunder_methods")
    if "lambda" in content:
        features.append("lambda_functions")
    # "@dataclass" contains "dataclass", so a single substring test suffices.
    if "dataclass" in content:
        features.append("dataclasses")

    return {
        "python_features": features,
        "python_version_indicators": detect_python_version_features(content),
    }

42 

43 

def extract_javascript_metadata(content: str) -> dict[str, Any]:
    """Detect JavaScript/TypeScript language features in *content*.

    Detection is heuristic: the raw text is probed with substring tests and
    regular expressions, without parsing, so text inside comments or string
    literals can still match. Keywords are matched on word boundaries so that
    identifiers such as ``className`` or ``deleted`` do not count as the
    ``class`` or ``let`` keywords.

    Args:
        content: JavaScript or TypeScript source text to inspect.

    Returns:
        ``{"javascript_features": [...]}`` where the list holds the detected
        labels, always in this order: ``async_await``, ``arrow_functions``,
        ``es6_variables``, ``es6_classes``, ``es6_modules``,
        ``template_literals``, ``destructuring``, ``generators``.
    """
    import re  # Local import so the block does not rely on the module's import list.

    def found(pattern: str) -> bool:
        """Return True when *pattern* occurs anywhere in the content."""
        return re.search(pattern, content) is not None

    destructuring_patterns = (
        # Declaration: ``const {a, b} = obj`` / ``let [x, y] = pair``.
        r"\b(?:const|let|var)\s*[\[{]",
        # Assignment to an object pattern: ``({a, b} = obj)``.
        r"(?<![\w$)\]])\{[^\n;={}]*\}\s*=(?![=>])",
        # Assignment to an array pattern: ``[a, b] = [b, a]``. The lookbehind
        # rejects indexing such as ``arr[0] = 1``.
        r"(?<![\w$)\]])\[[^\n;=\[\]]*\]\s*=(?![=>])",
        # Object pattern in a parameter list: ``({a, b}) =>`` or
        # ``function f({a, b}) {``.
        r"\([^(){}\n]*\{[^(){}=:\n]*\}[^(){}\n]*\)\s*(?:=>|\{)",
    )

    features: list[str] = []
    if found(r"\basync\b") and found(r"\bawait\b"):
        features.append("async_await")
    if "=>" in content:
        features.append("arrow_functions")
    if found(r"\b(?:const|let)\b"):
        features.append("es6_variables")
    # ``class Name`` / ``class extends`` / ``class {``, but not ``class="..."``.
    if found(r"\bclass\b\s*(?:[A-Za-z_$]|\{)"):
        features.append("es6_classes")
    # ``import ... from "module"`` within a single statement.
    if found(r"\bimport\b[^;]*?\bfrom\s*[\"']"):
        features.append("es6_modules")
    if "${" in content:
        features.append("template_literals")
    if any(found(pattern) for pattern in destructuring_patterns):
        features.append("destructuring")
    if found(r"\bfunction\s*\*") or found(r"\byield\b"):
        features.append("generators")
    return {"javascript_features": features}

63 

64 

def extract_java_metadata(content: str) -> dict[str, Any]:
    """Detect Java language features in *content* with substring checks.

    Every check is a plain ``in`` test on the raw text, so occurrences inside
    comments, string literals or longer identifiers also count.

    Args:
        content: Java source text to inspect.

    Returns:
        ``{"language_features": [...]}`` where the list holds the detected
        labels, always in this order: ``interfaces``, ``inheritance``,
        ``interface_implementation``, ``thread_synchronization``,
        ``generics``, ``annotations``.
    """
    has_angle_brackets = "<" in content and ">" in content
    checks = (
        ("interfaces", "interface" in content),
        ("inheritance", "extends" in content),
        ("interface_implementation", "implements" in content),
        ("thread_synchronization", "synchronized" in content),
        ("generics", "generic" in content or has_angle_brackets),
        # Any "@" counts; "@Override" necessarily contains one.
        ("annotations", "@" in content),
    )
    return {"language_features": [label for label, present in checks if present]}

80 

81 

def extract_c_cpp_metadata(content: str) -> dict[str, Any]:
    """Detect C/C++ language features in *content* with substring checks.

    A feature is reported when any of its marker strings occurs anywhere in
    the raw text, including inside comments, string literals or longer
    identifiers.

    Args:
        content: C or C++ source text to inspect.

    Returns:
        ``{"language_features": [...]}`` where the list holds the detected
        labels, always in this order: ``header_includes``,
        ``manual_memory_management``, ``pointer_usage``, ``templates``,
        ``namespaces``, ``inline_functions``.
    """
    markers_by_feature = (
        ("header_includes", ("#include",)),
        ("manual_memory_management", ("malloc", "free")),
        ("pointer_usage", ("pointer", "->")),
        ("templates", ("template",)),
        ("namespaces", ("namespace",)),
        ("inline_functions", ("inline",)),
    )
    detected = [
        label
        for label, markers in markers_by_feature
        if any(marker in content for marker in markers)
    ]
    return {"language_features": detected}

97 

98 

def detect_python_version_features(content: str) -> list[str]:
    """Report syntax and library markers tied to specific Python versions.

    Detection is heuristic: the raw text is probed with substring tests and
    regular expressions, without parsing, so text inside comments or string
    literals can still match.

    Args:
        content: Python source text to inspect.

    Returns:
        The detected labels, always in this order:
        ``walrus_operator_py38``, ``pattern_matching_py310``,
        ``f_strings_py36``, ``pathlib_py34``, ``dataclasses_py37``.
    """
    import re  # Local import so the block does not rely on the module's import list.

    def found(pattern: str) -> bool:
        """Return True when *pattern* matches somewhere in the content."""
        return re.search(pattern, content, re.MULTILINE) is not None

    features: list[str] = []
    if ":=" in content:
        features.append("walrus_operator_py38")
    # Require statement-shaped lines (``match <subject>:`` and ``case <pattern>:``)
    # rather than the bare words, which also occur in ordinary prose.
    if found(r"^[ \t]*match\b[^\n]*:[ \t]*(?:#.*)?$") and found(
        r"^[ \t]*case\b[^\n]*:"
    ):
        features.append("pattern_matching_py310")
    # An f-string prefix (f, F, rf, fr, ...) must not be the tail of a word or
    # of a string literal: ``"self"`` ends in ``f"`` but is not an f-string.
    if found(r"(?<![\w\"'])(?:[fF][rR]?|[rR][fF])[\"']"):
        features.append("f_strings_py36")
    if "pathlib" in content:
        features.append("pathlib_py34")
    if "dataclass" in content:
        features.append("dataclasses_py37")
    return features