Coverage for src/qdrant_loader/core/chunking/strategy/code/processor/analysis.py: 84%

1from __future__ import annotations

3from typing import Any

def analyze_code_content(content: str) -> dict[str, Any]:
    """Compute simple line-based metrics for a piece of source text.

    The text is split on ``"\\n"`` and every resulting line is inspected once.

    Args:
        content: Raw source text to analyse.

    Returns:
        A dict with the keys ``total_lines``, ``code_lines``,
        ``comment_lines``, ``blank_lines``, ``comment_ratio``,
        ``avg_line_length``, ``max_line_length``,
        ``indentation_consistency`` and ``has_documentation``.
    """
    comment_prefixes = ("#", "//", "/*", "--")
    all_lines = content.split("\n")
    total = len(all_lines)

    non_blank = 0
    comments = 0
    char_total = 0
    longest = 0
    for raw in all_lines:
        width = len(raw)
        char_total += width
        if width > longest:
            longest = width

        stripped = raw.strip()
        if not stripped:
            continue
        non_blank += 1
        # A comment line is one whose first non-whitespace characters are a
        # recognised comment opener.
        if stripped.startswith(comment_prefixes):
            comments += 1

    # Any docstring delimiter or C-style block-comment opener counts as
    # documentation.
    documented = any(marker in content for marker in ('"""', "'''", "/*"))

    return {
        "total_lines": total,
        "code_lines": non_blank - comments,
        "comment_lines": comments,
        "blank_lines": total - non_blank,
        "comment_ratio": comments / non_blank if non_blank else 0,
        "avg_line_length": char_total / total if total else 0,
        "max_line_length": longest,
        "indentation_consistency": _check_indentation_consistency(all_lines),
        "has_documentation": documented,
    }

29def _check_indentation_consistency(lines: list[str]) -> bool:

30 spaces = sum(1 for line in lines if line.startswith(" "))

31 tabs = sum(1 for line in lines if line.startswith("\t"))

32 return not (spaces > 0 and tabs > 0)

def extract_language_context(
    content: str, chunk_metadata: dict[str, Any]
) -> dict[str, Any]:
    """Assemble language-related context for a chunk of source text.

    Args:
        content: Source text of the chunk.
        chunk_metadata: Metadata for the chunk; its ``"language"`` entry is
            used, falling back to ``"unknown"`` when absent.

    Returns:
        A dict with the keys ``language``, ``paradigm``,
        ``framework_indicators``, ``version_indicators`` and
        ``style_conventions``, each filled in by the matching helper.
    """
    lang = chunk_metadata.get("language", "unknown")

    context: dict[str, Any] = {"language": lang}
    context["paradigm"] = _identify_programming_paradigm(content, lang)
    context["framework_indicators"] = _identify_frameworks(content, lang)
    context["version_indicators"] = _identify_language_version(content, lang)
    context["style_conventions"] = _analyze_style_conventions(content, lang)
    return context

48def _identify_programming_paradigm(content: str, language: str) -> str:

49 if language in ["python", "java", "c_sharp", "typescript", "javascript"]:

50 return "object_oriented"

51 if language in ["c", "cpp", "go", "rust"]:

52 return "procedural"

53 return "mixed"

56def _identify_frameworks(content: str, language: str) -> list[str]:

57 lower = content.lower()

58 if language in ["python"] and any(

59 k in lower for k in ["django", "flask", "fastapi"]

60 ):

61 return [k for k in ["django", "flask", "fastapi"] if k in lower]

62 if language in ["javascript", "typescript"] and any(

63 k in lower for k in ["react", "vue", "angular", "next"]

64 ):

65 return [k for k in ["react", "vue", "angular", "next"] if k in lower]

66 return []

69def _identify_language_version(content: str, language: str) -> str:

70 if language == "python" and (":=" in content or "match " in content):

71 return "3.x"

72 if language in ["javascript", "typescript"] and any(

73 k in content for k in ["=>", "const", "let"]

74 ):

75 return "ES6+"

76 return "unknown"

79def _analyze_style_conventions(content: str, language: str) -> dict[str, Any]:

80 lines = content.split("\n")

81 snake_case = sum(1 for line in lines if "_" in line)

82 camel_case = sum(1 for line in lines if any(c.isupper() for c in line.split()))

83 return {

84 "snake_case_indicators": snake_case,

85 "camel_case_indicators": camel_case,

86 }