Coverage for src/qdrant_loader/core/chunking/strategy/code/processor/analysis.py: 84%

38 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5 

def analyze_code_content(content: str) -> dict[str, Any]:
    """Compute simple line-based metrics for a piece of source text.

    The text is split on ``"\\n"`` and every resulting line is classified as
    blank (nothing left after stripping), comment (stripped text starts with
    ``#``, ``//``, ``/*`` or ``--``) or code (any other non-blank line).

    Args:
        content: Raw source text to analyze.

    Returns:
        A dict with line counts (``total_lines``, ``code_lines``,
        ``comment_lines``, ``blank_lines``), ``comment_ratio`` (comment lines
        divided by non-blank lines), ``avg_line_length``, ``max_line_length``,
        ``indentation_consistency`` and ``has_documentation``.
    """
    comment_prefixes = ("#", "//", "/*", "--")
    doc_markers = ('"""', "'''", "/*")

    lines = content.split("\n")
    total = len(lines)
    lengths = [len(line) for line in lines]

    # Classify every line in a single pass; a comment line is by definition
    # non-blank, so it contributes to both counters.
    non_blank = 0
    comments = 0
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        non_blank += 1
        if stripped.startswith(comment_prefixes):
            comments += 1

    if non_blank:
        comment_ratio = comments / non_blank
    else:
        comment_ratio = 0

    if lines:
        avg_line_length = sum(lengths) / total
        max_line_length = max(lengths)
    else:
        avg_line_length = 0
        max_line_length = 0

    return {
        "total_lines": total,
        "code_lines": non_blank - comments,
        "comment_lines": comments,
        "blank_lines": total - non_blank,
        "comment_ratio": comment_ratio,
        "avg_line_length": avg_line_length,
        "max_line_length": max_line_length,
        "indentation_consistency": _check_indentation_consistency(lines),
        "has_documentation": any(marker in content for marker in doc_markers),
    }

27 

28 

29def _check_indentation_consistency(lines: list[str]) -> bool: 

30 spaces = sum(1 for line in lines if line.startswith(" ")) 

31 tabs = sum(1 for line in lines if line.startswith("\t")) 

32 return not (spaces > 0 and tabs > 0) 

33 

34 

def extract_language_context(
    content: str, chunk_metadata: dict[str, Any]
) -> dict[str, Any]:
    """Assemble language-related context for a code chunk.

    Args:
        content: Source text of the chunk.
        chunk_metadata: Chunk metadata; its ``"language"`` entry is used and
            falls back to ``"unknown"`` when the key is absent.

    Returns:
        A dict with ``language``, ``paradigm``, ``framework_indicators``,
        ``version_indicators`` and ``style_conventions``.
    """
    language = chunk_metadata.get("language", "unknown")

    # Each analyzer takes (content, language); insertion order fixes both the
    # call order and the key order of the result.
    analyzers = {
        "paradigm": _identify_programming_paradigm,
        "framework_indicators": _identify_frameworks,
        "version_indicators": _identify_language_version,
        "style_conventions": _analyze_style_conventions,
    }

    context: dict[str, Any] = {"language": language}
    for key, analyzer in analyzers.items():
        context[key] = analyzer(content, language)
    return context

46 

47 

48def _identify_programming_paradigm(content: str, language: str) -> str: 

49 if language in ["python", "java", "c_sharp", "typescript", "javascript"]: 

50 return "object_oriented" 

51 if language in ["c", "cpp", "go", "rust"]: 

52 return "procedural" 

53 return "mixed" 

54 

55 

56def _identify_frameworks(content: str, language: str) -> list[str]: 

57 lower = content.lower() 

58 if language in ["python"] and any( 

59 k in lower for k in ["django", "flask", "fastapi"] 

60 ): 

61 return [k for k in ["django", "flask", "fastapi"] if k in lower] 

62 if language in ["javascript", "typescript"] and any( 

63 k in lower for k in ["react", "vue", "angular", "next"] 

64 ): 

65 return [k for k in ["react", "vue", "angular", "next"] if k in lower] 

66 return [] 

67 

68 

69def _identify_language_version(content: str, language: str) -> str: 

70 if language == "python" and (":=" in content or "match " in content): 

71 return "3.x" 

72 if language in ["javascript", "typescript"] and any( 

73 k in content for k in ["=>", "const", "let"] 

74 ): 

75 return "ES6+" 

76 return "unknown" 

77 

78 

79def _analyze_style_conventions(content: str, language: str) -> dict[str, Any]: 

80 lines = content.split("\n") 

81 snake_case = sum(1 for line in lines if "_" in line) 

82 camel_case = sum(1 for line in lines if any(c.isupper() for c in line.split())) 

83 return { 

84 "snake_case_indicators": snake_case, 

85 "camel_case_indicators": camel_case, 

86 }