Coverage for src/qdrant_loader/core/chunking/strategy/code/processor/analysis.py: 84%
38 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1from __future__ import annotations
3from typing import Any
def analyze_code_content(content: str) -> dict[str, Any]:
    """Compute simple line-based metrics for a piece of source text.

    Args:
        content: Raw source text; it is split on ``"\\n"`` only.

    Returns:
        A dict with these keys:

        * ``total_lines`` - number of ``"\\n"``-separated lines.
        * ``code_lines`` - non-blank lines that are not comment lines.
        * ``comment_lines`` - lines whose stripped text starts with ``#``,
          ``//``, ``/*`` or ``--``.
        * ``blank_lines`` - lines that are empty after stripping.
        * ``comment_ratio`` - comment lines divided by non-blank lines
          (``0`` when there are no non-blank lines).
        * ``avg_line_length`` / ``max_line_length`` - based on raw,
          unstripped line lengths.
        * ``indentation_consistency`` - result of
          ``_check_indentation_consistency`` on the split lines.
        * ``has_documentation`` - whether a triple-quote or ``/*`` marker
          occurs anywhere in the text.
    """
    comment_prefixes = ("#", "//", "/*", "--")
    doc_markers = ('"""', "'''", "/*")

    all_lines = content.split("\n")
    total = len(all_lines)
    lengths = [len(text) for text in all_lines]

    # Single pass: blank lines never start with a comment prefix, so comment
    # lines are always a subset of the non-blank lines.
    non_blank = 0
    comments = 0
    for text in all_lines:
        stripped = text.strip()
        if not stripped:
            continue
        non_blank += 1
        if stripped.startswith(comment_prefixes):
            comments += 1

    if non_blank:
        comment_ratio = comments / non_blank
    else:
        comment_ratio = 0

    if total:
        avg_line_length = sum(lengths) / total
    else:
        avg_line_length = 0

    return {
        "total_lines": total,
        "code_lines": non_blank - comments,
        "comment_lines": comments,
        "blank_lines": total - non_blank,
        "comment_ratio": comment_ratio,
        "avg_line_length": avg_line_length,
        "max_line_length": max(lengths, default=0),
        "indentation_consistency": _check_indentation_consistency(all_lines),
        "has_documentation": any(marker in content for marker in doc_markers),
    }
29def _check_indentation_consistency(lines: list[str]) -> bool:
30 spaces = sum(1 for line in lines if line.startswith(" "))
31 tabs = sum(1 for line in lines if line.startswith("\t"))
32 return not (spaces > 0 and tabs > 0)
def extract_language_context(
    content: str, chunk_metadata: dict[str, Any]
) -> dict[str, Any]:
    """Assemble language-related context for a chunk of source text.

    Args:
        content: The source text of the chunk.
        chunk_metadata: Metadata for the chunk; its ``"language"`` entry is
            read, defaulting to ``"unknown"`` when the key is absent.

    Returns:
        A dict with the keys ``language``, ``paradigm``,
        ``framework_indicators``, ``version_indicators`` and
        ``style_conventions``, each filled in by the matching private helper.
    """
    language = chunk_metadata.get("language", "unknown")

    context: dict[str, Any] = {"language": language}
    context["paradigm"] = _identify_programming_paradigm(content, language)
    context["framework_indicators"] = _identify_frameworks(content, language)
    context["version_indicators"] = _identify_language_version(content, language)
    context["style_conventions"] = _analyze_style_conventions(content, language)
    return context
48def _identify_programming_paradigm(content: str, language: str) -> str:
49 if language in ["python", "java", "c_sharp", "typescript", "javascript"]:
50 return "object_oriented"
51 if language in ["c", "cpp", "go", "rust"]:
52 return "procedural"
53 return "mixed"
56def _identify_frameworks(content: str, language: str) -> list[str]:
57 lower = content.lower()
58 if language in ["python"] and any(
59 k in lower for k in ["django", "flask", "fastapi"]
60 ):
61 return [k for k in ["django", "flask", "fastapi"] if k in lower]
62 if language in ["javascript", "typescript"] and any(
63 k in lower for k in ["react", "vue", "angular", "next"]
64 ):
65 return [k for k in ["react", "vue", "angular", "next"] if k in lower]
66 return []
69def _identify_language_version(content: str, language: str) -> str:
70 if language == "python" and (":=" in content or "match " in content):
71 return "3.x"
72 if language in ["javascript", "typescript"] and any(
73 k in content for k in ["=>", "const", "let"]
74 ):
75 return "ES6+"
76 return "unknown"
79def _analyze_style_conventions(content: str, language: str) -> dict[str, Any]:
80 lines = content.split("\n")
81 snake_case = sum(1 for line in lines if "_" in line)
82 camel_case = sum(1 for line in lines if any(c.isupper() for c in line.split()))
83 return {
84 "snake_case_indicators": snake_case,
85 "camel_case_indicators": camel_case,
86 }