Coverage for src/qdrant_loader/core/chunking/strategy/code/code_chunk_processor.py: 66% of 96 statements
coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Code chunk processor for creating enhanced code chunk documents."""
3from typing import Any
5import structlog
7from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor
8from qdrant_loader.core.chunking.strategy.code.processor.analysis import (
9 analyze_code_content,
10 extract_language_context,
11)
12from qdrant_loader.core.chunking.strategy.code.processor.quality import (
13 assess_code_quality,
14 assess_educational_value,
15 calculate_reusability_score,
16)
17from qdrant_loader.core.document import Document
19logger = structlog.get_logger(__name__)


class CodeChunkProcessor(BaseChunkProcessor):
    """Chunk processor for code documents with programming language context."""

    def __init__(self, settings):
        super().__init__(settings)
        self.logger = logger
        self.code_config = getattr(
            settings.global_config.chunking.strategies, "code", None
        )
        self.max_chunk_size_for_nlp = getattr(
            self.code_config, "max_chunk_size_for_nlp", 20000
        )
        self.skip_conditions = {
            "large_content": self.max_chunk_size_for_nlp,
            "binary_patterns": ["\x00", "\xff", "\xfe"],
            "minified_code_threshold": 0.1,
            "generated_code_patterns": [
                "auto-generated",
                "do not edit",
                "generated by",
            ],
        }
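
    # The skip conditions above gate semantic (NLP) analysis:
    #   - "large_content": chunks longer than max_chunk_size_for_nlp characters
    #   - "binary_patterns": NUL/BOM-like bytes that signal non-text content
    #   - "minified_code_threshold": fraction of {};-heavy lines tolerated
    #     before _is_minified_code flags the chunk
    #   - "generated_code_patterns": marker phrases matched by _is_generated_code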

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        chunk_metadata: dict[str, Any],
        skip_nlp: bool = False,
    ) -> Document:
        chunk_id = self.generate_chunk_id(original_doc, chunk_index)
        base_metadata = self.create_base_chunk_metadata(
            original_doc, chunk_index, total_chunks, chunk_metadata
        )
        code_metadata = self._create_code_specific_metadata(
            chunk_content, chunk_metadata, original_doc
        )
        base_metadata.update(code_metadata)
        if not skip_nlp:
            skip_nlp, skip_reason = self.should_skip_semantic_analysis(
                chunk_content, chunk_metadata
            )
            if skip_nlp:
                base_metadata["nlp_skip_reason"] = skip_reason
        return Document(
            id=chunk_id,
            content=chunk_content,
            metadata=base_metadata,
            source=original_doc.source,
            source_type=original_doc.source_type,
            url=original_doc.url,
            content_type=original_doc.content_type,
            title=self._generate_chunk_title(original_doc, chunk_metadata, chunk_index),
        )
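
    # Illustrative usage (a sketch; `settings` and `doc` are hypothetical
    # objects standing in for real Settings and Document instances):
    #
    #     processor = CodeChunkProcessor(settings)
    #     chunk = processor.create_chunk_document(
    #         original_doc=doc,
    #         chunk_content="def add(a, b):\n    return a + b\n",
    #         chunk_index=0,
    #         total_chunks=3,
    #         chunk_metadata={"element_type": "function", "element_name": "add"},
    #     )
    #     # chunk.metadata combines the base chunk metadata with the
    #     # code-specific keys, e.g. chunk.metadata["chunking_strategy"]
    #     # == "code_modular".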

    def should_skip_semantic_analysis(
        self, chunk_content: str, chunk_metadata: dict[str, Any]
    ) -> tuple[bool, str]:
        content_length = len(chunk_content)
        if content_length > self.skip_conditions["large_content"]:
            return True, "content_too_large"
        if any(
            pattern in chunk_content
            for pattern in self.skip_conditions["binary_patterns"]
        ):
            return True, "binary_content"
        if self._is_minified_code(chunk_content):
            return True, "minified_code"
        if self._is_generated_code(chunk_content):
            return True, "generated_code"
        if self._is_mostly_comments(chunk_content):
            return True, "mostly_comments"
        if chunk_metadata.get("element_type") == "test" and content_length < 500:
            return True, "simple_test_code"
        if chunk_metadata.get("language") in ["json", "yaml", "xml", "ini"]:
            return True, "configuration_file"
        return False, "suitable_for_nlp"
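
    # Illustrative outcomes (a sketch, not test fixtures):
    #     ("x" * 30000, {})                      -> (True, "content_too_large")
    #     ("key: value", {"language": "yaml"})   -> (True, "configuration_file")
    #     ("def f():\n    return 1", {})         -> (False, "suitable_for_nlp")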

    def _create_code_specific_metadata(
        self, content: str, chunk_metadata: dict[str, Any], original_doc: Document
    ) -> dict[str, Any]:
        return {
            "content_analysis": analyze_code_content(content),
            "language_context": extract_language_context(content, chunk_metadata),
            "code_quality": assess_code_quality(content, chunk_metadata),
            "educational_value": assess_educational_value(content, chunk_metadata),
            "reusability_score": calculate_reusability_score(content, chunk_metadata),
            "chunking_strategy": "code_modular",
            "element_context": (
                self._extract_element_context(
                    content, chunk_metadata.get("element_type", "unknown")
                )
                if chunk_metadata.get("element_type", "unknown") != "unknown"
                else None
            ),
        }
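
    # Note: the "element_context" key is always present, but its value is None
    # whenever the chunk's element_type is "unknown".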

    def _generate_chunk_title(
        self, original_doc: Document, chunk_metadata: dict[str, Any], chunk_index: int
    ) -> str:
        base_title = original_doc.title
        element_name = chunk_metadata.get("element_name")
        element_type = chunk_metadata.get("element_type", "code")
        if element_name:
            return f"{base_title} — {element_type}: {element_name} (Part {chunk_index + 1})"
        if element_type and element_type != "code":
            return f"{base_title} — {element_type.title()} Part {chunk_index + 1}"
        return f"{base_title} — Code Chunk {chunk_index + 1}"

    # Local helpers (unchanged logic)
    def _is_minified_code(self, content: str) -> bool:
        lines = content.split("\n")
        non_empty = [line for line in lines if line.strip()]
        if not non_empty:
            return False
        avg_len = sum(len(line) for line in non_empty) / len(non_empty)
        specials = sum(
            1 for line in non_empty if any(ch in line for ch in ["{", "}", ";"])
        )
        ratio = specials / len(non_empty)
        return avg_len > 200 and ratio > self.skip_conditions["minified_code_threshold"]
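
    # Heuristic: flags content whose non-empty lines average more than 200
    # characters and where over 10% of those lines contain "{", "}", or ";"
    # (typical of bundled or minified JavaScript).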

    def _is_generated_code(self, content: str) -> bool:
        lower = content.lower()
        return any(
            pat in lower for pat in self.skip_conditions["generated_code_patterns"]
        )
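
    # Example: a file header such as "Generated by protoc. DO NOT EDIT." matches
    # both "generated by" and "do not edit" after lowercasing.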

    def _is_mostly_comments(self, content: str) -> bool:
        lines = content.split("\n")
        if not lines:
            return False
        comment_lines = [
            line for line in lines if line.strip().startswith(("#", "//", "/*", "--"))
        ]
        return len(comment_lines) / len(lines) > 0.6
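
    # Example: a chunk that is only a license header (every line starting with
    # "#" or "//") exceeds the 0.6 ratio and is skipped as "mostly_comments".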

    def _has_meaningful_names(self, content: str) -> bool:
        bad_names = ["tmp", "foo", "bar", "baz", "var", "data", "x", "y", "z"]
        return not any(f" {n} " in content for n in bad_names)
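
    # Note: this is a coarse check; it only matches names surrounded by single
    # spaces, so occurrences like "tmp," or "(foo)" slip through.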

    def _determine_learning_level(
        self, content: str, chunk_metadata: dict[str, Any]
    ) -> str:
        complexity = chunk_metadata.get("complexity", 0)
        if complexity < 2:
            return "beginner"
        if complexity < 6:
            return "intermediate"
        return "advanced"

    def _identify_programming_concepts(self, content: str) -> list[str]:
        concepts: list[str] = []
        lower = content.lower()
        for k in [
            "recursion",
            "memoization",
            "concurrency",
            "polymorphism",
            "inheritance",
        ]:
            if k in lower:
                concepts.append(k)
        return concepts
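
    # Illustrative sketch (hypothetical input):
    #     _identify_programming_concepts("uses memoization and recursion")
    #         -> ["recursion", "memoization"]  # order follows the keyword list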

    def _extract_element_context(
        self, content: str, element_type: str
    ) -> dict[str, Any]:
        context: dict[str, Any] = {"element_type": element_type}
        if element_type in ["function", "method"]:
            context["has_return_statement"] = "return" in content
            context["param_count_estimate"] = (
                content.split("(", 1)[-1].split(")")[0].count(",") + 1
                if "(" in content and ")" in content
                else 0
            )
        elif element_type == "class":
            context["has_init"] = "__init__" in content
            context["method_count_estimate"] = content.count("def ")
        return context
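
    # Illustrative sketch (hypothetical input):
    #     _extract_element_context("def add(a, b):\n    return a + b", "function")
    #         -> {"element_type": "function",
    #             "has_return_statement": True,
    #             "param_count_estimate": 2}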