Coverage for src/qdrant_loader/core/chunking/strategy/json/json_chunk_processor.py: 100%
97 statements
coverage.py v7.10.3, created at 2025-08-13 09:19 +0000
1"""JSON chunk processor for creating optimized chunk documents."""
3from typing import Any
5import structlog
7from qdrant_loader.config import Settings
8from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor
9from qdrant_loader.core.chunking.strategy.json.json_document_parser import JSONElement
10from qdrant_loader.core.document import Document
12logger = structlog.get_logger(__name__)


class JSONChunkProcessor(BaseChunkProcessor):
    """Chunk processor for JSON documents."""

    def __init__(self, settings: Settings):
        """Initialize JSON chunk processor.

        Args:
            settings: Configuration settings
        """
        super().__init__(settings)
        self.json_config = settings.global_config.chunking.strategies.json_strategy
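
    # JSON strategy options used elsewhere in this class (names taken from the usages
    # below): ``json_config.max_chunk_size_for_nlp`` and
    # ``json_config.enable_schema_inference``.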

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        chunk_metadata: dict[str, Any],
        skip_nlp: bool = False,
    ) -> Document:
        """Create a chunk document with JSON-specific optimizations.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            chunk_metadata: Metadata specific to this chunk
            skip_nlp: Whether to skip expensive NLP processing

        Returns:
            Document representing the chunk
        """
        # Determine whether NLP should be skipped based on content characteristics
        skip_nlp = skip_nlp or self._should_skip_nlp_for_json(
            chunk_content, chunk_metadata
        )
        # Record the decision on a copy of the metadata, mirroring the other create methods
        chunk_metadata = {**chunk_metadata, "nlp_skipped": skip_nlp}

        # Create base chunk document
        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=self._create_enhanced_metadata(
                original_doc, chunk_metadata, chunk_index, total_chunks
            ),
        )

        return chunk_doc
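
    # Illustrative usage sketch (hypothetical values; a real ``original_doc`` comes from
    # the ingestion pipeline and ``settings`` from the loaded configuration):
    #
    #     processor = JSONChunkProcessor(settings)
    #     chunk = processor.create_chunk_document(
    #         original_doc=doc,
    #         chunk_content='{"name": "example", "enabled": true}',
    #         chunk_index=0,
    #         total_chunks=3,
    #         chunk_metadata={"json_type": "dict"},
    #     )
    #     # chunk.title == f"{doc.title}_chunk_1"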

    def create_optimized_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        skip_nlp: bool = True,
    ) -> Document:
        """Create an optimized chunk document for large JSON elements.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            skip_nlp: Whether to skip NLP processing (default True for optimization)

        Returns:
            Optimized Document representing the chunk
        """
        # Create minimal metadata for large chunks
        minimal_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(chunk_content),
            "content_type": "json",
            "processing_mode": "optimized",
            "nlp_skipped": skip_nlp,
            "optimization_reason": "large_json_chunk",
        }

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, minimal_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc
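
    # Sketch of the intended call path (an assumption about the calling chunking
    # strategy, which is not part of this module):
    #
    #     if len(chunk_content) > processor.json_config.max_chunk_size_for_nlp:
    #         chunk = processor.create_optimized_chunk_document(
    #             doc, chunk_content, chunk_index, total_chunks
    #         )
    #
    # The chunk's metadata starts from the minimal dictionary above, is enhanced with
    # the original document's metadata and chunking info, and is marked with
    # "processing_mode": "optimized" and "optimization_reason": "large_json_chunk".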

    def create_json_element_chunk_document(
        self,
        original_doc: Document,
        element: JSONElement,
        chunk_index: int,
        total_chunks: int,
        element_metadata: dict[str, Any] | None = None,
    ) -> Document:
122 """Create a chunk document from a JSON element.
124 Args:
125 original_doc: Original source document
126 element: JSON element to create chunk from
127 chunk_index: Index of this chunk
128 total_chunks: Total number of chunks
129 element_metadata: Additional metadata for the element
131 Returns:
132 Document representing the chunk
133 """
        # Determine if we should skip NLP
        skip_nlp = element.size > self.json_config.max_chunk_size_for_nlp

        # Combine element metadata with chunk metadata
        chunk_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(element.content),
            "content_type": "json",
            "element_type": element.element_type.value,
            "element_name": element.name,
            "json_path": element.path,
            "nesting_level": element.level,
            "item_count": element.item_count,
            "nlp_skipped": skip_nlp,
        }

        if element_metadata:
            chunk_metadata.update(element_metadata)

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, chunk_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=element.content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc
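
    # Illustrative example (``JSONElement`` attribute names are taken from the usages
    # above; the concrete values are hypothetical):
    #
    #     element = ...  # e.g. a "users" array produced by the JSON document parser,
    #                    # with element.path == "users" and element.level == 1
    #     chunk = processor.create_json_element_chunk_document(
    #         doc, element, chunk_index=2, total_chunks=5,
    #         element_metadata={"structure_type": "data_container"},
    #     )
    #     # chunk.metadata["json_path"] == element.path
    #     # chunk.metadata["nlp_skipped"] is True when element.size exceeds
    #     # json_config.max_chunk_size_for_nlp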

    def _should_skip_nlp_for_json(self, content: str, metadata: dict[str, Any]) -> bool:
        """Determine if NLP processing should be skipped for JSON content.

        Args:
            content: JSON content to analyze
            metadata: Chunk metadata

        Returns:
            True if NLP should be skipped
        """
        # Skip NLP for large chunks
        if len(content) > self.json_config.max_chunk_size_for_nlp:
            return True

        # Skip NLP for certain JSON types that are primarily data
        json_type = metadata.get("json_type", "")
        if json_type in ["list", "dict"] and metadata.get("structure_type") in [
            "primitive_collection",
            "configuration",
            "data_container",
        ]:
            return True

        # Skip NLP for highly structured data with minimal text
        if self._is_minimal_text_content(content):
            return True

        # Skip NLP for configuration-like structures
        if self._is_configuration_structure(metadata):
            return True

        return False
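
    # Examples of the skip rules above (metadata keys are the ones read in this method;
    # the concrete values are hypothetical):
    #
    #     _should_skip_nlp_for_json(very_large_content, {})                       -> True (size limit)
    #     _should_skip_nlp_for_json(content, {"json_type": "dict",
    #                                         "structure_type": "configuration"}) -> True
    #     _should_skip_nlp_for_json('{"summary": "A full prose paragraph about the API."}', {})
    #                                                                             -> typically False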

    def _is_minimal_text_content(self, content: str) -> bool:
        """Check if JSON content has minimal natural language text.

        Args:
            content: JSON content to analyze

        Returns:
            True if the content contains too little natural-language text to benefit from NLP
        """
        try:
            import json

            data = json.loads(content)
            # Count characters that belong to natural-language string values
            text_chars = 0

            def count_text_in_values(obj):
                nonlocal text_chars
                if isinstance(obj, str):
                    # Only count strings that look like natural language
                    if len(obj) > 10 and any(c.isalpha() for c in obj) and " " in obj:
                        text_chars += len(obj)
                elif isinstance(obj, dict):
                    for value in obj.values():
                        count_text_in_values(value)
                elif isinstance(obj, list):
                    for item in obj:
                        count_text_in_values(item)

            count_text_in_values(data)

            # If text content is less than 20% of total, consider it minimal
            total_content_chars = len(content)
            text_ratio = text_chars / max(total_content_chars, 1)

            return text_ratio < 0.2

        except json.JSONDecodeError:
            # If not valid JSON, don't skip NLP
            return False
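
    # Worked example of the 20% heuristic (hypothetical inputs):
    #
    #     '{"id": 123, "enabled": true, "note": "short"}'
    #         -> no string value is longer than 10 characters and contains spaces,
    #            so text_chars == 0 and text_ratio == 0.0 < 0.2  ->  True
    #
    #     '{"description": "This paragraph explains the feature in detail."}'
    #         -> the sentence counts as text, text_ratio is roughly 0.7  ->  False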

    def _is_configuration_structure(self, metadata: dict[str, Any]) -> bool:
        """Check if the structure represents configuration data.

        Args:
            metadata: Chunk metadata

        Returns:
            True if structure looks like configuration
        """
        structure_type = metadata.get("structure_type", "")
        if structure_type == "configuration":
            return True

        # Check for configuration patterns in metadata
        config_patterns = metadata.get("configuration_indicators", [])
        if len(config_patterns) >= 2:  # Multiple configuration indicators
            return True

        # Check for schema patterns that indicate configuration
        schema_patterns = metadata.get("schema_patterns", [])
        config_schema_patterns = [
            "configuration_object",
            "feature_flags",
            "typed_value",
        ]
        if any(pattern in config_schema_patterns for pattern in schema_patterns):
            return True

        return False
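
    # Metadata shapes this check treats as configuration (keys are the ones read above;
    # values are hypothetical):
    #
    #     {"structure_type": "configuration"}                           -> True
    #     {"configuration_indicators": ["env_keys", "boolean_flags"]}   -> True  (>= 2 indicators)
    #     {"schema_patterns": ["feature_flags"]}                        -> True
    #     {"structure_type": "data_container"}                          -> False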

    def _create_enhanced_metadata(
        self,
        original_doc: Document,
        chunk_metadata: dict[str, Any],
        chunk_index: int,
        total_chunks: int,
    ) -> dict[str, Any]:
        """Create enhanced metadata for JSON chunk documents.

        Args:
            original_doc: Original source document
            chunk_metadata: Chunk-specific metadata
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks

        Returns:
            Enhanced metadata dictionary
        """
        # Start with original document metadata
        enhanced_metadata = original_doc.metadata.copy()

        # Add chunking information
        enhanced_metadata.update(
            {
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "chunk_size": chunk_metadata.get(
                    "chunk_size", len(chunk_metadata.get("content", ""))
                ),
                "chunking_strategy": "json",
                "is_chunk": True,
                "parent_document_id": original_doc.id,
            }
        )

        # Add JSON-specific metadata
        enhanced_metadata.update(
            {
                "content_type": "json",
                "json_processing_mode": "modular_architecture",
                "supports_schema_inference": self.json_config.enable_schema_inference,
            }
        )

        # Merge chunk-specific metadata
        enhanced_metadata.update(chunk_metadata)

        # Add processing indicators
        enhanced_metadata.update(
            {
                "processed_with_json_components": True,
                "json_config_version": "modular_v1",
                "chunk_quality_indicators": self._calculate_chunk_quality_indicators(
                    chunk_metadata
                ),
            }
        )

        return enhanced_metadata
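
    # Merge order note: keys are layered as
    #     original_doc.metadata -> chunking info -> JSON-specific info
    #     -> chunk_metadata -> processing indicators,
    # so a key present in ``chunk_metadata`` (for example "content_type") overrides the
    # earlier layers, while the processing indicators added last always take precedence.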

    def _calculate_chunk_quality_indicators(
        self, chunk_metadata: dict[str, Any]
    ) -> dict[str, Any]:
        """Calculate quality indicators for JSON chunks.

        Args:
            chunk_metadata: Chunk metadata

        Returns:
            Dictionary of quality indicators
        """
        indicators = {
            "size_appropriate": True,
            "structure_preserved": True,
            "schema_coherent": True,
            "nlp_suitable": True,
        }

        # Size appropriateness
        chunk_size = chunk_metadata.get("chunk_size", 0)
        if chunk_size < 100:
            indicators["size_appropriate"] = False
        elif chunk_size > self.settings.global_config.chunking.chunk_size * 2:
            indicators["size_appropriate"] = False

        # Structure preservation
        element_type = chunk_metadata.get("element_type", "")
        if element_type in ["grouped_elements", "chunk"]:
            indicators["structure_preserved"] = False

        # Schema coherence
        if not chunk_metadata.get("is_valid_json", True):
            indicators["schema_coherent"] = False

        # NLP suitability
        if chunk_metadata.get("nlp_skipped", False):
            indicators["nlp_suitable"] = False

        # Overall quality score
        quality_score = sum(indicators.values()) / len(indicators)
        indicators["overall_quality_score"] = quality_score

        return indicators
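
    # Example indicator output for a well-formed element chunk whose NLP was skipped
    # (hypothetical numbers; the size limit comes from settings):
    #
    #     {
    #         "size_appropriate": True,
    #         "structure_preserved": True,
    #         "schema_coherent": True,
    #         "nlp_suitable": False,          # nlp_skipped was True
    #         "overall_quality_score": 0.75,  # 3 of 4 indicators are True
    #     }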