Coverage for src/qdrant_loader/core/chunking/strategy/json/json_chunk_processor.py: 100%

97 statements  

coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""JSON chunk processor for creating optimized chunk documents.""" 

2 

3from typing import Any 

4 

5import structlog 

6 

7from qdrant_loader.config import Settings 

8from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor 

9from qdrant_loader.core.chunking.strategy.json.json_document_parser import JSONElement 

10from qdrant_loader.core.document import Document 

11 

12logger = structlog.get_logger(__name__) 

13 

14 

15class JSONChunkProcessor(BaseChunkProcessor): 

16 """Chunk processor for JSON documents.""" 

17 

18 def __init__(self, settings: Settings): 

19 """Initialize JSON chunk processor. 

20 

21 Args: 

22 settings: Configuration settings 

23 """ 

24 super().__init__(settings) 

25 self.json_config = settings.global_config.chunking.strategies.json_strategy 

26 
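
Only two attributes of json_config are read later in this file: max_chunk_size_for_nlp and enable_schema_inference. For experimenting with the processor outside the full application, a minimal stand-in configuration might look like the sketch below; the class name and default values are illustrative assumptions, not part of qdrant_loader.

from dataclasses import dataclass

@dataclass
class FakeJSONStrategyConfig:
    # Only the attributes JSONChunkProcessor reads; the defaults are assumed values.
    max_chunk_size_for_nlp: int = 20_000   # chunks larger than this skip NLP
    enable_schema_inference: bool = True   # surfaced as "supports_schema_inference" metadata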

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        chunk_metadata: dict[str, Any],
        skip_nlp: bool = False,
    ) -> Document:
        """Create a chunk document with JSON-specific optimizations.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            chunk_metadata: Metadata specific to this chunk
            skip_nlp: Whether to skip expensive NLP processing

        Returns:
            Document representing the chunk
        """
        # Determine whether NLP should be skipped based on content characteristics
        # and record the decision in the chunk metadata
        skip_nlp = skip_nlp or self._should_skip_nlp_for_json(
            chunk_content, chunk_metadata
        )
        chunk_metadata.setdefault("nlp_skipped", skip_nlp)

        # Create base chunk document
        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=self._create_enhanced_metadata(
                original_doc, chunk_metadata, chunk_index, total_chunks
            ),
        )

        return chunk_doc
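
A hedged usage sketch of the method above, assuming a fully configured Settings instance and an already-loaded Document are available from elsewhere in the pipeline; the content and metadata values here are illustrative.

processor = JSONChunkProcessor(settings)  # settings: assumed pre-built Settings

chunk = processor.create_chunk_document(
    original_doc=doc,  # doc: assumed existing Document
    chunk_content='{"name": "example", "values": [1, 2, 3]}',
    chunk_index=0,
    total_chunks=1,
    chunk_metadata={"chunk_size": 41, "json_type": "dict"},
)
print(chunk.title)                           # "<original title>_chunk_1"
print(chunk.metadata["chunking_strategy"])   # "json"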

    def create_optimized_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        skip_nlp: bool = True,
    ) -> Document:
        """Create an optimized chunk document for large JSON elements.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            skip_nlp: Whether to skip NLP processing (default True for optimization)

        Returns:
            Optimized Document representing the chunk
        """
        # Create minimal metadata for large chunks
        minimal_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(chunk_content),
            "content_type": "json",
            "processing_mode": "optimized",
            "nlp_skipped": skip_nlp,
            "optimization_reason": "large_json_chunk",
        }

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, minimal_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc

    def create_json_element_chunk_document(
        self,
        original_doc: Document,
        element: JSONElement,
        chunk_index: int,
        total_chunks: int,
        element_metadata: dict[str, Any] | None = None,
    ) -> Document:
        """Create a chunk document from a JSON element.

        Args:
            original_doc: Original source document
            element: JSON element to create chunk from
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            element_metadata: Additional metadata for the element

        Returns:
            Document representing the chunk
        """
        # Determine if we should skip NLP
        skip_nlp = element.size > self.json_config.max_chunk_size_for_nlp

        # Combine element metadata with chunk metadata
        chunk_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(element.content),
            "content_type": "json",
            "element_type": element.element_type.value,
            "element_name": element.name,
            "json_path": element.path,
            "nesting_level": element.level,
            "item_count": element.item_count,
            "nlp_skipped": skip_nlp,
        }

        if element_metadata:
            chunk_metadata.update(element_metadata)

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, chunk_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=element.content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc
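
For a hypothetical element at JSON path "$.servers[0]", the chunk_metadata assembled above would look roughly like this; all values are illustrative, and the element_type string depends on the element-type enum defined in json_document_parser.

example_chunk_metadata = {
    "chunk_index": 0,
    "total_chunks": 3,
    "chunk_size": 512,
    "content_type": "json",
    "element_type": "object",      # element.element_type.value (assumed enum value)
    "element_name": "servers[0]",
    "json_path": "$.servers[0]",
    "nesting_level": 2,
    "item_count": 4,
    "nlp_skipped": False,          # element.size <= max_chunk_size_for_nlp
}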

    def _should_skip_nlp_for_json(self, content: str, metadata: dict[str, Any]) -> bool:
        """Determine if NLP processing should be skipped for JSON content.

        Args:
            content: JSON content to analyze
            metadata: Chunk metadata

        Returns:
            True if NLP should be skipped
        """
        # Skip NLP for large chunks
        if len(content) > self.json_config.max_chunk_size_for_nlp:
            return True

        # Skip NLP for certain JSON types that are primarily data
        json_type = metadata.get("json_type", "")
        if json_type in ["list", "dict"] and metadata.get("structure_type") in [
            "primitive_collection",
            "configuration",
            "data_container",
        ]:
            return True

        # Skip NLP for highly structured data with minimal text
        if self._is_minimal_text_content(content):
            return True

        # Skip NLP for configuration-like structures
        if self._is_configuration_structure(metadata):
            return True

        return False
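
Two metadata shapes that would satisfy the json_type/structure_type check above, assuming the upstream parser emits these keys; the values are illustrative.

skip_nlp_examples = [
    {"json_type": "list", "structure_type": "primitive_collection"},  # e.g. a flat array of IDs
    {"json_type": "dict", "structure_type": "configuration"},         # e.g. a settings object
]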

    def _is_minimal_text_content(self, content: str) -> bool:
        """Check if JSON content has minimal natural language text.

        Args:
            content: JSON content to analyze

        Returns:
            True if content has minimal text suitable for NLP
        """
        try:
            import json

            data = json.loads(content)

            # Count characters in values that look like natural-language text
            text_chars = 0

            def count_text_in_values(obj):
                nonlocal text_chars
                if isinstance(obj, str):
                    # Only count strings that look like natural language
                    if len(obj) > 10 and any(c.isalpha() for c in obj) and " " in obj:
                        text_chars += len(obj)
                elif isinstance(obj, dict):
                    for value in obj.values():
                        count_text_in_values(value)
                elif isinstance(obj, list):
                    for item in obj:
                        count_text_in_values(item)

            count_text_in_values(data)

            # If text content is less than 20% of total, consider it minimal
            total_content_chars = len(content)
            text_ratio = text_chars / max(total_content_chars, 1)

            return text_ratio < 0.2

        except json.JSONDecodeError:
            # If not valid JSON, don't skip NLP
            return False
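
The same text-ratio heuristic, extracted into a standalone sketch so the 20% threshold can be inspected in isolation; the sample inputs are made up.

import json

def natural_text_chars(obj) -> int:
    # Count characters in string values that look like natural language:
    # longer than 10 characters, contains letters and at least one space.
    if isinstance(obj, str):
        looks_like_text = len(obj) > 10 and any(c.isalpha() for c in obj) and " " in obj
        return len(obj) if looks_like_text else 0
    if isinstance(obj, dict):
        return sum(natural_text_chars(v) for v in obj.values())
    if isinstance(obj, list):
        return sum(natural_text_chars(item) for item in obj)
    return 0

data_heavy = '{"ids": [1, 2, 3, 4], "flags": {"a": true, "b": false}}'
prose_heavy = '{"summary": "This release focuses on performance improvements and bug fixes."}'

for sample in (data_heavy, prose_heavy):
    ratio = natural_text_chars(json.loads(sample)) / max(len(sample), 1)
    print(f"text ratio {ratio:.2f} -> minimal text: {ratio < 0.2}")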

    def _is_configuration_structure(self, metadata: dict[str, Any]) -> bool:
        """Check if the structure represents configuration data.

        Args:
            metadata: Chunk metadata

        Returns:
            True if structure looks like configuration
        """
        structure_type = metadata.get("structure_type", "")
        if structure_type == "configuration":
            return True

        # Check for configuration patterns in metadata
        config_patterns = metadata.get("configuration_indicators", [])
        if len(config_patterns) >= 2:  # Multiple configuration indicators
            return True

        # Check for schema patterns that indicate configuration
        schema_patterns = metadata.get("schema_patterns", [])
        config_schema_patterns = [
            "configuration_object",
            "feature_flags",
            "typed_value",
        ]
        if any(pattern in config_schema_patterns for pattern in schema_patterns):
            return True

        return False
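
Example metadata dicts that the check above would classify as configuration; the key names come from this method, while the indicator values themselves are illustrative.

config_like_examples = [
    {"structure_type": "configuration"},                                # explicit structure type
    {"configuration_indicators": ["default_values", "env_overrides"]},  # two or more indicators
    {"schema_patterns": ["feature_flags"]},                             # known configuration pattern
]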

    def _create_enhanced_metadata(
        self,
        original_doc: Document,
        chunk_metadata: dict[str, Any],
        chunk_index: int,
        total_chunks: int,
    ) -> dict[str, Any]:
        """Create enhanced metadata for JSON chunk documents.

        Args:
            original_doc: Original source document
            chunk_metadata: Chunk-specific metadata
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks

        Returns:
            Enhanced metadata dictionary
        """
        # Start with original document metadata
        enhanced_metadata = original_doc.metadata.copy()

        # Add chunking information
        enhanced_metadata.update(
            {
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "chunk_size": chunk_metadata.get(
                    "chunk_size", len(chunk_metadata.get("content", ""))
                ),
                "chunking_strategy": "json",
                "is_chunk": True,
                "parent_document_id": original_doc.id,
            }
        )

        # Add JSON-specific metadata
        enhanced_metadata.update(
            {
                "content_type": "json",
                "json_processing_mode": "modular_architecture",
                "supports_schema_inference": self.json_config.enable_schema_inference,
            }
        )

        # Merge chunk-specific metadata
        enhanced_metadata.update(chunk_metadata)

        # Add processing indicators
        enhanced_metadata.update(
            {
                "processed_with_json_components": True,
                "json_config_version": "modular_v1",
                "chunk_quality_indicators": self._calculate_chunk_quality_indicators(
                    chunk_metadata
                ),
            }
        )

        return enhanced_metadata

    def _calculate_chunk_quality_indicators(
        self, chunk_metadata: dict[str, Any]
    ) -> dict[str, Any]:
        """Calculate quality indicators for JSON chunks.

        Args:
            chunk_metadata: Chunk metadata

        Returns:
            Dictionary of quality indicators
        """
        indicators = {
            "size_appropriate": True,
            "structure_preserved": True,
            "schema_coherent": True,
            "nlp_suitable": True,
        }

        # Size appropriateness
        chunk_size = chunk_metadata.get("chunk_size", 0)
        if chunk_size < 100:
            indicators["size_appropriate"] = False
        elif chunk_size > self.settings.global_config.chunking.chunk_size * 2:
            indicators["size_appropriate"] = False

        # Structure preservation
        element_type = chunk_metadata.get("element_type", "")
        if element_type in ["grouped_elements", "chunk"]:
            indicators["structure_preserved"] = False

        # Schema coherence
        if not chunk_metadata.get("is_valid_json", True):
            indicators["schema_coherent"] = False

        # NLP suitability
        if chunk_metadata.get("nlp_skipped", False):
            indicators["nlp_suitable"] = False

        # Overall quality score
        quality_score = sum(indicators.values()) / len(indicators)
        indicators["overall_quality_score"] = quality_score

        return indicators
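
A standalone illustration of the scoring above: the four boolean indicators are averaged into a score between 0.0 and 1.0 (the example values are arbitrary).

indicators = {
    "size_appropriate": True,
    "structure_preserved": True,
    "schema_coherent": False,   # e.g. chunk flagged as invalid JSON
    "nlp_suitable": True,
}
score = sum(indicators.values()) / len(indicators)
print(score)  # 0.75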