Coverage for src/qdrant_loader/core/chunking/strategy/json/json_chunk_processor.py: 100%

97 statements  

coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""JSON chunk processor for creating optimized chunk documents.""" 

2 

3from typing import Any 

4 

5import structlog 

6 

7from qdrant_loader.config import Settings 

8from qdrant_loader.core.chunking.strategy.base.chunk_processor import BaseChunkProcessor 

9from qdrant_loader.core.chunking.strategy.json.json_document_parser import JSONElement 

10from qdrant_loader.core.document import Document 

11 

12logger = structlog.get_logger(__name__) 

13 

14 

15class JSONChunkProcessor(BaseChunkProcessor): 

16 """Chunk processor for JSON documents.""" 

17 

18 def __init__(self, settings: Settings): 

19 """Initialize JSON chunk processor. 

20 

21 Args: 

22 settings: Configuration settings 

23 """ 

24 super().__init__(settings) 

25 self.json_config = settings.global_config.chunking.strategies.json_strategy 

26 
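
Only two attributes of json_config are read later in this file: max_chunk_size_for_nlp and enable_schema_inference. For experimenting with the processor outside the full application, a minimal stand-in configuration might look like the sketch below; the class name and default values are illustrative assumptions, not part of qdrant_loader.

from dataclasses import dataclass

@dataclass
class FakeJSONStrategyConfig:
    # Only the attributes JSONChunkProcessor reads; the defaults are assumed values.
    max_chunk_size_for_nlp: int = 20_000   # chunks larger than this skip NLP
    enable_schema_inference: bool = True   # surfaced as "supports_schema_inference" metadata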

    def create_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        chunk_metadata: dict[str, Any],
        skip_nlp: bool = False,
    ) -> Document:
        """Create a chunk document with JSON-specific optimizations.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            chunk_metadata: Metadata specific to this chunk
            skip_nlp: Whether to skip expensive NLP processing

        Returns:
            Document representing the chunk
        """
        # Determine whether NLP should be skipped based on content characteristics
        # and record the decision in the chunk metadata
        skip_nlp = skip_nlp or self._should_skip_nlp_for_json(
            chunk_content, chunk_metadata
        )
        chunk_metadata.setdefault("nlp_skipped", skip_nlp)

        # Create base chunk document
        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=self._create_enhanced_metadata(
                original_doc, chunk_metadata, chunk_index, total_chunks
            ),
        )

        return chunk_doc
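
A hedged usage sketch of the method above, assuming a fully configured Settings instance and an already-loaded Document are available from elsewhere in the pipeline; the content and metadata values here are illustrative.

processor = JSONChunkProcessor(settings)  # settings: assumed pre-built Settings

chunk = processor.create_chunk_document(
    original_doc=doc,  # doc: assumed existing Document
    chunk_content='{"name": "example", "values": [1, 2, 3]}',
    chunk_index=0,
    total_chunks=1,
    chunk_metadata={"chunk_size": 41, "json_type": "dict"},
)
print(chunk.title)                           # "<original title>_chunk_1"
print(chunk.metadata["chunking_strategy"])   # "json"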

    def create_optimized_chunk_document(
        self,
        original_doc: Document,
        chunk_content: str,
        chunk_index: int,
        total_chunks: int,
        skip_nlp: bool = True,
    ) -> Document:
        """Create an optimized chunk document for large JSON elements.

        Args:
            original_doc: Original source document
            chunk_content: Content for this chunk
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            skip_nlp: Whether to skip NLP processing (default True for optimization)

        Returns:
            Optimized Document representing the chunk
        """
        # Create minimal metadata for large chunks
        minimal_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(chunk_content),
            "content_type": "json",
            "processing_mode": "optimized",
            "nlp_skipped": skip_nlp,
            "optimization_reason": "large_json_chunk",
        }

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, minimal_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=chunk_content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc

    def create_json_element_chunk_document(
        self,
        original_doc: Document,
        element: JSONElement,
        chunk_index: int,
        total_chunks: int,
        element_metadata: dict[str, Any] | None = None,
    ) -> Document:
        """Create a chunk document from a JSON element.

        Args:
            original_doc: Original source document
            element: JSON element to create chunk from
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks
            element_metadata: Additional metadata for the element

        Returns:
            Document representing the chunk
        """
        # Determine if we should skip NLP
        skip_nlp = element.size > self.json_config.max_chunk_size_for_nlp

        # Combine element metadata with chunk metadata
        chunk_metadata = {
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "chunk_size": len(element.content),
            "content_type": "json",
            "element_type": element.element_type.value,
            "element_name": element.name,
            "json_path": element.path,
            "nesting_level": element.level,
            "item_count": element.item_count,
            "nlp_skipped": skip_nlp,
        }

        if element_metadata:
            chunk_metadata.update(element_metadata)

        enhanced_metadata = self._create_enhanced_metadata(
            original_doc, chunk_metadata, chunk_index, total_chunks
        )

        chunk_doc = Document(
            content=element.content,
            source=original_doc.source,
            source_type=original_doc.source_type,
            title=f"{original_doc.title}_chunk_{chunk_index + 1}",
            url=original_doc.url,
            content_type=original_doc.content_type,
            metadata=enhanced_metadata,
        )

        return chunk_doc
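
For a hypothetical element at JSON path "$.servers[0]", the chunk_metadata assembled above would look roughly like this; all values are illustrative, and the element_type string depends on the element-type enum defined in json_document_parser.

example_chunk_metadata = {
    "chunk_index": 0,
    "total_chunks": 3,
    "chunk_size": 512,
    "content_type": "json",
    "element_type": "object",      # element.element_type.value (assumed enum value)
    "element_name": "servers[0]",
    "json_path": "$.servers[0]",
    "nesting_level": 2,
    "item_count": 4,
    "nlp_skipped": False,          # element.size <= max_chunk_size_for_nlp
}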

    def _should_skip_nlp_for_json(self, content: str, metadata: dict[str, Any]) -> bool:
        """Determine if NLP processing should be skipped for JSON content.

        Args:
            content: JSON content to analyze
            metadata: Chunk metadata

        Returns:
            True if NLP should be skipped
        """
        # Skip NLP for large chunks
        if len(content) > self.json_config.max_chunk_size_for_nlp:
            return True

        # Skip NLP for certain JSON types that are primarily data
        json_type = metadata.get("json_type", "")
        if json_type in ["list", "dict"] and metadata.get("structure_type") in [
            "primitive_collection",
            "configuration",
            "data_container",
        ]:
            return True

        # Skip NLP for highly structured data with minimal text
        if self._is_minimal_text_content(content):
            return True

        # Skip NLP for configuration-like structures
        if self._is_configuration_structure(metadata):
            return True

        return False
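
Two metadata shapes that would satisfy the json_type/structure_type check above, assuming the upstream parser emits these keys; the values are illustrative.

skip_nlp_examples = [
    {"json_type": "list", "structure_type": "primitive_collection"},  # e.g. a flat array of IDs
    {"json_type": "dict", "structure_type": "configuration"},         # e.g. a settings object
]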

    def _is_minimal_text_content(self, content: str) -> bool:
        """Check if JSON content has minimal natural language text.

        Args:
            content: JSON content to analyze

        Returns:
            True if content has minimal text suitable for NLP
        """
        try:
            import json

            data = json.loads(content)

            # Count characters in values that look like natural-language text
            text_chars = 0

            def count_text_in_values(obj):
                nonlocal text_chars
                if isinstance(obj, str):
                    # Only count strings that look like natural language
                    if len(obj) > 10 and any(c.isalpha() for c in obj) and " " in obj:
                        text_chars += len(obj)
                elif isinstance(obj, dict):
                    for value in obj.values():
                        count_text_in_values(value)
                elif isinstance(obj, list):
                    for item in obj:
                        count_text_in_values(item)

            count_text_in_values(data)

            # If text content is less than 20% of total, consider it minimal
            total_content_chars = len(content)
            text_ratio = text_chars / max(total_content_chars, 1)

            return text_ratio < 0.2

        except json.JSONDecodeError:
            # If not valid JSON, don't skip NLP
            return False
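
The same text-ratio heuristic, extracted into a standalone sketch so the 20% threshold can be inspected in isolation; the sample inputs are made up.

import json

def natural_text_chars(obj) -> int:
    # Count characters in string values that look like natural language:
    # longer than 10 characters, contains letters and at least one space.
    if isinstance(obj, str):
        looks_like_text = len(obj) > 10 and any(c.isalpha() for c in obj) and " " in obj
        return len(obj) if looks_like_text else 0
    if isinstance(obj, dict):
        return sum(natural_text_chars(v) for v in obj.values())
    if isinstance(obj, list):
        return sum(natural_text_chars(item) for item in obj)
    return 0

data_heavy = '{"ids": [1, 2, 3, 4], "flags": {"a": true, "b": false}}'
prose_heavy = '{"summary": "This release focuses on performance improvements and bug fixes."}'

for sample in (data_heavy, prose_heavy):
    ratio = natural_text_chars(json.loads(sample)) / max(len(sample), 1)
    print(f"text ratio {ratio:.2f} -> minimal text: {ratio < 0.2}")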

    def _is_configuration_structure(self, metadata: dict[str, Any]) -> bool:
        """Check if the structure represents configuration data.

        Args:
            metadata: Chunk metadata

        Returns:
            True if structure looks like configuration
        """
        structure_type = metadata.get("structure_type", "")
        if structure_type == "configuration":
            return True

        # Check for configuration patterns in metadata
        config_patterns = metadata.get("configuration_indicators", [])
        if len(config_patterns) >= 2:  # Multiple configuration indicators
            return True

        # Check for schema patterns that indicate configuration
        schema_patterns = metadata.get("schema_patterns", [])
        config_schema_patterns = [
            "configuration_object",
            "feature_flags",
            "typed_value",
        ]
        if any(pattern in config_schema_patterns for pattern in schema_patterns):
            return True

        return False
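
Example metadata dicts that the check above would classify as configuration; the key names come from this method, while the indicator values themselves are illustrative.

config_like_examples = [
    {"structure_type": "configuration"},                                # explicit structure type
    {"configuration_indicators": ["default_values", "env_overrides"]},  # two or more indicators
    {"schema_patterns": ["feature_flags"]},                             # known configuration pattern
]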

    def _create_enhanced_metadata(
        self,
        original_doc: Document,
        chunk_metadata: dict[str, Any],
        chunk_index: int,
        total_chunks: int,
    ) -> dict[str, Any]:
        """Create enhanced metadata for JSON chunk documents.

        Args:
            original_doc: Original source document
            chunk_metadata: Chunk-specific metadata
            chunk_index: Index of this chunk
            total_chunks: Total number of chunks

        Returns:
            Enhanced metadata dictionary
        """
        # Start with original document metadata
        enhanced_metadata = original_doc.metadata.copy()

        # Add chunking information
        enhanced_metadata.update(
            {
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "chunk_size": chunk_metadata.get(
                    "chunk_size", len(chunk_metadata.get("content", ""))
                ),
                "chunking_strategy": "json",
                "is_chunk": True,
                "parent_document_id": original_doc.id,
            }
        )

        # Add JSON-specific metadata
        enhanced_metadata.update(
            {
                "content_type": "json",
                "json_processing_mode": "modular_architecture",
                "supports_schema_inference": self.json_config.enable_schema_inference,
            }
        )

        # Merge chunk-specific metadata
        enhanced_metadata.update(chunk_metadata)

        # Add processing indicators
        enhanced_metadata.update(
            {
                "processed_with_json_components": True,
                "json_config_version": "modular_v1",
                "chunk_quality_indicators": self._calculate_chunk_quality_indicators(
                    chunk_metadata
                ),
            }
        )

        return enhanced_metadata

    def _calculate_chunk_quality_indicators(
        self, chunk_metadata: dict[str, Any]
    ) -> dict[str, Any]:
        """Calculate quality indicators for JSON chunks.

        Args:
            chunk_metadata: Chunk metadata

        Returns:
            Dictionary of quality indicators
        """
        indicators = {
            "size_appropriate": True,
            "structure_preserved": True,
            "schema_coherent": True,
            "nlp_suitable": True,
        }

        # Size appropriateness
        chunk_size = chunk_metadata.get("chunk_size", 0)
        if chunk_size < 100:
            indicators["size_appropriate"] = False
        elif chunk_size > self.settings.global_config.chunking.chunk_size * 2:
            indicators["size_appropriate"] = False

        # Structure preservation
        element_type = chunk_metadata.get("element_type", "")
        if element_type in ["grouped_elements", "chunk"]:
            indicators["structure_preserved"] = False

        # Schema coherence
        if not chunk_metadata.get("is_valid_json", True):
            indicators["schema_coherent"] = False

        # NLP suitability
        if chunk_metadata.get("nlp_skipped", False):
            indicators["nlp_suitable"] = False

        # Overall quality score
        quality_score = sum(indicators.values()) / len(indicators)
        indicators["overall_quality_score"] = quality_score

        return indicators
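
A standalone illustration of the scoring above: the four boolean indicators are averaged into a score between 0.0 and 1.0 (the example values are arbitrary).

indicators = {
    "size_appropriate": True,
    "structure_preserved": True,
    "schema_coherent": False,   # e.g. chunk flagged as invalid JSON
    "nlp_suitable": True,
}
score = sum(indicators.values()) / len(indicators)
print(score)  # 0.75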