Coverage for src/qdrant_loader_mcp_server/search/hybrid_search.py: 72%

716 statements  

coverage.py v7.10.0, created at 2025-07-25 11:38 +0000

1"""Hybrid search implementation combining vector and keyword search.""" 

2 

3import re 

4from dataclasses import dataclass 

5from datetime import datetime 

6from typing import Any 

7 

8import numpy as np 

9from openai import AsyncOpenAI 

10from qdrant_client import QdrantClient 

11from qdrant_client.http import models 

12from rank_bm25 import BM25Okapi 

13 

14from ..utils.logging import LoggingConfig 

15from .models import SearchResult 

16from .nlp.spacy_analyzer import SpaCyQueryAnalyzer 

17# 🔥 NEW: Phase 2.2 Intent-Aware Adaptive Search 

18from .enhanced.intent_classifier import IntentClassifier, AdaptiveSearchStrategy, SearchIntent 

19from .enhanced.knowledge_graph import DocumentKnowledgeGraph 

20# 🔥 NEW: Phase 1.2 Topic-Driven Search Chaining  

21from .enhanced.topic_search_chain import ( 

22 TopicSearchChainGenerator, 

23 TopicSearchChain, 

24 ChainStrategy, 

25 TopicChainLink 

26) 

27# 🔥 NEW: Phase 1.3 Dynamic Faceted Search Interface 

28from .enhanced.faceted_search import ( 

29 FacetType, 

30 FacetFilter, 

31 FacetedSearchResults, 

32 DynamicFacetGenerator, 

33 FacetedSearchEngine 

34) 

35# 🔥 NEW: Phase 2.3 Cross-Document Intelligence 

36from .enhanced.cross_document_intelligence import ( 

37 CrossDocumentIntelligenceEngine, 

38 SimilarityMetric, 

39 ClusteringStrategy, 

40 DocumentSimilarityCalculator 

41) 

42 

43logger = LoggingConfig.get_logger(__name__) 

44 

45 

46@dataclass 

47class HybridSearchResult: 

48 """Container for hybrid search results with comprehensive metadata.""" 

49 

50 score: float 

51 text: str 

52 source_type: str 

53 source_title: str 

54 source_url: str | None = None 

55 file_path: str | None = None 

56 repo_name: str | None = None 

57 vector_score: float = 0.0 

58 keyword_score: float = 0.0 

59 

60 # Project information (for multi-project support) 

61 project_id: str | None = None 

62 project_name: str | None = None 

63 project_description: str | None = None 

64 collection_name: str | None = None 

65 

66 # Hierarchy information (primarily for Confluence) 

67 parent_id: str | None = None 

68 parent_title: str | None = None 

69 breadcrumb_text: str | None = None 

70 depth: int | None = None 

71 children_count: int | None = None 

72 hierarchy_context: str | None = None 

73 

74 # Attachment information (for files attached to documents) 

75 is_attachment: bool = False 

76 parent_document_id: str | None = None 

77 parent_document_title: str | None = None 

78 attachment_id: str | None = None 

79 original_filename: str | None = None 

80 file_size: int | None = None 

81 mime_type: str | None = None 

82 attachment_author: str | None = None 

83 attachment_context: str | None = None 

84 

85 # 🔥 NEW: Section-level intelligence 

86 section_title: str | None = None 

87 section_type: str | None = None # e.g., "h1", "h2", "content" 

88 section_level: int | None = None 

89 section_anchor: str | None = None 

90 section_breadcrumb: str | None = None 

91 section_depth: int | None = None 

92 

93 # 🔥 NEW: Content analysis 

94 has_code_blocks: bool = False 

95 has_tables: bool = False 

96 has_images: bool = False 

97 has_links: bool = False 

98 word_count: int | None = None 

99 char_count: int | None = None 

100 estimated_read_time: int | None = None # minutes 

101 paragraph_count: int | None = None 

102 

103 # 🔥 NEW: Semantic analysis (NLP results) 

104 entities: list[dict | str] | None = None 

105 topics: list[dict | str] | None = None 

106 key_phrases: list[dict | str] | None = None 

107 pos_tags: list[dict] | None = None 

108 

109 # 🔥 NEW: Navigation context 

110 previous_section: str | None = None 

111 next_section: str | None = None 

112 sibling_sections: list[str] | None = None 

113 subsections: list[str] | None = None 

114 document_hierarchy: list[str] | None = None 

115 

116 # 🔥 NEW: Chunking context 

117 chunk_index: int | None = None 

118 total_chunks: int | None = None 

119 chunking_strategy: str | None = None 

120 

121 # 🔥 NEW: File conversion intelligence 

122 original_file_type: str | None = None 

123 conversion_method: str | None = None 

124 is_excel_sheet: bool = False 

125 is_converted: bool = False 

126 

127 # 🔥 NEW: Cross-references and enhanced context 

128 cross_references: list[dict] | None = None 

129 topic_analysis: dict | None = None 

130 content_type_context: str | None = None # Human-readable content description 

131 

132 def __post_init__(self): 

133 """Initialize default values for list fields.""" 

134 if self.entities is None: 

135 self.entities = [] 

136 if self.topics is None: 

137 self.topics = [] 

138 if self.key_phrases is None: 

139 self.key_phrases = [] 

140 if self.pos_tags is None: 

141 self.pos_tags = [] 

142 if self.sibling_sections is None: 

143 self.sibling_sections = [] 

144 if self.subsections is None: 

145 self.subsections = [] 

146 if self.document_hierarchy is None: 

147 self.document_hierarchy = [] 

148 if self.cross_references is None: 

149 self.cross_references = [] 

150 

151 

152class HybridSearchEngine: 

153 """Service for hybrid search combining vector and keyword search.""" 

154 

155 def __init__( 

156 self, 

157 qdrant_client: QdrantClient, 

158 openai_client: AsyncOpenAI, 

159 collection_name: str, 

160 vector_weight: float = 0.6, 

161 keyword_weight: float = 0.3, 

162 metadata_weight: float = 0.1, 

163 min_score: float = 0.3, 

164 dense_vector_name: str = "dense", 

165 sparse_vector_name: str = "sparse", 

166 alpha: float = 0.5, 

167 # 🔥 NEW: Phase 2.2 parameters 

168 knowledge_graph: DocumentKnowledgeGraph = None, 

169 enable_intent_adaptation: bool = True, 

170 ): 

171 """Initialize the hybrid search service. 

172 

173 Args: 

174 qdrant_client: Qdrant client instance 

175 openai_client: OpenAI client instance 

176 collection_name: Name of the Qdrant collection 

177 vector_weight: Weight for vector search scores (0-1) 

178 keyword_weight: Weight for keyword search scores (0-1) 

179 metadata_weight: Weight for metadata-based scoring (0-1) 

180 min_score: Minimum combined score threshold 

181 dense_vector_name: Name of the dense vector field 

182 sparse_vector_name: Name of the sparse vector field 

183 alpha: Weight for dense search (1-alpha for sparse search) 

184 knowledge_graph: Optional knowledge graph for Phase 2.1 integration 

185 enable_intent_adaptation: Enable Phase 2.2 intent-aware adaptive search 

186 

187 """ 

188 self.qdrant_client = qdrant_client 

189 self.openai_client = openai_client 

190 self.collection_name = collection_name 

191 self.vector_weight = vector_weight 

192 self.keyword_weight = keyword_weight 

193 self.metadata_weight = metadata_weight 

194 self.min_score = min_score 

195 self.dense_vector_name = dense_vector_name 

196 self.sparse_vector_name = sparse_vector_name 

197 self.alpha = alpha 

198 self.logger = LoggingConfig.get_logger(__name__) 

199 

200 # 🔥 NEW: Initialize spaCy query analyzer for intelligent query processing 

201 self.spacy_analyzer = SpaCyQueryAnalyzer(spacy_model="en_core_web_md") 

202 

203 # 🔥 NEW: Phase 2.2 Intent-Aware Adaptive Search 

204 self.enable_intent_adaptation = enable_intent_adaptation 

205 self.knowledge_graph = knowledge_graph 

206 

207 if self.enable_intent_adaptation: 

208 self.intent_classifier = IntentClassifier(self.spacy_analyzer) 

209 self.adaptive_strategy = AdaptiveSearchStrategy(self.knowledge_graph) 

210 logger.info("🔥 Phase 2.2: Intent-aware adaptive search ENABLED") 

211 else: 

212 self.intent_classifier = None 

213 self.adaptive_strategy = None 

214 logger.info("Intent-aware adaptive search DISABLED") 

215 

216 # 🔥 NEW: Phase 1.2 Topic-Driven Search Chaining 

217 self.topic_chain_generator = TopicSearchChainGenerator( 

218 self.spacy_analyzer, 

219 self.knowledge_graph 

220 ) 

221 self._topic_chains_initialized = False 

222 logger.info("🔥 Phase 1.2: Topic-driven search chaining ENABLED") 

223 

224 # 🔥 NEW: Phase 1.3 Dynamic Faceted Search Interface 

225 self.faceted_search_engine = FacetedSearchEngine() 

226 logger.info("🔥 Phase 1.3: Dynamic faceted search interface ENABLED") 

227 

228 # Cross-Document Intelligence (always enabled) 

229 self.cross_document_engine = CrossDocumentIntelligenceEngine( 

230 self.spacy_analyzer, 

231 self.knowledge_graph 

232 ) 

233 logger.info("Cross-document intelligence ENABLED") 

234 

235 # Enhanced query expansions leveraging spaCy semantic understanding 

236 self.query_expansions = { 

237 "product requirements": [ 

238 "PRD", 

239 "requirements document", 

240 "product specification", 

241 ], 

242 "requirements": ["specs", "requirements document", "features"], 

243 "architecture": ["system design", "technical architecture"], 

244 "UI": ["user interface", "frontend", "design"], 

245 "API": ["interface", "endpoints", "REST"], 

246 "database": ["DB", "data storage", "persistence"], 

247 "security": ["auth", "authentication", "authorization"], 

248 # 🔥 NEW: Content-type aware expansions 

249 "code": ["implementation", "function", "method", "class"], 

250 "documentation": ["docs", "guide", "manual", "instructions"], 

251 "config": ["configuration", "settings", "setup"], 

252 "table": ["data", "spreadsheet", "excel", "csv"], 

253 "image": ["screenshot", "diagram", "chart", "visual"], 

254 "link": ["reference", "url", "external", "connection"], 

255 } 

256 

257 async def _expand_query(self, query: str) -> str: 

258 """🔥 ENHANCED: Expand query with spaCy semantic understanding and related terms.""" 

259 # Use spaCy analyzer for intelligent query expansion 

260 try: 

261 query_analysis = self.spacy_analyzer.analyze_query_semantic(query) 

262 

263 # Start with original query 

264 expanded_query = query 

265 

266 # Add semantic keywords for broader matching 

267 if query_analysis.semantic_keywords: 

268 # Add top semantic keywords 

269 semantic_terms = " ".join(query_analysis.semantic_keywords[:3]) 

270 expanded_query = f"{query} {semantic_terms}" 

271 

272 # Add main concepts for concept-based expansion 

273 if query_analysis.main_concepts: 

274 concept_terms = " ".join(query_analysis.main_concepts[:2]) 

275 expanded_query = f"{expanded_query} {concept_terms}" 

276 

277 # Legacy expansion logic as fallback 

278 lower_query = query.lower() 

279 for key, expansions in self.query_expansions.items(): 

280 if key.lower() in lower_query: 

281 expansion_terms = " ".join(expansions[:2]) # Limit to avoid over-expansion 

282 expanded_query = f"{expanded_query} {expansion_terms}" 

283 break 

284 

285 if expanded_query != query: 

286 self.logger.debug( 

287 "🔥 spaCy-enhanced query expansion", 

288 original_query=query, 

289 expanded_query=expanded_query, 

290 semantic_keywords=query_analysis.semantic_keywords[:3], 

291 main_concepts=query_analysis.main_concepts[:2], 

292 ) 

293 

294 return expanded_query 

295 

296 except Exception as e: 

297 self.logger.warning(f"spaCy expansion failed, using fallback: {e}") 

298 # Fallback to original expansion logic 

299 expanded_query = query 

300 lower_query = query.lower() 

301 

302 for key, expansions in self.query_expansions.items(): 

303 if key.lower() in lower_query: 

304 expansion_terms = " ".join(expansions) 

305 expanded_query = f"{query} {expansion_terms}" 

306 self.logger.debug( 

307 "Expanded query (fallback)", 

308 original_query=query, 

309 expanded_query=expanded_query, 

310 ) 

311 break 

312 

313 return expanded_query 

314 

315 async def _expand_query_aggressive(self, query: str) -> str: 

316 """🔥 NEW: More aggressive query expansion for exploratory searches.""" 

317 try: 

318 query_analysis = self.spacy_analyzer.analyze_query_semantic(query) 

319 

320 # Start with original query 

321 expanded_query = query 

322 

323 # Add more semantic keywords (increased from 3 to 5) 

324 if query_analysis.semantic_keywords: 

325 semantic_terms = " ".join(query_analysis.semantic_keywords[:5]) 

326 expanded_query = f"{query} {semantic_terms}" 

327 

328 # Add more main concepts (increased from 2 to 4) 

329 if query_analysis.main_concepts: 

330 concept_terms = " ".join(query_analysis.main_concepts[:4]) 

331 expanded_query = f"{expanded_query} {concept_terms}" 

332 

333 # Add entity-based expansion 

334 if query_analysis.entities: 

335 entity_terms = " ".join([ent[0] for ent in query_analysis.entities[:3]]) 

336 expanded_query = f"{expanded_query} {entity_terms}" 

337 

338 # Apply multiple legacy expansions for exploration 

339 lower_query = query.lower() 

340 expansion_count = 0 

341 for key, expansions in self.query_expansions.items(): 

342 if key.lower() in lower_query and expansion_count < 3: # Max 3 expansions 

343 expansion_terms = " ".join(expansions[:3]) 

344 expanded_query = f"{expanded_query} {expansion_terms}" 

345 expansion_count += 1 

346 

347 self.logger.debug( 

348 "🔥 Aggressive query expansion for exploration", 

349 original_query=query, 

350 expanded_query=expanded_query, 

351 expansion_ratio=len(expanded_query.split()) / len(query.split()), 

352 ) 

353 

354 return expanded_query 

355 

356 except Exception as e: 

357 self.logger.warning(f"Aggressive expansion failed, using standard: {e}") 

358 return await self._expand_query(query) 

359 

360 async def _get_embedding(self, text: str) -> list[float]: 

361 """Get embedding for text using OpenAI.""" 

362 try: 

363 response = await self.openai_client.embeddings.create( 

364 model="text-embedding-3-small", 

365 input=text, 

366 ) 

367 return response.data[0].embedding 

368 except Exception as e: 

369 self.logger.error("Failed to get embedding", error=str(e)) 

370 raise 

371 

372 async def search( 

373 self, 

374 query: str, 

375 limit: int = 5, 

376 source_types: list[str] | None = None, 

377 project_ids: list[str] | None = None, 

378 # 🔥 NEW: Phase 2.2 parameters 

379 session_context: dict[str, Any] | None = None, 

380 behavioral_context: list[str] | None = None, 

381 ) -> list[SearchResult]: 

382 """Perform hybrid search combining vector and keyword search. 

383 

384 Args: 

385 query: Search query text 

386 limit: Maximum number of results to return 

387 source_types: Optional list of source types to filter by 

388 project_ids: Optional list of project IDs to filter by 

389 session_context: Optional session context for intent classification 

390 behavioral_context: Optional behavioral context (previous intents) 

391 """ 

392 self.logger.debug( 

393 "Starting hybrid search", 

394 query=query, 

395 limit=limit, 

396 source_types=source_types, 

397 project_ids=project_ids, 

398 intent_adaptation_enabled=self.enable_intent_adaptation, 

399 ) 

400 

401 try: 

402 # 🔥 NEW: Phase 2.2 Intent Classification and Adaptive Search 

403 search_intent = None 

404 adaptive_config = None 

405 

406 if self.enable_intent_adaptation and self.intent_classifier: 

407 # Classify search intent using comprehensive spaCy analysis 

408 search_intent = self.intent_classifier.classify_intent( 

409 query, session_context, behavioral_context 

410 ) 

411 

412 # Adapt search configuration based on classified intent 

413 adaptive_config = self.adaptive_strategy.adapt_search( 

414 search_intent, query 

415 ) 

416 

417 # Update search parameters based on adaptive configuration 

418 if adaptive_config: 

419 # Override weights based on intent 

420 original_vector_weight = self.vector_weight 

421 original_keyword_weight = self.keyword_weight 

422 original_min_score = self.min_score 

423 

424 self.vector_weight = adaptive_config.vector_weight 

425 self.keyword_weight = adaptive_config.keyword_weight 

426 self.min_score = adaptive_config.min_score_threshold 

427 

428 # Adjust limit based on intent configuration 

429 limit = min(adaptive_config.max_results, limit * 2) 

430 

431 self.logger.debug( 

432 "🔥 Adapted search parameters based on intent", 

433 intent=search_intent.intent_type.value, 

434 confidence=search_intent.confidence, 

435 vector_weight=self.vector_weight, 

436 keyword_weight=self.keyword_weight, 

437 adjusted_limit=limit, 

438 use_kg=adaptive_config.use_knowledge_graph, 

439 ) 

440 

441 # Expand query with related terms (now potentially adapted) 

442 expanded_query = await self._expand_query(query) 

443 

444 # Apply intent-specific query expansion if available 

445 if adaptive_config and adaptive_config.expand_query: 

446 if adaptive_config.expansion_aggressiveness > 0.5: 

447 # More aggressive expansion for exploratory queries 

448 expanded_query = await self._expand_query_aggressive(query) 

449 

450 # Get vector search results 

451 vector_results = await self._vector_search( 

452 expanded_query, limit * 3, project_ids 

453 ) 

454 

455 # Get keyword search results  

456 keyword_results = await self._keyword_search(query, limit * 3, project_ids) 

457 

458 # Analyze query for context 

459 query_context = self._analyze_query(query) 

460 

461 # 🔥 NEW: Add intent information to query context 

462 if search_intent: 

463 query_context["search_intent"] = search_intent 

464 query_context["adaptive_config"] = adaptive_config 

465 

466 # Combine and rerank results 

467 combined_results = await self._combine_results( 

468 vector_results, 

469 keyword_results, 

470 query_context, 

471 limit, 

472 source_types, 

473 project_ids, 

474 ) 

475 

476 # 🔥 NEW: Restore original search parameters if they were modified 

477 if adaptive_config: 

478 self.vector_weight = original_vector_weight 

479 self.keyword_weight = original_keyword_weight 

480 self.min_score = original_min_score 

481 

482 # Convert to SearchResult objects 

483 return [ 

484 SearchResult( 

485 score=result.score, 

486 text=result.text, 

487 source_type=result.source_type, 

488 source_title=result.source_title, 

489 source_url=result.source_url, 

490 file_path=result.file_path, 

491 repo_name=result.repo_name, 

492 

493 # Project information 

494 project_id=result.project_id, 

495 project_name=result.project_name, 

496 project_description=result.project_description, 

497 collection_name=result.collection_name, 

498 

499 # Basic hierarchy and attachment (existing) 

500 parent_id=result.parent_id, 

501 parent_title=result.parent_title, 

502 breadcrumb_text=result.breadcrumb_text, 

503 depth=result.depth, 

504 children_count=result.children_count, 

505 hierarchy_context=result.hierarchy_context, 

506 is_attachment=result.is_attachment, 

507 parent_document_id=result.parent_document_id, 

508 parent_document_title=result.parent_document_title, 

509 attachment_id=result.attachment_id, 

510 original_filename=result.original_filename, 

511 file_size=result.file_size, 

512 mime_type=result.mime_type, 

513 attachment_author=result.attachment_author, 

514 attachment_context=result.attachment_context, 

515 

516 # 🔥 NEW: Section-level intelligence 

517 section_title=result.section_title, 

518 section_type=result.section_type, 

519 section_level=result.section_level, 

520 section_anchor=result.section_anchor, 

521 section_breadcrumb=result.section_breadcrumb, 

522 section_depth=result.section_depth, 

523 

524 # 🔥 NEW: Content analysis 

525 has_code_blocks=result.has_code_blocks, 

526 has_tables=result.has_tables, 

527 has_images=result.has_images, 

528 has_links=result.has_links, 

529 word_count=result.word_count, 

530 char_count=result.char_count, 

531 estimated_read_time=result.estimated_read_time, 

532 paragraph_count=result.paragraph_count, 

533 

534 # 🔥 NEW: Semantic analysis 

535 entities=result.entities, 

536 topics=result.topics, 

537 key_phrases=result.key_phrases, 

538 pos_tags=result.pos_tags, 

539 

540 # 🔥 NEW: Navigation context 

541 previous_section=result.previous_section, 

542 next_section=result.next_section, 

543 sibling_sections=result.sibling_sections, 

544 subsections=result.subsections, 

545 document_hierarchy=result.document_hierarchy, 

546 

547 # 🔥 NEW: Chunking context 

548 chunk_index=result.chunk_index, 

549 total_chunks=result.total_chunks, 

550 chunking_strategy=result.chunking_strategy, 

551 

552 # 🔥 NEW: File conversion intelligence 

553 original_file_type=result.original_file_type, 

554 conversion_method=result.conversion_method, 

555 is_excel_sheet=result.is_excel_sheet, 

556 is_converted=result.is_converted, 

557 

558 # 🔥 NEW: Cross-references and enhanced context 

559 cross_references=result.cross_references, 

560 topic_analysis=result.topic_analysis, 

561 content_type_context=result.content_type_context, 

562 ) 

563 for result in combined_results 

564 ] 

565 

566 except Exception as e: 

567 self.logger.error("Error in hybrid search", error=str(e), query=query) 

568 raise 

569 
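# Example (sketch): a typical call. session_context / behavioral_context are optional and
# only consulted when intent adaptation is enabled; the values shown are illustrative:
#
#   results = await engine.search(
#       query="how do I configure OAuth for the API?",
#       limit=5,
#       source_types=["confluence", "git"],
#       behavioral_context=["exploratory"],  # previous intents, if tracked by the caller
#   )
#   for r in results:
#       print(r.score, r.source_title)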

570 async def generate_topic_search_chain( 

571 self, 

572 query: str, 

573 strategy: ChainStrategy = ChainStrategy.MIXED_EXPLORATION, 

574 max_links: int = 5, 

575 initialize_from_search: bool = True 

576 ) -> TopicSearchChain: 

577 """🔥 NEW: Generate a topic-driven search chain for progressive content discovery. 

578  

579 Args: 

580 query: Original search query 

581 strategy: Strategy for chain generation 

582 max_links: Maximum number of links in the chain 

583 initialize_from_search: Whether to initialize topic relationships from search results 

584  

585 Returns: 

586 TopicSearchChain with progressive queries for exploration 

587 """ 

588 self.logger.debug( 

589 "Generating topic search chain", 

590 query=query, 

591 strategy=strategy.value, 

592 max_links=max_links 

593 ) 

594 

595 try: 

596 # Initialize topic relationships from search results if needed 

597 if initialize_from_search and not self._topic_chains_initialized: 

598 await self._initialize_topic_relationships(query) 

599 

600 # Generate the topic search chain 

601 topic_chain = self.topic_chain_generator.generate_search_chain( 

602 original_query=query, 

603 strategy=strategy, 

604 max_links=max_links 

605 ) 

606 

607 self.logger.info( 

608 "Topic search chain generated successfully", 

609 chain_length=len(topic_chain.chain_links), 

610 strategy=strategy.value, 

611 topics_covered=topic_chain.total_topics_covered, 

612 discovery_potential=f"{topic_chain.estimated_discovery_potential:.2f}", 

613 generation_time=f"{topic_chain.generation_time_ms:.1f}ms" 

614 ) 

615 

616 return topic_chain 

617 

618 except Exception as e: 

619 self.logger.error("Error generating topic search chain", error=str(e), query=query) 

620 raise 

621 

622 async def execute_topic_chain_search( 

623 self, 

624 topic_chain: TopicSearchChain, 

625 results_per_link: int = 3, 

626 source_types: list[str] | None = None, 

627 project_ids: list[str] | None = None 

628 ) -> dict[str, list[SearchResult]]: 

629 """🔥 NEW: Execute searches for all links in a topic chain. 

630  

631 Args: 

632 topic_chain: The topic search chain to execute 

633 results_per_link: Number of results to return per chain link 

634 source_types: Optional source type filters 

635 project_ids: Optional project ID filters 

636  

637 Returns: 

638 Dictionary mapping chain link queries to their search results 

639 """ 

640 self.logger.debug( 

641 "Executing topic chain search", 

642 chain_length=len(topic_chain.chain_links), 

643 results_per_link=results_per_link 

644 ) 

645 

646 chain_results = {} 

647 

648 try: 

649 # Execute search for original query 

650 original_results = await self.search( 

651 query=topic_chain.original_query, 

652 limit=results_per_link, 

653 source_types=source_types, 

654 project_ids=project_ids 

655 ) 

656 chain_results[topic_chain.original_query] = original_results 

657 

658 # Execute search for each chain link 

659 for link in topic_chain.chain_links: 

660 try: 

661 link_results = await self.search( 

662 query=link.query, 

663 limit=results_per_link, 

664 source_types=source_types, 

665 project_ids=project_ids 

666 ) 

667 chain_results[link.query] = link_results 

668 

669 self.logger.debug( 

670 "Executed chain link search", 

671 query=link.query, 

672 results_count=len(link_results), 

673 topic_focus=link.topic_focus, 

674 exploration_type=link.exploration_type 

675 ) 

676 

677 except Exception as e: 

678 self.logger.warning( 

679 "Failed to execute chain link search", 

680 query=link.query, 

681 error=str(e) 

682 ) 

683 chain_results[link.query] = [] 

684 

685 total_results = sum(len(results) for results in chain_results.values()) 

686 self.logger.info( 

687 "Topic chain search execution completed", 

688 total_queries=len(chain_results), 

689 total_results=total_results, 

690 original_query=topic_chain.original_query 

691 ) 

692 

693 return chain_results 

694 

695 except Exception as e: 

696 self.logger.error("Error executing topic chain search", error=str(e)) 

697 raise 

698 
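# Example (sketch): generating a chain and then fanning out searches per link:
#
#   chain = await engine.generate_topic_search_chain(
#       "vector database sharding", strategy=ChainStrategy.MIXED_EXPLORATION, max_links=4
#   )
#   chain_results = await engine.execute_topic_chain_search(chain, results_per_link=3)
#   for link_query, hits in chain_results.items():
#       print(link_query, len(hits))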

699 async def _initialize_topic_relationships(self, sample_query: str) -> None: 

700 """Initialize topic relationships from a sample search to bootstrap topic chaining.""" 

701 try: 

702 # Perform a broad search to get diverse results for topic relationship mapping 

703 sample_results = await self.search( 

704 query=sample_query, 

705 limit=20, # Get more results for better topic coverage 

706 source_types=None, 

707 project_ids=None 

708 ) 

709 

710 if sample_results: 

711 # Initialize topic relationships from the sample results 

712 self.topic_chain_generator.initialize_from_results(sample_results) 

713 self._topic_chains_initialized = True 

714 

715 self.logger.info( 

716 "Topic relationships initialized from search results", 

717 sample_query=sample_query, 

718 sample_results_count=len(sample_results) 

719 ) 

720 else: 

721 self.logger.warning( 

722 "No search results available for topic relationship initialization", 

723 sample_query=sample_query 

724 ) 

725 

726 except Exception as e: 

727 self.logger.error( 

728 "Failed to initialize topic relationships", 

729 error=str(e), 

730 sample_query=sample_query 

731 ) 

732 # Don't raise - topic chaining can still work with limited relationships 

733 

734 def _analyze_query(self, query: str) -> dict[str, Any]: 

735 """🔥 ENHANCED: Analyze query using spaCy NLP instead of regex patterns.""" 

736 try: 

737 # Use spaCy analyzer for comprehensive query analysis 

738 query_analysis = self.spacy_analyzer.analyze_query_semantic(query) 

739 

740 # Create enhanced query context using spaCy analysis 

741 context = { 

742 # Basic query characteristics 

743 "is_question": query_analysis.is_question, 

744 "is_broad": len(query.split()) < 5, 

745 "is_specific": len(query.split()) > 7, 

746 "is_technical": query_analysis.is_technical, 

747 "complexity_score": query_analysis.complexity_score, 

748 

749 # spaCy-powered intent detection 

750 "probable_intent": query_analysis.intent_signals.get("primary_intent", "informational"), 

751 "intent_confidence": query_analysis.intent_signals.get("confidence", 0.0), 

752 "linguistic_features": query_analysis.intent_signals.get("linguistic_features", {}), 

753 

754 # Enhanced keyword extraction using spaCy 

755 "keywords": query_analysis.semantic_keywords, 

756 "entities": [entity[0] for entity in query_analysis.entities], # Extract entity text 

757 "entity_types": [entity[1] for entity in query_analysis.entities], # Extract entity labels 

758 "main_concepts": query_analysis.main_concepts, 

759 "pos_patterns": query_analysis.pos_patterns, 

760 

761 # Store query analysis for later use 

762 "spacy_analysis": query_analysis, 

763 } 

764 

765 # Enhanced content type preference detection using spaCy 

766 semantic_keywords_set = set(query_analysis.semantic_keywords) 

767 

768 # Code preference detection 

769 code_keywords = {"code", "function", "implementation", "script", "method", "class", "api"} 

770 if semantic_keywords_set.intersection(code_keywords): 

771 context["prefers_code"] = True 

772 

773 # Table/data preference detection  

774 table_keywords = {"table", "data", "excel", "spreadsheet", "csv", "sheet"} 

775 if semantic_keywords_set.intersection(table_keywords): 

776 context["prefers_tables"] = True 

777 

778 # Image preference detection 

779 image_keywords = {"image", "diagram", "screenshot", "visual", "chart", "graph"} 

780 if semantic_keywords_set.intersection(image_keywords): 

781 context["prefers_images"] = True 

782 

783 # Documentation preference detection 

784 doc_keywords = {"documentation", "doc", "guide", "manual", "instruction", "help"} 

785 if semantic_keywords_set.intersection(doc_keywords): 

786 context["prefers_docs"] = True 

787 

788 self.logger.debug( 

789 "🔥 spaCy query analysis completed", 

790 intent=context["probable_intent"], 

791 confidence=context["intent_confidence"], 

792 entities_found=len(query_analysis.entities), 

793 keywords_extracted=len(query_analysis.semantic_keywords), 

794 processing_time_ms=query_analysis.processing_time_ms, 

795 ) 

796 

797 return context 

798 

799 except Exception as e: 

800 self.logger.warning(f"spaCy analysis failed, using fallback: {e}") 

801 # Fallback to original regex-based analysis 

802 return self._analyze_query_fallback(query) 

803 

804 def _analyze_query_fallback(self, query: str) -> dict[str, Any]: 

805 """Fallback query analysis using original regex patterns.""" 

806 context = { 

807 "is_question": bool( 

808 re.search(r"\?|what|how|why|when|who|where", query.lower()) 

809 ), 

810 "is_broad": len(query.split()) < 5, 

811 "is_specific": len(query.split()) > 7, 

812 "probable_intent": "informational", 

813 "keywords": [ 

814 word.lower() for word in re.findall(r"\b\w{3,}\b", query.lower()) 

815 ], 

816 } 

817 

818 lower_query = query.lower() 

819 if "how to" in lower_query or "steps" in lower_query: 

820 context["probable_intent"] = "procedural" 

821 elif any( 

822 term in lower_query for term in ["requirements", "prd", "specification"] 

823 ): 

824 context["probable_intent"] = "requirements" 

825 elif any( 

826 term in lower_query for term in ["architecture", "design", "structure"] 

827 ): 

828 context["probable_intent"] = "architecture" 

829 

830 # Content type preferences (original logic) 

831 if any(term in lower_query for term in ["code", "function", "implementation", "script"]): 

832 context["prefers_code"] = True 

833 if any(term in lower_query for term in ["table", "data", "excel", "spreadsheet"]): 

834 context["prefers_tables"] = True 

835 if any(term in lower_query for term in ["image", "diagram", "screenshot", "visual"]): 

836 context["prefers_images"] = True 

837 if any(term in lower_query for term in ["documentation", "docs", "guide", "manual"]): 

838 context["prefers_docs"] = True 

839 

840 return context 

841 
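# Example (sketch): the regex fallback for "how to set up database migrations" marks
# is_question=True (matches "how"), probable_intent="procedural" (contains "how to"),
# and keywords=["how", "set", "database", "migrations"] (words of 3+ characters).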

842 def _boost_score_with_metadata( 

843 self, base_score: float, metadata_info: dict, query_context: dict 

844 ) -> float: 

845 """🔥 ENHANCED: Boost search scores using spaCy semantic analysis and metadata context.""" 

846 boosted_score = base_score 

847 boost_factor = 0.0 

848 

849 # 🔥 NEW: Phase 2.2 Intent-Aware Boosting 

850 search_intent = query_context.get("search_intent") 

851 adaptive_config = query_context.get("adaptive_config") 

852 

853 if search_intent and adaptive_config: 

854 # Apply intent-specific ranking boosts 

855 ranking_boosts = adaptive_config.ranking_boosts 

856 source_type_preferences = adaptive_config.source_type_preferences 

857 

858 # Source type preference boosting 

859 source_type = metadata_info.get("source_type", "") 

860 if source_type in source_type_preferences: 

861 source_boost = (source_type_preferences[source_type] - 1.0) * 0.2 

862 boost_factor += source_boost 

863 

864 # Content type boosting from ranking_boosts 

865 for boost_key, boost_value in ranking_boosts.items(): 

866 if boost_key == "section_type" and isinstance(boost_value, dict): 

867 section_type = metadata_info.get("section_type", "") 

868 if section_type in boost_value: 

869 section_boost = (boost_value[section_type] - 1.0) * 0.15 

870 boost_factor += section_boost 

871 elif boost_key == "source_type" and isinstance(boost_value, dict): 

872 if source_type in boost_value: 

873 source_boost = (boost_value[source_type] - 1.0) * 0.15 

874 boost_factor += source_boost 

875 elif boost_key in metadata_info and metadata_info[boost_key]: 

876 # Boolean metadata boosting (e.g., has_money_entities, has_org_entities) 

877 if isinstance(boost_value, (int, float)): 

878 bool_boost = (boost_value - 1.0) * 0.1 

879 boost_factor += bool_boost 

880 

881 # Intent-specific confidence boosting 

882 confidence_boost = search_intent.confidence * 0.05 # Up to 5% boost for high confidence 

883 boost_factor += confidence_boost 

884 

885 self.logger.debug( 

886 "🔥 Applied intent-aware boosting", 

887 intent=search_intent.intent_type.value, 

888 confidence=search_intent.confidence, 

889 source_type=source_type, 

890 total_intent_boost=boost_factor, 

891 ) 

892 

893 # 🔥 Content type relevance boosting (enhanced) 

894 if query_context.get("prefers_code") and metadata_info.get("has_code_blocks"): 

895 boost_factor += 0.15 

896 

897 if query_context.get("prefers_tables") and metadata_info.get("has_tables"): 

898 boost_factor += 0.12 

899 

900 if query_context.get("prefers_images") and metadata_info.get("has_images"): 

901 boost_factor += 0.10 

902 

903 if query_context.get("prefers_docs") and not metadata_info.get("has_code_blocks"): 

904 boost_factor += 0.08 

905 

906 # 🔥 Section level relevance (higher level = more important) 

907 section_level = metadata_info.get("section_level") 

908 if section_level is not None: 

909 if section_level <= 2: # H1, H2 are more important 

910 boost_factor += 0.10 

911 elif section_level <= 3: # H3 moderately important  

912 boost_factor += 0.05 

913 

914 # 🔥 Content quality indicators 

915 word_count = metadata_info.get("word_count") or 0 

916 if word_count > 100: # Substantial content 

917 boost_factor += 0.05 

918 if word_count > 500: # Very detailed content 

919 boost_factor += 0.05 

920 

921 # 🔥 Converted file boosting (often contains rich content) 

922 if metadata_info.get("is_converted") and metadata_info.get("original_file_type") in ["docx", "xlsx", "pdf"]: 

923 boost_factor += 0.08 

924 

925 # 🔥 Excel sheet specific boosting for data queries 

926 if metadata_info.get("is_excel_sheet") and any( 

927 term in " ".join(query_context.get("keywords", [])) 

928 for term in ["data", "table", "sheet", "excel", "csv"] 

929 ): 

930 boost_factor += 0.12 

931 

932 # 🔥 NEW: spaCy-powered semantic entity relevance 

933 if "spacy_analysis" in query_context: 

934 spacy_analysis = query_context["spacy_analysis"] 

935 

936 # Enhanced entity matching using spaCy similarity 

937 entities = metadata_info.get("entities", []) 

938 if entities and spacy_analysis.entities: 

939 max_entity_similarity = 0.0 

940 for entity in entities: 

941 entity_text = entity if isinstance(entity, str) else entity.get("text", str(entity)) 

942 similarity = self.spacy_analyzer.semantic_similarity_matching( 

943 spacy_analysis, entity_text 

944 ) 

945 max_entity_similarity = max(max_entity_similarity, similarity) 

946 

947 # Apply semantic entity boost based on similarity 

948 if max_entity_similarity > 0.6: # High similarity 

949 boost_factor += 0.15 

950 elif max_entity_similarity > 0.4: # Medium similarity 

951 boost_factor += 0.10 

952 elif max_entity_similarity > 0.2: # Low similarity 

953 boost_factor += 0.05 

954 

955 # Enhanced topic relevance using spaCy 

956 topics = metadata_info.get("topics", []) 

957 if topics and spacy_analysis.main_concepts: 

958 max_topic_similarity = 0.0 

959 for topic in topics: 

960 topic_text = topic if isinstance(topic, str) else topic.get("text", str(topic)) 

961 for concept in spacy_analysis.main_concepts: 

962 similarity = self.spacy_analyzer.semantic_similarity_matching( 

963 spacy_analysis, f"{topic_text} {concept}" 

964 ) 

965 max_topic_similarity = max(max_topic_similarity, similarity) 

966 

967 # Apply semantic topic boost 

968 if max_topic_similarity > 0.5: 

969 boost_factor += 0.12 

970 elif max_topic_similarity > 0.3: 

971 boost_factor += 0.08 

972 

973 else: 

974 # Fallback to original entity/topic matching 

975 entities = metadata_info.get("entities", []) 

976 if entities: 

977 query_keywords = set(query_context.get("keywords", [])) 

978 entity_texts = set() 

979 for entity in entities: 

980 if isinstance(entity, str): 

981 entity_texts.add(entity.lower()) 

982 elif isinstance(entity, dict): 

983 if "text" in entity: 

984 entity_texts.add(str(entity["text"]).lower()) 

985 elif "entity" in entity: 

986 entity_texts.add(str(entity["entity"]).lower()) 

987 else: 

988 entity_texts.add(str(entity).lower()) 

989 

990 if query_keywords.intersection(entity_texts): 

991 boost_factor += 0.10 

992 

993 # Original topic relevance 

994 topics = metadata_info.get("topics", []) 

995 if topics: 

996 query_keywords = set(query_context.get("keywords", [])) 

997 topic_texts = set() 

998 for topic in topics: 

999 if isinstance(topic, str): 

1000 topic_texts.add(topic.lower()) 

1001 elif isinstance(topic, dict): 

1002 if "text" in topic: 

1003 topic_texts.add(str(topic["text"]).lower()) 

1004 elif "topic" in topic: 

1005 topic_texts.add(str(topic["topic"]).lower()) 

1006 else: 

1007 topic_texts.add(str(topic).lower()) 

1008 

1009 if query_keywords.intersection(topic_texts): 

1010 boost_factor += 0.08 

1011 

1012 # Apply boost (cap at reasonable maximum) 

1013 boost_factor = min(boost_factor, 0.5) # Maximum 50% boost (increased from 40%) 

1014 return boosted_score * (1 + boost_factor) 

1015 
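# Worked example (sketch) of the boosting arithmetic: for a combined base score of 0.50,
# a code-preferring query hitting a chunk with code blocks (+0.15) in an H2 section
# (+0.10) with more than 100 words (+0.05) accumulates boost_factor = 0.30, giving
# 0.50 * (1 + 0.30) = 0.65. boost_factor is capped at 0.50, so no result can gain more
# than 50% over its base score.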

1016 async def _vector_search( 

1017 self, query: str, limit: int, project_ids: list[str] | None = None 

1018 ) -> list[dict[str, Any]]: 

1019 """Perform vector search using Qdrant.""" 

1020 query_embedding = await self._get_embedding(query) 

1021 

1022 search_params = models.SearchParams(hnsw_ef=128, exact=False) 

1023 

1024 results = await self.qdrant_client.search( 

1025 collection_name=self.collection_name, 

1026 query_vector=query_embedding, 

1027 limit=limit, 

1028 score_threshold=self.min_score, 

1029 search_params=search_params, 

1030 query_filter=self._build_filter(project_ids), 

1031 ) 

1032 

1033 return [ 

1034 { 

1035 "score": hit.score, 

1036 "text": hit.payload.get("content", "") if hit.payload else "", 

1037 "metadata": hit.payload.get("metadata", {}) if hit.payload else {}, 

1038 "source_type": ( 

1039 hit.payload.get("source_type", "unknown") 

1040 if hit.payload 

1041 else "unknown" 

1042 ), 

1043 } 

1044 for hit in results 

1045 ] 

1046 

1047 async def _keyword_search( 

1048 self, query: str, limit: int, project_ids: list[str] | None = None 

1049 ) -> list[dict[str, Any]]: 

1050 """Perform keyword search using BM25.""" 

1051 scroll_results = await self.qdrant_client.scroll( 

1052 collection_name=self.collection_name, 

1053 limit=10000, 

1054 with_payload=True, 

1055 with_vectors=False, 

1056 scroll_filter=self._build_filter(project_ids), 

1057 ) 

1058 

1059 documents = [] 

1060 metadata_list = [] 

1061 source_types = [] 

1062 

1063 for point in scroll_results[0]: 

1064 if point.payload: 

1065 content = point.payload.get("content", "") 

1066 metadata = point.payload.get("metadata", {}) 

1067 source_type = point.payload.get("source_type", "unknown") 

1068 documents.append(content) 

1069 metadata_list.append(metadata) 

1070 source_types.append(source_type) 

1071 

1072 tokenized_docs = [doc.split() for doc in documents] 

1073 bm25 = BM25Okapi(tokenized_docs) 

1074 

1075 tokenized_query = query.split() 

1076 scores = bm25.get_scores(tokenized_query) 

1077 

1078 top_indices = np.argsort(scores)[-limit:][::-1] 

1079 

1080 return [ 

1081 { 

1082 "score": float(scores[idx]), 

1083 "text": documents[idx], 

1084 "metadata": metadata_list[idx], 

1085 "source_type": source_types[idx], 

1086 } 

1087 for idx in top_indices 

1088 if scores[idx] > 0 

1089 ] 

1090 
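# Note (sketch): BM25 operates on whitespace-tokenized payload text here, so matching is
# case- and punctuation-sensitive. A minimal illustration with a three-document corpus:
#
#   bm25 = BM25Okapi([["hybrid", "search"], ["vector", "search"], ["keyword", "index"]])
#   bm25.get_scores(["vector"])  # only the second document scores above zero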

1091 async def _combine_results( 

1092 self, 

1093 vector_results: list[dict[str, Any]], 

1094 keyword_results: list[dict[str, Any]], 

1095 query_context: dict[str, Any], 

1096 limit: int, 

1097 source_types: list[str] | None = None, 

1098 project_ids: list[str] | None = None, 

1099 ) -> list[HybridSearchResult]: 

1100 """Combine and rerank results from vector and keyword search.""" 

1101 combined_dict = {} 

1102 

1103 # Process vector results 

1104 for result in vector_results: 

1105 text = result["text"] 

1106 if text not in combined_dict: 

1107 metadata = result["metadata"] 

1108 combined_dict[text] = { 

1109 "text": text, 

1110 "metadata": metadata, 

1111 "source_type": result["source_type"], 

1112 "vector_score": result["score"], 

1113 "keyword_score": 0.0, 

1114 } 

1115 

1116 # Process keyword results 

1117 for result in keyword_results: 

1118 text = result["text"] 

1119 if text in combined_dict: 

1120 combined_dict[text]["keyword_score"] = result["score"] 

1121 else: 

1122 metadata = result["metadata"] 

1123 combined_dict[text] = { 

1124 "text": text, 

1125 "metadata": metadata, 

1126 "source_type": result["source_type"], 

1127 "vector_score": 0.0, 

1128 "keyword_score": result["score"], 

1129 } 

1130 

1131 # Calculate combined scores and create results 

1132 combined_results = [] 

1133 

1134 # 🔥 NEW: Extract intent-specific filtering configuration 

1135 search_intent = query_context.get("search_intent") 

1136 adaptive_config = query_context.get("adaptive_config") 

1137 result_filters = adaptive_config.result_filters if adaptive_config else {} 

1138 

1139 for text, info in combined_dict.items(): 

1140 # Skip if source type doesn't match filter 

1141 if source_types and info["source_type"] not in source_types: 

1142 continue 

1143 

1144 metadata = info["metadata"] 

1145 metadata_info = self._extract_metadata_info(metadata) 

1146 

1147 # 🔥 NEW: Apply intent-specific result filtering 

1148 if search_intent and result_filters: 

1149 should_skip = False 

1150 

1151 # Content type filtering 

1152 if "content_type" in result_filters: 

1153 allowed_content_types = result_filters["content_type"] 

1154 # Check if any content type indicators match 

1155 has_matching_content = False 

1156 

1157 for content_type in allowed_content_types: 

1158 if content_type == "code" and metadata_info.get("has_code_blocks"): 

1159 has_matching_content = True 

1160 break 

1161 elif content_type == "documentation" and not metadata_info.get("has_code_blocks"): 

1162 has_matching_content = True 

1163 break 

1164 elif content_type == "technical" and query_context.get("is_technical"): 

1165 has_matching_content = True 

1166 break 

1167 elif content_type in ["requirements", "business", "strategy"]: 

1168 # Check if content mentions business terms 

1169 business_indicators = metadata_info.get("business_indicators", 0) 

1170 if business_indicators > 0: 

1171 has_matching_content = True 

1172 break 

1173 elif content_type in ["guide", "tutorial", "procedure"]: 

1174 # Check for procedural content 

1175 section_type = metadata_info.get("section_type", "").lower() 

1176 if any(proc_word in section_type for proc_word in ["step", "guide", "procedure", "tutorial"]): 

1177 has_matching_content = True 

1178 break 

1179 

1180 if not has_matching_content: 

1181 should_skip = True 

1182 

1183 if should_skip: 

1184 continue 

1185 

1186 combined_score = ( 

1187 self.vector_weight * info["vector_score"] 

1188 + self.keyword_weight * info["keyword_score"] 

1189 ) 

1190 

1191 if combined_score >= self.min_score: 

1192 # Extract project information 

1193 project_info = self._extract_project_info(metadata) 

1194 

1195 boosted_score = self._boost_score_with_metadata( 

1196 combined_score, metadata_info, query_context 

1197 ) 

1198 

1199 combined_results.append( 

1200 HybridSearchResult( 

1201 score=boosted_score, 

1202 text=text, 

1203 source_type=info["source_type"], 

1204 source_title=metadata.get("title", ""), 

1205 source_url=metadata.get("url"), 

1206 file_path=metadata.get("file_path"), 

1207 repo_name=metadata.get("repository_name"), 

1208 vector_score=info["vector_score"], 

1209 keyword_score=info["keyword_score"], 

1210 

1211 # Project information 

1212 project_id=project_info["project_id"], 

1213 project_name=project_info["project_name"], 

1214 project_description=project_info["project_description"], 

1215 collection_name=project_info["collection_name"], 

1216 

1217 # Basic hierarchy and attachment (existing) 

1218 parent_id=metadata_info["parent_id"], 

1219 parent_title=metadata_info["parent_title"], 

1220 breadcrumb_text=metadata_info["breadcrumb_text"], 

1221 depth=metadata_info["depth"], 

1222 children_count=metadata_info["children_count"], 

1223 hierarchy_context=metadata_info["hierarchy_context"], 

1224 is_attachment=metadata_info["is_attachment"], 

1225 parent_document_id=metadata_info["parent_document_id"], 

1226 parent_document_title=metadata_info["parent_document_title"], 

1227 attachment_id=metadata_info["attachment_id"], 

1228 original_filename=metadata_info["original_filename"], 

1229 file_size=metadata_info["file_size"], 

1230 mime_type=metadata_info["mime_type"], 

1231 attachment_author=metadata_info["attachment_author"], 

1232 attachment_context=metadata_info["attachment_context"], 

1233 

1234 # 🔥 NEW: Section-level intelligence 

1235 section_title=metadata_info["section_title"], 

1236 section_type=metadata_info["section_type"], 

1237 section_level=metadata_info["section_level"], 

1238 section_anchor=metadata_info["section_anchor"], 

1239 section_breadcrumb=metadata_info["section_breadcrumb"], 

1240 section_depth=metadata_info["section_depth"], 

1241 

1242 # 🔥 NEW: Content analysis 

1243 has_code_blocks=metadata_info["has_code_blocks"], 

1244 has_tables=metadata_info["has_tables"], 

1245 has_images=metadata_info["has_images"], 

1246 has_links=metadata_info["has_links"], 

1247 word_count=metadata_info["word_count"], 

1248 char_count=metadata_info["char_count"], 

1249 estimated_read_time=metadata_info["estimated_read_time"], 

1250 paragraph_count=metadata_info["paragraph_count"], 

1251 

1252 # 🔥 NEW: Semantic analysis 

1253 entities=metadata_info["entities"], 

1254 topics=metadata_info["topics"], 

1255 key_phrases=metadata_info["key_phrases"], 

1256 pos_tags=metadata_info["pos_tags"], 

1257 

1258 # 🔥 NEW: Navigation context 

1259 previous_section=metadata_info["previous_section"], 

1260 next_section=metadata_info["next_section"], 

1261 sibling_sections=metadata_info["sibling_sections"], 

1262 subsections=metadata_info["subsections"], 

1263 document_hierarchy=metadata_info["document_hierarchy"], 

1264 

1265 # 🔥 NEW: Chunking context 

1266 chunk_index=metadata_info["chunk_index"], 

1267 total_chunks=metadata_info["total_chunks"], 

1268 chunking_strategy=metadata_info["chunking_strategy"], 

1269 

1270 # 🔥 NEW: File conversion intelligence 

1271 original_file_type=metadata_info["original_file_type"], 

1272 conversion_method=metadata_info["conversion_method"], 

1273 is_excel_sheet=metadata_info["is_excel_sheet"], 

1274 is_converted=metadata_info["is_converted"], 

1275 

1276 # 🔥 NEW: Cross-references and enhanced context 

1277 cross_references=metadata_info["cross_references"], 

1278 topic_analysis=metadata_info["topic_analysis"], 

1279 content_type_context=metadata_info["content_type_context"], 

1280 ) 

1281 ) 

1282 

1283 # Sort by combined score 

1284 combined_results.sort(key=lambda x: x.score, reverse=True) 

1285 

1286 # 🔥 NEW: Apply diversity filtering for exploratory intents 

1287 if adaptive_config and adaptive_config.diversity_factor > 0.0: 

1288 diverse_results = self._apply_diversity_filtering( 

1289 combined_results, adaptive_config.diversity_factor, limit 

1290 ) 

1291 self.logger.debug( 

1292 "🔥 Applied diversity filtering", 

1293 original_count=len(combined_results), 

1294 diverse_count=len(diverse_results), 

1295 diversity_factor=adaptive_config.diversity_factor, 

1296 ) 

1297 return diverse_results 

1298 

1299 return combined_results[:limit] 

1300 
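# Worked example (sketch) of the fusion step with the default weights (vector_weight=0.6,
# keyword_weight=0.3): a chunk scoring 0.80 on the dense search and 0.50 on BM25 gets
# combined_score = 0.6 * 0.80 + 0.3 * 0.50 = 0.63, which clears the default min_score of
# 0.3 and is then passed to _boost_score_with_metadata before final ranking.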

1301 def _extract_metadata_info(self, metadata: dict) -> dict: 

1302 """Extract comprehensive metadata information from document metadata. 

1303 

1304 Args: 

1305 metadata: Document metadata 

1306 

1307 Returns: 

1308 Dictionary with all available metadata information 

1309 """ 

1310 # 🔥 ENHANCED: Extract ALL the rich metadata we store 

1311 

1312 # Basic hierarchy information (existing) 

1313 hierarchy_info = { 

1314 "parent_id": metadata.get("parent_id"), 

1315 "parent_title": metadata.get("parent_title"), 

1316 "breadcrumb_text": metadata.get("breadcrumb_text"), 

1317 "depth": metadata.get("depth"), 

1318 "children_count": None, 

1319 "hierarchy_context": None, 

1320 } 

1321 

1322 # Calculate children count 

1323 children = metadata.get("children", []) 

1324 if children: 

1325 hierarchy_info["children_count"] = len(children) 

1326 

1327 # Generate hierarchy context for display 

1328 if metadata.get("breadcrumb_text") or metadata.get("depth") is not None: 

1329 context_parts = [] 

1330 

1331 if metadata.get("breadcrumb_text"): 

1332 context_parts.append(f"Path: {metadata.get('breadcrumb_text')}") 

1333 

1334 if metadata.get("depth") is not None: 

1335 context_parts.append(f"Depth: {metadata.get('depth')}") 

1336 

1337 if ( 

1338 hierarchy_info["children_count"] is not None 

1339 and hierarchy_info["children_count"] > 0 

1340 ): 

1341 context_parts.append(f"Children: {hierarchy_info['children_count']}") 

1342 

1343 if context_parts: 

1344 hierarchy_info["hierarchy_context"] = " | ".join(context_parts) 

1345 

1346 # Basic attachment information (existing) 

1347 attachment_info = { 

1348 "is_attachment": metadata.get("is_attachment", False), 

1349 "parent_document_id": metadata.get("parent_document_id"), 

1350 "parent_document_title": metadata.get("parent_document_title"), 

1351 "attachment_id": metadata.get("attachment_id"), 

1352 "original_filename": metadata.get("original_filename"), 

1353 "file_size": metadata.get("file_size"), 

1354 "mime_type": metadata.get("mime_type"), 

1355 "attachment_author": metadata.get("attachment_author") or metadata.get("author"), 

1356 "attachment_context": None, 

1357 } 

1358 

1359 # Generate attachment context for display 

1360 if attachment_info["is_attachment"]: 

1361 context_parts = [] 

1362 

1363 if attachment_info["original_filename"]: 

1364 context_parts.append(f"File: {attachment_info['original_filename']}") 

1365 

1366 if attachment_info["file_size"]: 

1367 # Convert bytes to human readable format 

1368 size = attachment_info["file_size"] 

1369 if size < 1024: 

1370 size_str = f"{size} B" 

1371 elif size < 1024 * 1024: 

1372 size_str = f"{size / 1024:.1f} KB" 

1373 elif size < 1024 * 1024 * 1024: 

1374 size_str = f"{size / (1024 * 1024):.1f} MB" 

1375 else: 

1376 size_str = f"{size / (1024 * 1024 * 1024):.1f} GB" 

1377 context_parts.append(f"Size: {size_str}") 

1378 

1379 if attachment_info["mime_type"]: 

1380 context_parts.append(f"Type: {attachment_info['mime_type']}") 

1381 

1382 if attachment_info["attachment_author"]: 

1383 context_parts.append(f"Author: {attachment_info['attachment_author']}") 

1384 

1385 if context_parts: 

1386 attachment_info["attachment_context"] = " | ".join(context_parts) 

1387 

1388 # 🔥 NEW: Section-level intelligence 

1389 section_info = { 

1390 "section_title": metadata.get("section_title"), 

1391 "section_type": metadata.get("section_type"), 

1392 "section_level": metadata.get("section_level"), 

1393 "section_anchor": metadata.get("section_anchor"), 

1394 "section_breadcrumb": metadata.get("section_breadcrumb"), 

1395 "section_depth": metadata.get("section_depth"), 

1396 } 

1397 

1398 # 🔥 NEW: Content analysis from content_type_analysis 

1399 content_analysis = metadata.get("content_type_analysis", {}) 

1400 content_info = { 

1401 "has_code_blocks": content_analysis.get("has_code_blocks", False), 

1402 "has_tables": content_analysis.get("has_tables", False), 

1403 "has_images": content_analysis.get("has_images", False), 

1404 "has_links": content_analysis.get("has_links", False), 

1405 "word_count": content_analysis.get("word_count"), 

1406 "char_count": content_analysis.get("char_count"), 

1407 "estimated_read_time": content_analysis.get("estimated_read_time"), 

1408 "paragraph_count": content_analysis.get("paragraph_count"), 

1409 } 

1410 

1411 # Generate content type context 

1412 content_types = [] 

1413 if content_info["has_code_blocks"]: 

1414 content_types.append("Code") 

1415 if content_info["has_tables"]: 

1416 content_types.append("Tables") 

1417 if content_info["has_images"]: 

1418 content_types.append("Images") 

1419 if content_info["has_links"]: 

1420 content_types.append("Links") 

1421 

1422 content_type_context = None 

1423 if content_types: 

1424 content_type_context = f"Contains: {', '.join(content_types)}" 

1425 if content_info["word_count"]: 

1426 content_type_context += f" | {content_info['word_count']} words" 

1427 if content_info["estimated_read_time"]: 

1428 content_type_context += f" | ~{content_info['estimated_read_time']}min read" 

1429 

1430 # 🔥 NEW: Semantic analysis (NLP results) 

1431 # Convert spaCy tuples to expected formats for Pydantic validation 

1432 raw_entities = metadata.get("entities", []) 

1433 raw_topics = metadata.get("topics", []) 

1434 raw_key_phrases = metadata.get("key_phrases", []) 

1435 raw_pos_tags = metadata.get("pos_tags", []) 

1436 

1437 # Convert entities from tuples [(text, label)] to dicts [{"text": text, "label": label}] 

1438 entities = [] 

1439 for entity in raw_entities: 

1440 if isinstance(entity, (list, tuple)) and len(entity) >= 2: 

1441 entities.append({"text": str(entity[0]), "label": str(entity[1])}) 

1442 elif isinstance(entity, str): 

1443 entities.append(entity) # Keep strings as-is 

1444 elif isinstance(entity, dict): 

1445 entities.append(entity) # Keep dicts as-is 

1446 

1447 # Convert topics from tuples to dicts 

1448 topics = [] 

1449 for topic in raw_topics: 

1450 if isinstance(topic, (list, tuple)) and len(topic) >= 2: 

1451 topics.append({"text": str(topic[0]), "score": float(topic[1]) if isinstance(topic[1], (int, float)) else str(topic[1])}) 

1452 elif isinstance(topic, str): 

1453 topics.append(topic) # Keep strings as-is 

1454 elif isinstance(topic, dict): 

1455 topics.append(topic) # Keep dicts as-is 

1456 

1457 # Convert key_phrases from tuples to dicts 

1458 key_phrases = [] 

1459 for phrase in raw_key_phrases: 

1460 if isinstance(phrase, (list, tuple)) and len(phrase) >= 2: 

1461 key_phrases.append({"text": str(phrase[0]), "score": float(phrase[1]) if isinstance(phrase[1], (int, float)) else str(phrase[1])}) 

1462 elif isinstance(phrase, str): 

1463 key_phrases.append(phrase) # Keep strings as-is 

1464 elif isinstance(phrase, dict): 

1465 key_phrases.append(phrase) # Keep dicts as-is 

1466 

1467 # Convert pos_tags from tuples [(token, tag)] to dicts [{"token": token, "tag": tag}] 

1468 pos_tags = [] 

1469 for pos_tag in raw_pos_tags: 

1470 if isinstance(pos_tag, (list, tuple)) and len(pos_tag) >= 2: 

1471 pos_tags.append({"token": str(pos_tag[0]), "tag": str(pos_tag[1])}) 

1472 elif isinstance(pos_tag, dict): 

1473 pos_tags.append(pos_tag) # Keep dicts as-is 

1474 

1475 semantic_info = { 

1476 "entities": entities, 

1477 "topics": topics, 

1478 "key_phrases": key_phrases, 

1479 "pos_tags": pos_tags, 

1480 "topic_analysis": metadata.get("topic_analysis"), 

1481 } 

1482 

1483 # 🔥 NEW: Navigation context 

1484 navigation_info = { 

1485 "previous_section": metadata.get("previous_section"), 

1486 "next_section": metadata.get("next_section"), 

1487 "sibling_sections": metadata.get("sibling_sections", []), 

1488 "subsections": metadata.get("subsections", []), 

1489 "document_hierarchy": metadata.get("document_hierarchy", []), 

1490 } 

1491 

1492 # 🔥 NEW: Chunking context 

1493 chunking_info = { 

1494 "chunk_index": metadata.get("chunk_index"), 

1495 "total_chunks": metadata.get("total_chunks"), 

1496 "chunking_strategy": metadata.get("chunking_strategy"), 

1497 } 

1498 

1499 # 🔥 NEW: File conversion intelligence 

1500 conversion_info = { 

1501 "original_file_type": metadata.get("original_file_type"), 

1502 "conversion_method": metadata.get("conversion_method"), 

1503 "is_excel_sheet": metadata.get("is_excel_sheet", False), 

1504 "is_converted": metadata.get("is_converted", False), 

1505 } 

1506 

1507 # 🔥 NEW: Cross-references 

1508 cross_reference_info = { 

1509 "cross_references": metadata.get("cross_references", []), 

1510 } 

1511 

1512 # Combine all metadata 

1513 return { 

1514 **hierarchy_info, 

1515 **attachment_info, 

1516 **section_info, 

1517 **content_info, 

1518 **semantic_info, 

1519 **navigation_info, 

1520 **chunking_info, 

1521 **conversion_info, 

1522 **cross_reference_info, 

1523 "content_type_context": content_type_context, 

1524 } 

1525 
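# Illustrative sketch (not part of the original source; the sample values are assumed):
# the same tuple-to-dict normalization applied above, shown on mixed spaCy-style metadata.
raw_entities = [("Qdrant", "ORG"), ("Confluence", "PRODUCT"), "BM25", {"text": "OpenAI", "label": "ORG"}]
normalized = []
for entity in raw_entities:
    if isinstance(entity, (list, tuple)) and len(entity) >= 2:
        normalized.append({"text": str(entity[0]), "label": str(entity[1])})
    elif isinstance(entity, (str, dict)):
        normalized.append(entity)
# normalized == [{"text": "Qdrant", "label": "ORG"}, {"text": "Confluence", "label": "PRODUCT"},
#                "BM25", {"text": "OpenAI", "label": "ORG"}]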

1526 def _extract_project_info(self, metadata: dict) -> dict: 

1527 """Extract project information from document metadata. 

1528 

1529 Args: 

1530 metadata: Document metadata 

1531 

1532 Returns: 

1533 Dictionary with project information 

1534 """ 

1535 return { 

1536 "project_id": metadata.get("project_id"), 

1537 "project_name": metadata.get("project_name"), 

1538 "project_description": metadata.get("project_description"), 

1539 "collection_name": metadata.get("collection_name"), 

1540 } 

1541 

1542 def _build_filter( 

1543 self, project_ids: list[str] | None = None 

1544 ) -> models.Filter | None: 

1545 """Build a Qdrant filter based on project IDs.""" 

1546 if not project_ids: 

1547 return None 

1548 

1549 return models.Filter( 

1550 must=[ 

1551 models.FieldCondition( 

1552 key="project_id", match=models.MatchAny(any=project_ids) 

1553 ) 

1554 ] 

1555 ) 

1556 
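# Illustrative sketch (not part of the original source): the filter that
# _build_filter produces for two hypothetical project IDs.
from qdrant_client.http import models

example_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="project_id",
            match=models.MatchAny(any=["proj-a", "proj-b"]),
        )
    ]
)
# With project_ids=None (or an empty list) no filter is built and all points are searched.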

1557 def _apply_diversity_filtering( 

1558 self, 

1559 results: list[HybridSearchResult], 

1560 diversity_factor: float, 

1561 limit: int 

1562 ) -> list[HybridSearchResult]: 

1563 """🔥 NEW: Apply diversity filtering to promote varied result types.""" 

1564 if diversity_factor <= 0.0 or len(results) <= limit: 

1565 return results[:limit] 

1566 

1567 diverse_results = [] 

1568 used_source_types = set() 

1569 used_section_types = set() 

1570 used_sources = set() 

1571 

1572 # First pass: Take top results while ensuring diversity 

1573 for result in results: 

1574 if len(diverse_results) >= limit: 

1575 break 

1576 

1577 # Calculate diversity score 

1578 diversity_score = 1.0 

1579 

1580 # Penalize duplicate source types (less diversity) 

1581 source_type = result.source_type 

1582 if source_type in used_source_types: 

1583 diversity_score *= (1.0 - diversity_factor * 0.3) 

1584 

1585 # Penalize duplicate section types 

1586 section_type = result.section_type or "unknown" 

1587 if section_type in used_section_types: 

1588 diversity_score *= (1.0 - diversity_factor * 0.2) 

1589 

1590 # Penalize duplicate sources (same document/file) 

1591 source_key = f"{result.source_type}:{result.source_title}" 

1592 if source_key in used_sources: 

1593 diversity_score *= (1.0 - diversity_factor * 0.4) 

1594 

1595 # Apply diversity penalty to score 

1596 adjusted_score = result.score * diversity_score 

1597 

1598 # Accept the result while the first ~70% of slots are still open, or when the diversity penalty keeps the adjusted score within 60% of the original 

1599 if len(diverse_results) < limit * 0.7 or adjusted_score >= result.score * 0.6: 

1600 diverse_results.append(result) 

1601 used_source_types.add(source_type) 

1602 used_section_types.add(section_type) 

1603 used_sources.add(source_key) 

1604 

1605 # Second pass: Fill remaining slots with best remaining results 

1606 remaining_slots = limit - len(diverse_results) 

1607 if remaining_slots > 0: 

1608 remaining_results = [r for r in results if r not in diverse_results] 

1609 diverse_results.extend(remaining_results[:remaining_slots]) 

1610 

1611 return diverse_results[:limit] 

1612 
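# Worked example with assumed numbers (not part of the original source).
# Since adjusted_score = result.score * diversity_score, the acceptance check
# "adjusted_score >= result.score * 0.6" reduces to diversity_score >= 0.6 once
# 70% of the result slots are filled. With diversity_factor=1.0, a result that
# repeats an already-seen source type, section type and source document gets
#     diversity_score = (1 - 0.3) * (1 - 0.2) * (1 - 0.4) = 0.7 * 0.8 * 0.6 = 0.336
# and is deferred to the second pass; with diversity_factor=0.5 the same repeats
# give 0.85 * 0.90 * 0.80 = 0.612, which still clears the 0.6 threshold.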

1613 def get_adaptive_search_stats(self) -> dict[str, Any]: 

1614 """🔥 NEW: Get adaptive search statistics for monitoring.""" 

1615 stats = { 

1616 "intent_adaptation_enabled": self.enable_intent_adaptation, 

1617 "has_knowledge_graph": self.knowledge_graph is not None, 

1618 } 

1619 

1620 if self.enable_intent_adaptation and self.intent_classifier: 

1621 stats.update(self.intent_classifier.get_cache_stats()) 

1622 

1623 if self.adaptive_strategy: 

1624 stats.update(self.adaptive_strategy.get_strategy_stats()) 

1625 

1626 return stats 

1627 

1628 # ============================================================================ 

1629 # 🔥 Phase 1.3: Dynamic Faceted Search Interface Methods 

1630 # ============================================================================ 

1631 

1632 async def search_with_facets( 

1633 self, 

1634 query: str, 

1635 limit: int = 5, 

1636 source_types: list[str] | None = None, 

1637 project_ids: list[str] | None = None, 

1638 facet_filters: list[FacetFilter] | None = None, 

1639 generate_facets: bool = True, 

1640 session_context: dict[str, Any] | None = None, 

1641 behavioral_context: list[str] | None = None, 

1642 ) -> FacetedSearchResults: 

1643 """ 

1644 🔥 Phase 1.3: Perform faceted search with dynamic facet generation. 

1645  

1646 Args: 

1647 query: Search query 

1648 limit: Maximum number of results 

1649 source_types: Optional source type filters 

1650 project_ids: Optional project ID filters  

1651 facet_filters: Optional facet filters to apply 

1652 generate_facets: Whether to generate facets from results 

1653 session_context: Optional session context for intent classification 

1654 behavioral_context: Optional behavioral context 

1655  

1656 Returns: 

1657 FacetedSearchResults with results and generated facets 

1658 """ 

1659 start_time = datetime.now() 

1660 

1661 try: 

1662 # First, perform regular search (potentially with larger limit for faceting) 

1663 search_limit = max(limit * 2, 50) if generate_facets else limit 

1664 

1665 search_results = await self.search( 

1666 query=query, 

1667 limit=search_limit, 

1668 source_types=source_types, 

1669 project_ids=project_ids, 

1670 session_context=session_context, 

1671 behavioral_context=behavioral_context 

1672 ) 

1673 

1674 # Generate faceted results 

1675 faceted_results = self.faceted_search_engine.generate_faceted_results( 

1676 results=search_results, 

1677 applied_filters=facet_filters or [] 

1678 ) 

1679 

1680 # Limit final results 

1681 faceted_results.results = faceted_results.results[:limit] 

1682 faceted_results.filtered_count = len(faceted_results.results) 

1683 

1684 search_time = (datetime.now() - start_time).total_seconds() * 1000 

1685 

1686 self.logger.info( 

1687 "Faceted search completed", 

1688 query=query, 

1689 total_results=faceted_results.total_results, 

1690 filtered_results=faceted_results.filtered_count, 

1691 facet_count=len(faceted_results.facets), 

1692 active_filters=len(faceted_results.applied_filters), 

1693 search_time_ms=round(search_time, 2) 

1694 ) 

1695 

1696 return faceted_results 

1697 

1698 except Exception as e: 

1699 self.logger.error("Error in faceted search", query=query, error=str(e)) 

1700 raise 

1701 
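# Hypothetical usage sketch (not part of the original source): `engine` stands for
# an already-initialized instance of this hybrid search class, and the query string
# is an assumption chosen for the demo.
async def demo_faceted_search(engine) -> None:
    faceted = await engine.search_with_facets(query="authentication flow", limit=5)
    print(f"{faceted.filtered_count} of {faceted.total_results} results shown")
    print(f"{len(faceted.facets)} facets generated, {len(faceted.applied_filters)} filters applied")
    for result in faceted.results:
        print(result.source_title)
# The coroutine would be awaited (e.g. via asyncio.run) by the caller that owns `engine`.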

1702 def apply_facet_filters( 

1703 self, 

1704 results: list[SearchResult], 

1705 filters: list[FacetFilter] 

1706 ) -> list[SearchResult]: 

1707 """ 

1708 🔥 Phase 1.3: Apply facet filters to search results. 

1709  

1710 Args: 

1711 results: Search results to filter 

1712 filters: Facet filters to apply 

1713  

1714 Returns: 

1715 Filtered search results 

1716 """ 

1717 return self.faceted_search_engine.apply_facet_filters(results, filters) 

1718 

1719 def generate_facets( 

1720 self, 

1721 results: list[SearchResult] 

1722 ) -> list: 

1723 """ 

1724 🔥 Phase 1.3: Generate dynamic facets from search results. 

1725  

1726 Args: 

1727 results: Search results to analyze 

1728  

1729 Returns: 

1730 List of generated facets 

1731 """ 

1732 return self.faceted_search_engine.facet_generator.generate_facets(results) 

1733 

1734 def suggest_facet_refinements( 

1735 self, 

1736 current_results: list[SearchResult], 

1737 current_filters: list[FacetFilter] 

1738 ) -> list[dict[str, Any]]: 

1739 """ 

1740 🔥 Phase 1.3: Suggest facet refinements based on current results. 

1741  

1742 Args: 

1743 current_results: Current search results 

1744 current_filters: Currently applied filters 

1745  

1746 Returns: 

1747 List of suggested refinements with impact estimates 

1748 """ 

1749 return self.faceted_search_engine.suggest_refinements( 

1750 current_results, 

1751 current_filters 

1752 ) 

1753 

1754 # 🔥 Phase 2.3: Cross-Document Intelligence Methods 

1755 

1756 async def analyze_document_relationships( 

1757 self, 

1758 documents: list[SearchResult] 

1759 ) -> dict[str, Any]: 

1760 """ 

1761 🔥 Phase 2.3: Perform comprehensive cross-document relationship analysis. 

1762  

1763 Args: 

1764 documents: Documents to analyze for relationships 

1765  

1766 Returns: 

1767 Comprehensive analysis including clusters, similarities, and conflicts 

1768 """ 

1769 try: 

1770 return self.cross_document_engine.analyze_document_relationships(documents) 

1771 except Exception as e: 

1772 self.logger.error("Error in cross-document analysis", error=str(e)) 

1773 raise 

1774 

1775 async def find_similar_documents( 

1776 self, 

1777 target_document: SearchResult, 

1778 documents: list[SearchResult], 

1779 similarity_metrics: list[SimilarityMetric] | None = None, 

1780 max_similar: int = 5 

1781 ) -> list[dict[str, Any]]: 

1782 """ 

1783 🔥 Phase 2.3: Find documents similar to a target document. 

1784  

1785 Args: 

1786 target_document: Document to find similar documents for 

1787 documents: Pool of documents to search within 

1788 similarity_metrics: Metrics to use for similarity calculation 

1789 max_similar: Maximum number of similar documents to return 

1790  

1791 Returns: 

1792 List of similar documents with similarity scores 

1793 """ 

1794 try: 

1795 similarity_calculator = self.cross_document_engine.similarity_calculator 

1796 similar_docs = [] 

1797 

1798 for doc in documents: 

1799 if doc == target_document: 

1800 continue 

1801 

1802 similarity = similarity_calculator.calculate_similarity( 

1803 target_document, 

1804 doc, 

1805 similarity_metrics 

1806 ) 

1807 

1808 similar_docs.append({ 

1809 "document": doc, 

1810 "similarity_score": similarity.similarity_score, 

1811 "metric_scores": similarity.metric_scores, 

1812 "similarity_reasons": [similarity.get_display_explanation()] 

1813 }) 

1814 

1815 # Sort by similarity score and return top results 

1816 similar_docs.sort(key=lambda x: x["similarity_score"], reverse=True) 

1817 return similar_docs[:max_similar] 

1818 

1819 except Exception as e: 

1820 self.logger.error("Error finding similar documents", error=str(e)) 

1821 raise 

1822 
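# Hypothetical usage sketch (not part of the original source): `engine` and `docs`
# (SearchResult objects from a prior search) are assumptions; leaving
# similarity_metrics unset falls back to the similarity calculator's defaults.
async def demo_similar_documents(engine, docs) -> None:
    target = docs[0]
    matches = await engine.find_similar_documents(target, docs, max_similar=3)
    for match in matches:
        print(
            match["document"].source_title,
            round(match["similarity_score"], 3),
            match["similarity_reasons"],
        )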

1823 async def detect_document_conflicts( 

1824 self, 

1825 documents: list[SearchResult] 

1826 ) -> dict[str, Any]: 

1827 """ 

1828 🔥 Phase 2.3: Detect conflicts between documents. 

1829  

1830 Args: 

1831 documents: Documents to analyze for conflicts 

1832  

1833 Returns: 

1834 Conflict analysis with detected conflicts and resolution suggestions 

1835 """ 

1836 try: 

1837 conflict_analysis = self.cross_document_engine.conflict_detector.detect_conflicts(documents) 

1838 # Convert ConflictAnalysis object to dictionary format 

1839 return { 

1840 "conflicting_pairs": conflict_analysis.conflicting_pairs, 

1841 "conflict_categories": conflict_analysis.conflict_categories, 

1842 "resolution_suggestions": conflict_analysis.resolution_suggestions 

1843 } 

1844 except Exception as e: 

1845 self.logger.error("Error detecting conflicts", error=str(e)) 

1846 raise 

1847 
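# Hypothetical usage sketch (not part of the original source): `engine` and `docs`
# are assumptions; the keys mirror the dictionary assembled above.
async def demo_conflicts(engine, docs) -> None:
    analysis = await engine.detect_document_conflicts(docs)
    print(len(analysis["conflicting_pairs"]), "conflicting pairs detected")
    print("categories:", analysis["conflict_categories"])
    print("suggestions:", analysis["resolution_suggestions"])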

1848 async def find_complementary_content( 

1849 self, 

1850 target_document: SearchResult, 

1851 documents: list[SearchResult], 

1852 max_recommendations: int = 5 

1853 ) -> list[dict[str, Any]]: 

1854 """ 

1855 🔥 Phase 2.3: Find content that complements the target document. 

1856  

1857 Args: 

1858 target_document: Document to find complementary content for 

1859 documents: Pool of documents to search within 

1860 max_recommendations: Maximum number of recommendations 

1861  

1862 Returns: 

1863 List of complementary documents with recommendation reasons 

1864 """ 

1865 try: 

1866 complementary_content = self.cross_document_engine.complementary_finder.find_complementary_content( 

1867 target_document, 

1868 documents 

1869 ) 

1870 # Get top recommendations and enhance with document objects 

1871 recommendations = complementary_content.get_top_recommendations(max_recommendations) 

1872 

1873 # Create lookup dictionary for documents by ID 

1874 doc_lookup = {f"{doc.source_type}:{doc.source_title}": doc for doc in documents} 

1875 

1876 # Enhance recommendations with full document objects 

1877 enhanced_recommendations = [] 

1878 for rec in recommendations: 

1879 doc_id = rec["document_id"] 

1880 if doc_id in doc_lookup: 

1881 enhanced_rec = { 

1882 "document": doc_lookup[doc_id], # Include full document object 

1883 "relevance_score": rec["relevance_score"], 

1884 "recommendation_reason": rec["recommendation_reason"], 

1885 "strategy": rec["strategy"] 

1886 } 

1887 enhanced_recommendations.append(enhanced_rec) 

1888 

1889 return enhanced_recommendations 

1890 except Exception as e: 

1891 self.logger.error("Error finding complementary content", error=str(e)) 

1892 raise 

1893 
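# Hypothetical usage sketch (not part of the original source): `engine` and `docs`
# are assumptions; each recommendation carries the full document plus the fields
# populated above.
async def demo_complementary(engine, docs) -> None:
    target = docs[0]
    recs = await engine.find_complementary_content(target, docs, max_recommendations=3)
    for rec in recs:
        print(rec["document"].source_title, rec["relevance_score"], rec["strategy"])
        print("  why:", rec["recommendation_reason"])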

1894 async def cluster_documents( 

1895 self, 

1896 documents: list[SearchResult], 

1897 strategy: ClusteringStrategy = ClusteringStrategy.MIXED_FEATURES, 

1898 max_clusters: int = 10, 

1899 min_cluster_size: int = 2 

1900 ) -> dict[str, Any]: 

1901 """ 

1902 🔥 Phase 2.3: Cluster documents based on similarity and relationships. 

1903  

1904 Args: 

1905 documents: Documents to cluster 

1906 strategy: Clustering strategy to use 

1907 max_clusters: Maximum number of clusters to create 

1908 min_cluster_size: Minimum size for a cluster 

1909  

1910 Returns: 

1911 Document clusters with metadata and relationships 

1912 """ 

1913 try: 

1914 clusters = self.cross_document_engine.cluster_analyzer.create_clusters( 

1915 documents, 

1916 strategy, 

1917 max_clusters, 

1918 min_cluster_size 

1919 ) 

1920 

1921 # Convert to serializable format 

1922 cluster_data = [] 

1923 for cluster in clusters: 

1924 cluster_data.append({ 

1925 "id": cluster.cluster_id, 

1926 "documents": cluster.documents, 

1927 "centroid_topics": cluster.shared_topics, 

1928 "shared_entities": cluster.shared_entities, 

1929 "coherence_score": cluster.coherence_score, 

1930 "cluster_summary": cluster.cluster_description 

1931 }) 

1932 

1933 return { 

1934 "clusters": cluster_data, 

1935 "clustering_metadata": { 

1936 "strategy": strategy.value, 

1937 "total_clusters": len(clusters), 

1938 "total_documents": len(documents) 

1939 } 

1940 } 

1941 

1942 except Exception as e: 

1943 self.logger.error("Error clustering documents", error=str(e)) 

1944 raise
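# Hypothetical usage sketch (not part of the original source): `engine` and `docs`
# are assumptions; the default ClusteringStrategy.MIXED_FEATURES from the signature
# above is used implicitly.
async def demo_clustering(engine, docs) -> None:
    result = await engine.cluster_documents(docs, max_clusters=5, min_cluster_size=2)
    meta = result["clustering_metadata"]
    print(f"{meta['total_clusters']} clusters over {meta['total_documents']} documents ({meta['strategy']})")
    for cluster in result["clusters"]:
        print(cluster["id"], cluster["coherence_score"], cluster["cluster_summary"])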