Coverage for src/qdrant_loader_mcp_server/search/hybrid_search.py: 72% (716 statements)
coverage.py v7.10.0, created at 2025-07-25 11:38 +0000
1"""Hybrid search implementation combining vector and keyword search."""
3import re
4from dataclasses import dataclass
5from datetime import datetime
6from typing import Any
8import numpy as np
9from openai import AsyncOpenAI
10from qdrant_client import QdrantClient
11from qdrant_client.http import models
12from rank_bm25 import BM25Okapi
14from ..utils.logging import LoggingConfig
15from .models import SearchResult
16from .nlp.spacy_analyzer import SpaCyQueryAnalyzer
17# 🔥 NEW: Phase 2.2 Intent-Aware Adaptive Search
18from .enhanced.intent_classifier import IntentClassifier, AdaptiveSearchStrategy, SearchIntent
19from .enhanced.knowledge_graph import DocumentKnowledgeGraph
20# 🔥 NEW: Phase 1.2 Topic-Driven Search Chaining
21from .enhanced.topic_search_chain import (
22 TopicSearchChainGenerator,
23 TopicSearchChain,
24 ChainStrategy,
25 TopicChainLink
26)
27# 🔥 NEW: Phase 1.3 Dynamic Faceted Search Interface
28from .enhanced.faceted_search import (
29 FacetType,
30 FacetFilter,
31 FacetedSearchResults,
32 DynamicFacetGenerator,
33 FacetedSearchEngine
34)
35# 🔥 NEW: Phase 2.3 Cross-Document Intelligence
36from .enhanced.cross_document_intelligence import (
37 CrossDocumentIntelligenceEngine,
38 SimilarityMetric,
39 ClusteringStrategy,
40 DocumentSimilarityCalculator
41)
43logger = LoggingConfig.get_logger(__name__)
46@dataclass
47class HybridSearchResult:
48 """Container for hybrid search results with comprehensive metadata."""
50 score: float
51 text: str
52 source_type: str
53 source_title: str
54 source_url: str | None = None
55 file_path: str | None = None
56 repo_name: str | None = None
57 vector_score: float = 0.0
58 keyword_score: float = 0.0
60 # Project information (for multi-project support)
61 project_id: str | None = None
62 project_name: str | None = None
63 project_description: str | None = None
64 collection_name: str | None = None
66 # Hierarchy information (primarily for Confluence)
67 parent_id: str | None = None
68 parent_title: str | None = None
69 breadcrumb_text: str | None = None
70 depth: int | None = None
71 children_count: int | None = None
72 hierarchy_context: str | None = None
74 # Attachment information (for files attached to documents)
75 is_attachment: bool = False
76 parent_document_id: str | None = None
77 parent_document_title: str | None = None
78 attachment_id: str | None = None
79 original_filename: str | None = None
80 file_size: int | None = None
81 mime_type: str | None = None
82 attachment_author: str | None = None
83 attachment_context: str | None = None
85 # 🔥 NEW: Section-level intelligence
86 section_title: str | None = None
87 section_type: str | None = None # e.g., "h1", "h2", "content"
88 section_level: int | None = None
89 section_anchor: str | None = None
90 section_breadcrumb: str | None = None
91 section_depth: int | None = None
93 # 🔥 NEW: Content analysis
94 has_code_blocks: bool = False
95 has_tables: bool = False
96 has_images: bool = False
97 has_links: bool = False
98 word_count: int | None = None
99 char_count: int | None = None
100 estimated_read_time: int | None = None # minutes
101 paragraph_count: int | None = None
103 # 🔥 NEW: Semantic analysis (NLP results)
104 entities: list[dict | str] | None = None
105 topics: list[dict | str] | None = None
106 key_phrases: list[dict | str] | None = None
107 pos_tags: list[dict] | None = None
109 # 🔥 NEW: Navigation context
110 previous_section: str | None = None
111 next_section: str | None = None
112 sibling_sections: list[str] | None = None
113 subsections: list[str] | None = None
114 document_hierarchy: list[str] | None = None
116 # 🔥 NEW: Chunking context
117 chunk_index: int | None = None
118 total_chunks: int | None = None
119 chunking_strategy: str | None = None
121 # 🔥 NEW: File conversion intelligence
122 original_file_type: str | None = None
123 conversion_method: str | None = None
124 is_excel_sheet: bool = False
125 is_converted: bool = False
127 # 🔥 NEW: Cross-references and enhanced context
128 cross_references: list[dict] | None = None
129 topic_analysis: dict | None = None
130 content_type_context: str | None = None # Human-readable content description
132 def __post_init__(self):
133 """Initialize default values for list fields."""
134 if self.entities is None:
135 self.entities = []
136 if self.topics is None:
137 self.topics = []
138 if self.key_phrases is None:
139 self.key_phrases = []
140 if self.pos_tags is None:
141 self.pos_tags = []
142 if self.sibling_sections is None:
143 self.sibling_sections = []
144 if self.subsections is None:
145 self.subsections = []
146 if self.document_hierarchy is None:
147 self.document_hierarchy = []
148 if self.cross_references is None:
149 self.cross_references = []
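# Illustrative sketch (not part of the original file): constructing a minimal
# HybridSearchResult. Only score, text, source_type and source_title are
# required; the list-valued fields above are normalized to [] by __post_init__.
#
#     result = HybridSearchResult(
#         score=0.82,
#         text="OAuth2 is used for service-to-service authentication.",
#         source_type="confluence",           # example value, not prescribed here
#         source_title="Authentication Guide",
#     )
#     assert result.entities == [] and result.cross_references == []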
152class HybridSearchEngine:
153 """Service for hybrid search combining vector and keyword search."""
155 def __init__(
156 self,
157 qdrant_client: QdrantClient,
158 openai_client: AsyncOpenAI,
159 collection_name: str,
160 vector_weight: float = 0.6,
161 keyword_weight: float = 0.3,
162 metadata_weight: float = 0.1,
163 min_score: float = 0.3,
164 dense_vector_name: str = "dense",
165 sparse_vector_name: str = "sparse",
166 alpha: float = 0.5,
167 # 🔥 NEW: Phase 2.2 parameters
168 knowledge_graph: DocumentKnowledgeGraph = None,
169 enable_intent_adaptation: bool = True,
170 ):
171 """Initialize the hybrid search service.
173 Args:
174 qdrant_client: Qdrant client instance
175 openai_client: OpenAI client instance
176 collection_name: Name of the Qdrant collection
177 vector_weight: Weight for vector search scores (0-1)
178 keyword_weight: Weight for keyword search scores (0-1)
179 metadata_weight: Weight for metadata-based scoring (0-1)
180 min_score: Minimum combined score threshold
181 dense_vector_name: Name of the dense vector field
182 sparse_vector_name: Name of the sparse vector field
183 alpha: Weight for dense search (1-alpha for sparse search)
184 knowledge_graph: Optional knowledge graph for Phase 2.1 integration
185 enable_intent_adaptation: Enable Phase 2.2 intent-aware adaptive search
187 """
188 self.qdrant_client = qdrant_client
189 self.openai_client = openai_client
190 self.collection_name = collection_name
191 self.vector_weight = vector_weight
192 self.keyword_weight = keyword_weight
193 self.metadata_weight = metadata_weight
194 self.min_score = min_score
195 self.dense_vector_name = dense_vector_name
196 self.sparse_vector_name = sparse_vector_name
197 self.alpha = alpha
198 self.logger = LoggingConfig.get_logger(__name__)
200 # 🔥 NEW: Initialize spaCy query analyzer for intelligent query processing
201 self.spacy_analyzer = SpaCyQueryAnalyzer(spacy_model="en_core_web_md")
203 # 🔥 NEW: Phase 2.2 Intent-Aware Adaptive Search
204 self.enable_intent_adaptation = enable_intent_adaptation
205 self.knowledge_graph = knowledge_graph
207 if self.enable_intent_adaptation:
208 self.intent_classifier = IntentClassifier(self.spacy_analyzer)
209 self.adaptive_strategy = AdaptiveSearchStrategy(self.knowledge_graph)
210 logger.info("🔥 Phase 2.2: Intent-aware adaptive search ENABLED")
211 else:
212 self.intent_classifier = None
213 self.adaptive_strategy = None
214 logger.info("Intent-aware adaptive search DISABLED")
216 # 🔥 NEW: Phase 1.2 Topic-Driven Search Chaining
217 self.topic_chain_generator = TopicSearchChainGenerator(
218 self.spacy_analyzer,
219 self.knowledge_graph
220 )
221 self._topic_chains_initialized = False
222 logger.info("🔥 Phase 1.2: Topic-driven search chaining ENABLED")
224 # 🔥 NEW: Phase 1.3 Dynamic Faceted Search Interface
225 self.faceted_search_engine = FacetedSearchEngine()
226 logger.info("🔥 Phase 1.3: Dynamic faceted search interface ENABLED")
228 # Cross-Document Intelligence (always enabled)
229 self.cross_document_engine = CrossDocumentIntelligenceEngine(
230 self.spacy_analyzer,
231 self.knowledge_graph
232 )
233 logger.info("Cross-document intelligence ENABLED")
235 # Enhanced query expansions leveraging spaCy semantic understanding
236 self.query_expansions = {
237 "product requirements": [
238 "PRD",
239 "requirements document",
240 "product specification",
241 ],
242 "requirements": ["specs", "requirements document", "features"],
243 "architecture": ["system design", "technical architecture"],
244 "UI": ["user interface", "frontend", "design"],
245 "API": ["interface", "endpoints", "REST"],
246 "database": ["DB", "data storage", "persistence"],
247 "security": ["auth", "authentication", "authorization"],
248 # 🔥 NEW: Content-type aware expansions
249 "code": ["implementation", "function", "method", "class"],
250 "documentation": ["docs", "guide", "manual", "instructions"],
251 "config": ["configuration", "settings", "setup"],
252 "table": ["data", "spreadsheet", "excel", "csv"],
253 "image": ["screenshot", "diagram", "chart", "visual"],
254 "link": ["reference", "url", "external", "connection"],
255 }
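# Illustrative usage sketch (not part of the original file), assuming a locally
# running Qdrant instance and an OPENAI_API_KEY in the environment; the
# collection name "documents" is a placeholder:
#
#     engine = HybridSearchEngine(
#         qdrant_client=QdrantClient(url="http://localhost:6333"),
#         openai_client=AsyncOpenAI(),
#         collection_name="documents",
#         vector_weight=0.6,    # defaults shown explicitly
#         keyword_weight=0.3,
#         metadata_weight=0.1,
#         min_score=0.3,
#     )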
257 async def _expand_query(self, query: str) -> str:
258 """🔥 ENHANCED: Expand query with spaCy semantic understanding and related terms."""
259 # Use spaCy analyzer for intelligent query expansion
260 try:
261 query_analysis = self.spacy_analyzer.analyze_query_semantic(query)
263 # Start with original query
264 expanded_query = query
266 # Add semantic keywords for broader matching
267 if query_analysis.semantic_keywords:
268 # Add top semantic keywords
269 semantic_terms = " ".join(query_analysis.semantic_keywords[:3])
270 expanded_query = f"{query} {semantic_terms}"
272 # Add main concepts for concept-based expansion
273 if query_analysis.main_concepts:
274 concept_terms = " ".join(query_analysis.main_concepts[:2])
275 expanded_query = f"{expanded_query} {concept_terms}"
277 # Legacy expansion logic as fallback
278 lower_query = query.lower()
279 for key, expansions in self.query_expansions.items():
280 if key.lower() in lower_query:
281 expansion_terms = " ".join(expansions[:2]) # Limit to avoid over-expansion
282 expanded_query = f"{expanded_query} {expansion_terms}"
283 break
285 if expanded_query != query:
286 self.logger.debug(
287 "🔥 spaCy-enhanced query expansion",
288 original_query=query,
289 expanded_query=expanded_query,
290 semantic_keywords=query_analysis.semantic_keywords[:3],
291 main_concepts=query_analysis.main_concepts[:2],
292 )
294 return expanded_query
296 except Exception as e:
297 self.logger.warning(f"spaCy expansion failed, using fallback: {e}")
298 # Fallback to original expansion logic
299 expanded_query = query
300 lower_query = query.lower()
302 for key, expansions in self.query_expansions.items():
303 if key.lower() in lower_query:
304 expansion_terms = " ".join(expansions)
305 expanded_query = f"{query} {expansion_terms}"
306 self.logger.debug(
307 "Expanded query (fallback)",
308 original_query=query,
309 expanded_query=expanded_query,
310 )
311 break
313 return expanded_query
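# Worked example (illustrative, not from the original file): with the legacy
# expansion table above, a query containing the key "architecture" picks up its
# mapped terms in the fallback path:
#
#     expanded = await engine._expand_query("payment service architecture")
#     # fallback path -> "payment service architecture system design technical architecture"
#     # spaCy path    -> the same query plus up to 3 semantic keywords and 2 main
#     #                  concepts before the table-based terms are appended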
315 async def _expand_query_aggressive(self, query: str) -> str:
316 """🔥 NEW: More aggressive query expansion for exploratory searches."""
317 try:
318 query_analysis = self.spacy_analyzer.analyze_query_semantic(query)
320 # Start with original query
321 expanded_query = query
323 # Add more semantic keywords (increased from 3 to 5)
324 if query_analysis.semantic_keywords:
325 semantic_terms = " ".join(query_analysis.semantic_keywords[:5])
326 expanded_query = f"{query} {semantic_terms}"
328 # Add more main concepts (increased from 2 to 4)
329 if query_analysis.main_concepts:
330 concept_terms = " ".join(query_analysis.main_concepts[:4])
331 expanded_query = f"{expanded_query} {concept_terms}"
333 # Add entity-based expansion
334 if query_analysis.entities:
335 entity_terms = " ".join([ent[0] for ent in query_analysis.entities[:3]])
336 expanded_query = f"{expanded_query} {entity_terms}"
338 # Apply multiple legacy expansions for exploration
339 lower_query = query.lower()
340 expansion_count = 0
341 for key, expansions in self.query_expansions.items():
342 if key.lower() in lower_query and expansion_count < 3: # Max 3 expansions
343 expansion_terms = " ".join(expansions[:3])
344 expanded_query = f"{expanded_query} {expansion_terms}"
345 expansion_count += 1
347 self.logger.debug(
348 "🔥 Aggressive query expansion for exploration",
349 original_query=query,
350 expanded_query=expanded_query,
351 expansion_ratio=len(expanded_query.split()) / len(query.split()),
352 )
354 return expanded_query
356 except Exception as e:
357 self.logger.warning(f"Aggressive expansion failed, using standard: {e}")
358 return await self._expand_query(query)
360 async def _get_embedding(self, text: str) -> list[float]:
361 """Get embedding for text using OpenAI."""
362 try:
363 response = await self.openai_client.embeddings.create(
364 model="text-embedding-3-small",
365 input=text,
366 )
367 return response.data[0].embedding
368 except Exception as e:
369 self.logger.error("Failed to get embedding", error=str(e))
370 raise
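# Note (added for clarity): "text-embedding-3-small" produces 1536-dimensional
# embeddings by default, so the dense vectors in the Qdrant collection are
# expected to have a matching size.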
372 async def search(
373 self,
374 query: str,
375 limit: int = 5,
376 source_types: list[str] | None = None,
377 project_ids: list[str] | None = None,
378 # 🔥 NEW: Phase 2.2 parameters
379 session_context: dict[str, Any] | None = None,
380 behavioral_context: list[str] | None = None,
381 ) -> list[SearchResult]:
382 """Perform hybrid search combining vector and keyword search.
384 Args:
385 query: Search query text
386 limit: Maximum number of results to return
387 source_types: Optional list of source types to filter by
388 project_ids: Optional list of project IDs to filter by
389 session_context: Optional session context for intent classification
390 behavioral_context: Optional behavioral context (previous intents)
391 """
392 self.logger.debug(
393 "Starting hybrid search",
394 query=query,
395 limit=limit,
396 source_types=source_types,
397 project_ids=project_ids,
398 intent_adaptation_enabled=self.enable_intent_adaptation,
399 )
401 try:
402 # 🔥 NEW: Phase 2.2 Intent Classification and Adaptive Search
403 search_intent = None
404 adaptive_config = None
406 if self.enable_intent_adaptation and self.intent_classifier:
407 # Classify search intent using comprehensive spaCy analysis
408 search_intent = self.intent_classifier.classify_intent(
409 query, session_context, behavioral_context
410 )
412 # Adapt search configuration based on classified intent
413 adaptive_config = self.adaptive_strategy.adapt_search(
414 search_intent, query
415 )
417 # Update search parameters based on adaptive configuration
418 if adaptive_config:
419 # Override weights based on intent
420 original_vector_weight = self.vector_weight
421 original_keyword_weight = self.keyword_weight
422 original_min_score = self.min_score
424 self.vector_weight = adaptive_config.vector_weight
425 self.keyword_weight = adaptive_config.keyword_weight
426 self.min_score = adaptive_config.min_score_threshold
428 # Adjust limit based on intent configuration
429 limit = min(adaptive_config.max_results, limit * 2)
431 self.logger.debug(
432 "🔥 Adapted search parameters based on intent",
433 intent=search_intent.intent_type.value,
434 confidence=search_intent.confidence,
435 vector_weight=self.vector_weight,
436 keyword_weight=self.keyword_weight,
437 adjusted_limit=limit,
438 use_kg=adaptive_config.use_knowledge_graph,
439 )
441 # Expand query with related terms (now potentially adapted)
442 expanded_query = await self._expand_query(query)
444 # Apply intent-specific query expansion if available
445 if adaptive_config and adaptive_config.expand_query:
446 if adaptive_config.expansion_aggressiveness > 0.5:
447 # More aggressive expansion for exploratory queries
448 expanded_query = await self._expand_query_aggressive(query)
450 # Get vector search results
451 vector_results = await self._vector_search(
452 expanded_query, limit * 3, project_ids
453 )
455 # Get keyword search results
456 keyword_results = await self._keyword_search(query, limit * 3, project_ids)
458 # Analyze query for context
459 query_context = self._analyze_query(query)
461 # 🔥 NEW: Add intent information to query context
462 if search_intent:
463 query_context["search_intent"] = search_intent
464 query_context["adaptive_config"] = adaptive_config
466 # Combine and rerank results
467 combined_results = await self._combine_results(
468 vector_results,
469 keyword_results,
470 query_context,
471 limit,
472 source_types,
473 project_ids,
474 )
476 # 🔥 NEW: Restore original search parameters if they were modified
477 if adaptive_config:
478 self.vector_weight = original_vector_weight
479 self.keyword_weight = original_keyword_weight
480 self.min_score = original_min_score
482 # Convert to SearchResult objects
483 return [
484 SearchResult(
485 score=result.score,
486 text=result.text,
487 source_type=result.source_type,
488 source_title=result.source_title,
489 source_url=result.source_url,
490 file_path=result.file_path,
491 repo_name=result.repo_name,
493 # Project information
494 project_id=result.project_id,
495 project_name=result.project_name,
496 project_description=result.project_description,
497 collection_name=result.collection_name,
499 # Basic hierarchy and attachment (existing)
500 parent_id=result.parent_id,
501 parent_title=result.parent_title,
502 breadcrumb_text=result.breadcrumb_text,
503 depth=result.depth,
504 children_count=result.children_count,
505 hierarchy_context=result.hierarchy_context,
506 is_attachment=result.is_attachment,
507 parent_document_id=result.parent_document_id,
508 parent_document_title=result.parent_document_title,
509 attachment_id=result.attachment_id,
510 original_filename=result.original_filename,
511 file_size=result.file_size,
512 mime_type=result.mime_type,
513 attachment_author=result.attachment_author,
514 attachment_context=result.attachment_context,
516 # 🔥 NEW: Section-level intelligence
517 section_title=result.section_title,
518 section_type=result.section_type,
519 section_level=result.section_level,
520 section_anchor=result.section_anchor,
521 section_breadcrumb=result.section_breadcrumb,
522 section_depth=result.section_depth,
524 # 🔥 NEW: Content analysis
525 has_code_blocks=result.has_code_blocks,
526 has_tables=result.has_tables,
527 has_images=result.has_images,
528 has_links=result.has_links,
529 word_count=result.word_count,
530 char_count=result.char_count,
531 estimated_read_time=result.estimated_read_time,
532 paragraph_count=result.paragraph_count,
534 # 🔥 NEW: Semantic analysis
535 entities=result.entities,
536 topics=result.topics,
537 key_phrases=result.key_phrases,
538 pos_tags=result.pos_tags,
540 # 🔥 NEW: Navigation context
541 previous_section=result.previous_section,
542 next_section=result.next_section,
543 sibling_sections=result.sibling_sections,
544 subsections=result.subsections,
545 document_hierarchy=result.document_hierarchy,
547 # 🔥 NEW: Chunking context
548 chunk_index=result.chunk_index,
549 total_chunks=result.total_chunks,
550 chunking_strategy=result.chunking_strategy,
552 # 🔥 NEW: File conversion intelligence
553 original_file_type=result.original_file_type,
554 conversion_method=result.conversion_method,
555 is_excel_sheet=result.is_excel_sheet,
556 is_converted=result.is_converted,
558 # 🔥 NEW: Cross-references and enhanced context
559 cross_references=result.cross_references,
560 topic_analysis=result.topic_analysis,
561 content_type_context=result.content_type_context,
562 )
563 for result in combined_results
564 ]
566 except Exception as e:
567 self.logger.error("Error in hybrid search", error=str(e), query=query)
568 raise
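# Illustrative usage sketch (not part of the original file); the source type
# values are placeholders for whatever source types exist in the collection:
#
#     results = await engine.search(
#         query="how do we authenticate API requests?",
#         limit=5,
#         source_types=["confluence", "git"],
#         project_ids=None,
#     )
#     for r in results:
#         print(r.score, r.source_title)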
570 async def generate_topic_search_chain(
571 self,
572 query: str,
573 strategy: ChainStrategy = ChainStrategy.MIXED_EXPLORATION,
574 max_links: int = 5,
575 initialize_from_search: bool = True
576 ) -> TopicSearchChain:
577 """🔥 NEW: Generate a topic-driven search chain for progressive content discovery.
579 Args:
580 query: Original search query
581 strategy: Strategy for chain generation
582 max_links: Maximum number of links in the chain
583 initialize_from_search: Whether to initialize topic relationships from search results
585 Returns:
586 TopicSearchChain with progressive queries for exploration
587 """
588 self.logger.debug(
589 "Generating topic search chain",
590 query=query,
591 strategy=strategy.value,
592 max_links=max_links
593 )
595 try:
596 # Initialize topic relationships from search results if needed
597 if initialize_from_search and not self._topic_chains_initialized:
598 await self._initialize_topic_relationships(query)
600 # Generate the topic search chain
601 topic_chain = self.topic_chain_generator.generate_search_chain(
602 original_query=query,
603 strategy=strategy,
604 max_links=max_links
605 )
607 self.logger.info(
608 "Topic search chain generated successfully",
609 chain_length=len(topic_chain.chain_links),
610 strategy=strategy.value,
611 topics_covered=topic_chain.total_topics_covered,
612 discovery_potential=f"{topic_chain.estimated_discovery_potential:.2f}",
613 generation_time=f"{topic_chain.generation_time_ms:.1f}ms"
614 )
616 return topic_chain
618 except Exception as e:
619 self.logger.error("Error generating topic search chain", error=str(e), query=query)
620 raise
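# Illustrative usage sketch (not part of the original file):
#
#     chain = await engine.generate_topic_search_chain(
#         query="vector database performance",
#         strategy=ChainStrategy.MIXED_EXPLORATION,   # the default strategy
#         max_links=5,
#     )
#     for link in chain.chain_links:
#         print(link.query, link.topic_focus, link.exploration_type)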
622 async def execute_topic_chain_search(
623 self,
624 topic_chain: TopicSearchChain,
625 results_per_link: int = 3,
626 source_types: list[str] | None = None,
627 project_ids: list[str] | None = None
628 ) -> dict[str, list[SearchResult]]:
629 """🔥 NEW: Execute searches for all links in a topic chain.
631 Args:
632 topic_chain: The topic search chain to execute
633 results_per_link: Number of results to return per chain link
634 source_types: Optional source type filters
635 project_ids: Optional project ID filters
637 Returns:
638 Dictionary mapping chain link queries to their search results
639 """
640 self.logger.debug(
641 "Executing topic chain search",
642 chain_length=len(topic_chain.chain_links),
643 results_per_link=results_per_link
644 )
646 chain_results = {}
648 try:
649 # Execute search for original query
650 original_results = await self.search(
651 query=topic_chain.original_query,
652 limit=results_per_link,
653 source_types=source_types,
654 project_ids=project_ids
655 )
656 chain_results[topic_chain.original_query] = original_results
658 # Execute search for each chain link
659 for link in topic_chain.chain_links:
660 try:
661 link_results = await self.search(
662 query=link.query,
663 limit=results_per_link,
664 source_types=source_types,
665 project_ids=project_ids
666 )
667 chain_results[link.query] = link_results
669 self.logger.debug(
670 "Executed chain link search",
671 query=link.query,
672 results_count=len(link_results),
673 topic_focus=link.topic_focus,
674 exploration_type=link.exploration_type
675 )
677 except Exception as e:
678 self.logger.warning(
679 "Failed to execute chain link search",
680 query=link.query,
681 error=str(e)
682 )
683 chain_results[link.query] = []
685 total_results = sum(len(results) for results in chain_results.values())
686 self.logger.info(
687 "Topic chain search execution completed",
688 total_queries=len(chain_results),
689 total_results=total_results,
690 original_query=topic_chain.original_query
691 )
693 return chain_results
695 except Exception as e:
696 self.logger.error("Error executing topic chain search", error=str(e))
697 raise
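# Illustrative usage sketch (not part of the original file), continuing from a
# chain produced by generate_topic_search_chain():
#
#     chain_results = await engine.execute_topic_chain_search(
#         topic_chain=chain,
#         results_per_link=3,
#     )
#     # chain_results maps the original query and every link query to its list
#     # of SearchResult objects (an empty list when a link search fails).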
699 async def _initialize_topic_relationships(self, sample_query: str) -> None:
700 """Initialize topic relationships from a sample search to bootstrap topic chaining."""
701 try:
702 # Perform a broad search to get diverse results for topic relationship mapping
703 sample_results = await self.search(
704 query=sample_query,
705 limit=20, # Get more results for better topic coverage
706 source_types=None,
707 project_ids=None
708 )
710 if sample_results:
711 # Initialize topic relationships from the sample results
712 self.topic_chain_generator.initialize_from_results(sample_results)
713 self._topic_chains_initialized = True
715 self.logger.info(
716 "Topic relationships initialized from search results",
717 sample_query=sample_query,
718 sample_results_count=len(sample_results)
719 )
720 else:
721 self.logger.warning(
722 "No search results available for topic relationship initialization",
723 sample_query=sample_query
724 )
726 except Exception as e:
727 self.logger.error(
728 "Failed to initialize topic relationships",
729 error=str(e),
730 sample_query=sample_query
731 )
732 # Don't raise - topic chaining can still work with limited relationships
734 def _analyze_query(self, query: str) -> dict[str, Any]:
735 """🔥 ENHANCED: Analyze query using spaCy NLP instead of regex patterns."""
736 try:
737 # Use spaCy analyzer for comprehensive query analysis
738 query_analysis = self.spacy_analyzer.analyze_query_semantic(query)
740 # Create enhanced query context using spaCy analysis
741 context = {
742 # Basic query characteristics
743 "is_question": query_analysis.is_question,
744 "is_broad": len(query.split()) < 5,
745 "is_specific": len(query.split()) > 7,
746 "is_technical": query_analysis.is_technical,
747 "complexity_score": query_analysis.complexity_score,
749 # spaCy-powered intent detection
750 "probable_intent": query_analysis.intent_signals.get("primary_intent", "informational"),
751 "intent_confidence": query_analysis.intent_signals.get("confidence", 0.0),
752 "linguistic_features": query_analysis.intent_signals.get("linguistic_features", {}),
754 # Enhanced keyword extraction using spaCy
755 "keywords": query_analysis.semantic_keywords,
756 "entities": [entity[0] for entity in query_analysis.entities], # Extract entity text
757 "entity_types": [entity[1] for entity in query_analysis.entities], # Extract entity labels
758 "main_concepts": query_analysis.main_concepts,
759 "pos_patterns": query_analysis.pos_patterns,
761 # Store query analysis for later use
762 "spacy_analysis": query_analysis,
763 }
765 # Enhanced content type preference detection using spaCy
766 semantic_keywords_set = set(query_analysis.semantic_keywords)
768 # Code preference detection
769 code_keywords = {"code", "function", "implementation", "script", "method", "class", "api"}
770 if semantic_keywords_set.intersection(code_keywords):
771 context["prefers_code"] = True
773 # Table/data preference detection
774 table_keywords = {"table", "data", "excel", "spreadsheet", "csv", "sheet"}
775 if semantic_keywords_set.intersection(table_keywords):
776 context["prefers_tables"] = True
778 # Image preference detection
779 image_keywords = {"image", "diagram", "screenshot", "visual", "chart", "graph"}
780 if semantic_keywords_set.intersection(image_keywords):
781 context["prefers_images"] = True
783 # Documentation preference detection
784 doc_keywords = {"documentation", "doc", "guide", "manual", "instruction", "help"}
785 if semantic_keywords_set.intersection(doc_keywords):
786 context["prefers_docs"] = True
788 self.logger.debug(
789 "🔥 spaCy query analysis completed",
790 intent=context["probable_intent"],
791 confidence=context["intent_confidence"],
792 entities_found=len(query_analysis.entities),
793 keywords_extracted=len(query_analysis.semantic_keywords),
794 processing_time_ms=query_analysis.processing_time_ms,
795 )
797 return context
799 except Exception as e:
800 self.logger.warning(f"spaCy analysis failed, using fallback: {e}")
801 # Fallback to original regex-based analysis
802 return self._analyze_query_fallback(query)
804 def _analyze_query_fallback(self, query: str) -> dict[str, Any]:
805 """Fallback query analysis using original regex patterns."""
806 context = {
807 "is_question": bool(
808 re.search(r"\?|what|how|why|when|who|where", query.lower())
809 ),
810 "is_broad": len(query.split()) < 5,
811 "is_specific": len(query.split()) > 7,
812 "probable_intent": "informational",
813 "keywords": [
814 word.lower() for word in re.findall(r"\b\w{3,}\b", query.lower())
815 ],
816 }
818 lower_query = query.lower()
819 if "how to" in lower_query or "steps" in lower_query:
820 context["probable_intent"] = "procedural"
821 elif any(
822 term in lower_query for term in ["requirements", "prd", "specification"]
823 ):
824 context["probable_intent"] = "requirements"
825 elif any(
826 term in lower_query for term in ["architecture", "design", "structure"]
827 ):
828 context["probable_intent"] = "architecture"
830 # Content type preferences (original logic)
831 if any(term in lower_query for term in ["code", "function", "implementation", "script"]):
832 context["prefers_code"] = True
833 if any(term in lower_query for term in ["table", "data", "excel", "spreadsheet"]):
834 context["prefers_tables"] = True
835 if any(term in lower_query for term in ["image", "diagram", "screenshot", "visual"]):
836 context["prefers_images"] = True
837 if any(term in lower_query for term in ["documentation", "docs", "guide", "manual"]):
838 context["prefers_docs"] = True
840 return context
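# Worked example (illustrative, fallback path only): for the query
# "how to configure the API gateway", _analyze_query_fallback returns roughly
#
#     {
#         "is_question": True,              # regex matches "how"
#         "is_broad": False,                # 6 words, not < 5
#         "is_specific": False,             # 6 words, not > 7
#         "probable_intent": "procedural",  # "how to" detected
#         "keywords": ["how", "configure", "the", "api", "gateway"],
#     }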
842 def _boost_score_with_metadata(
843 self, base_score: float, metadata_info: dict, query_context: dict
844 ) -> float:
845 """🔥 ENHANCED: Boost search scores using spaCy semantic analysis and metadata context."""
846 boosted_score = base_score
847 boost_factor = 0.0
849 # 🔥 NEW: Phase 2.2 Intent-Aware Boosting
850 search_intent = query_context.get("search_intent")
851 adaptive_config = query_context.get("adaptive_config")
853 if search_intent and adaptive_config:
854 # Apply intent-specific ranking boosts
855 ranking_boosts = adaptive_config.ranking_boosts
856 source_type_preferences = adaptive_config.source_type_preferences
858 # Source type preference boosting
859 source_type = metadata_info.get("source_type", "")
860 if source_type in source_type_preferences:
861 source_boost = (source_type_preferences[source_type] - 1.0) * 0.2
862 boost_factor += source_boost
864 # Content type boosting from ranking_boosts
865 for boost_key, boost_value in ranking_boosts.items():
866 if boost_key == "section_type" and isinstance(boost_value, dict):
867 section_type = metadata_info.get("section_type", "")
868 if section_type in boost_value:
869 section_boost = (boost_value[section_type] - 1.0) * 0.15
870 boost_factor += section_boost
871 elif boost_key == "source_type" and isinstance(boost_value, dict):
872 if source_type in boost_value:
873 source_boost = (boost_value[source_type] - 1.0) * 0.15
874 boost_factor += source_boost
875 elif boost_key in metadata_info and metadata_info[boost_key]:
876 # Boolean metadata boosting (e.g., has_money_entities, has_org_entities)
877 if isinstance(boost_value, (int, float)):
878 bool_boost = (boost_value - 1.0) * 0.1
879 boost_factor += bool_boost
881 # Intent-specific confidence boosting
882 confidence_boost = search_intent.confidence * 0.05 # Up to 5% boost for high confidence
883 boost_factor += confidence_boost
885 self.logger.debug(
886 "🔥 Applied intent-aware boosting",
887 intent=search_intent.intent_type.value,
888 confidence=search_intent.confidence,
889 source_type=source_type,
890 total_intent_boost=boost_factor,
891 )
893 # 🔥 Content type relevance boosting (enhanced)
894 if query_context.get("prefers_code") and metadata_info.get("has_code_blocks"):
895 boost_factor += 0.15
897 if query_context.get("prefers_tables") and metadata_info.get("has_tables"):
898 boost_factor += 0.12
900 if query_context.get("prefers_images") and metadata_info.get("has_images"):
901 boost_factor += 0.10
903 if query_context.get("prefers_docs") and not metadata_info.get("has_code_blocks"):
904 boost_factor += 0.08
906 # 🔥 Section level relevance (higher level = more important)
907 section_level = metadata_info.get("section_level")
908 if section_level is not None:
909 if section_level <= 2: # H1, H2 are more important
910 boost_factor += 0.10
911 elif section_level <= 3: # H3 moderately important
912 boost_factor += 0.05
914 # 🔥 Content quality indicators
915 word_count = metadata_info.get("word_count") or 0
916 if word_count > 100: # Substantial content
917 boost_factor += 0.05
918 if word_count > 500: # Very detailed content
919 boost_factor += 0.05
921 # 🔥 Converted file boosting (often contains rich content)
922 if metadata_info.get("is_converted") and metadata_info.get("original_file_type") in ["docx", "xlsx", "pdf"]:
923 boost_factor += 0.08
925 # 🔥 Excel sheet specific boosting for data queries
926 if metadata_info.get("is_excel_sheet") and any(
927 term in " ".join(query_context.get("keywords", []))
928 for term in ["data", "table", "sheet", "excel", "csv"]
929 ):
930 boost_factor += 0.12
932 # 🔥 NEW: spaCy-powered semantic entity relevance
933 if "spacy_analysis" in query_context:
934 spacy_analysis = query_context["spacy_analysis"]
936 # Enhanced entity matching using spaCy similarity
937 entities = metadata_info.get("entities", [])
938 if entities and spacy_analysis.entities:
939 max_entity_similarity = 0.0
940 for entity in entities:
941 entity_text = entity if isinstance(entity, str) else entity.get("text", str(entity))
942 similarity = self.spacy_analyzer.semantic_similarity_matching(
943 spacy_analysis, entity_text
944 )
945 max_entity_similarity = max(max_entity_similarity, similarity)
947 # Apply semantic entity boost based on similarity
948 if max_entity_similarity > 0.6: # High similarity
949 boost_factor += 0.15
950 elif max_entity_similarity > 0.4: # Medium similarity
951 boost_factor += 0.10
952 elif max_entity_similarity > 0.2: # Low similarity
953 boost_factor += 0.05
955 # Enhanced topic relevance using spaCy
956 topics = metadata_info.get("topics", [])
957 if topics and spacy_analysis.main_concepts:
958 max_topic_similarity = 0.0
959 for topic in topics:
960 topic_text = topic if isinstance(topic, str) else topic.get("text", str(topic))
961 for concept in spacy_analysis.main_concepts:
962 similarity = self.spacy_analyzer.semantic_similarity_matching(
963 spacy_analysis, f"{topic_text} {concept}"
964 )
965 max_topic_similarity = max(max_topic_similarity, similarity)
967 # Apply semantic topic boost
968 if max_topic_similarity > 0.5:
969 boost_factor += 0.12
970 elif max_topic_similarity > 0.3:
971 boost_factor += 0.08
973 else:
974 # Fallback to original entity/topic matching
975 entities = metadata_info.get("entities", [])
976 if entities:
977 query_keywords = set(query_context.get("keywords", []))
978 entity_texts = set()
979 for entity in entities:
980 if isinstance(entity, str):
981 entity_texts.add(entity.lower())
982 elif isinstance(entity, dict):
983 if "text" in entity:
984 entity_texts.add(str(entity["text"]).lower())
985 elif "entity" in entity:
986 entity_texts.add(str(entity["entity"]).lower())
987 else:
988 entity_texts.add(str(entity).lower())
990 if query_keywords.intersection(entity_texts):
991 boost_factor += 0.10
993 # Original topic relevance
994 topics = metadata_info.get("topics", [])
995 if topics:
996 query_keywords = set(query_context.get("keywords", []))
997 topic_texts = set()
998 for topic in topics:
999 if isinstance(topic, str):
1000 topic_texts.add(topic.lower())
1001 elif isinstance(topic, dict):
1002 if "text" in topic:
1003 topic_texts.add(str(topic["text"]).lower())
1004 elif "topic" in topic:
1005 topic_texts.add(str(topic["topic"]).lower())
1006 else:
1007 topic_texts.add(str(topic).lower())
1009 if query_keywords.intersection(topic_texts):
1010 boost_factor += 0.08
1012 # Apply boost (cap at reasonable maximum)
1013 boost_factor = min(boost_factor, 0.5) # Maximum 50% boost (increased from 40%)
1014 return boosted_score * (1 + boost_factor)
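# Worked example (illustrative): with base_score=0.50, a code-preferring query
# hitting a chunk that has code blocks (+0.15), sits under an H2 heading
# (section_level=2, +0.10) and contains 350 words (+0.05), and no intent or
# spaCy-similarity boosts, boost_factor = 0.30, so the boosted score is
# 0.50 * (1 + 0.30) = 0.65. The factor is capped at 0.50 in all cases.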
1016 async def _vector_search(
1017 self, query: str, limit: int, project_ids: list[str] | None = None
1018 ) -> list[dict[str, Any]]:
1019 """Perform vector search using Qdrant."""
1020 query_embedding = await self._get_embedding(query)
1022 search_params = models.SearchParams(hnsw_ef=128, exact=False)
1024 results = await self.qdrant_client.search(
1025 collection_name=self.collection_name,
1026 query_vector=query_embedding,
1027 limit=limit,
1028 score_threshold=self.min_score,
1029 search_params=search_params,
1030 query_filter=self._build_filter(project_ids),
1031 )
1033 return [
1034 {
1035 "score": hit.score,
1036 "text": hit.payload.get("content", "") if hit.payload else "",
1037 "metadata": hit.payload.get("metadata", {}) if hit.payload else {},
1038 "source_type": (
1039 hit.payload.get("source_type", "unknown")
1040 if hit.payload
1041 else "unknown"
1042 ),
1043 }
1044 for hit in results
1045 ]
1047 async def _keyword_search(
1048 self, query: str, limit: int, project_ids: list[str] | None = None
1049 ) -> list[dict[str, Any]]:
1050 """Perform keyword search using BM25."""
1051 scroll_results = await self.qdrant_client.scroll(
1052 collection_name=self.collection_name,
1053 limit=10000,
1054 with_payload=True,
1055 with_vectors=False,
1056 scroll_filter=self._build_filter(project_ids),
1057 )
1059 documents = []
1060 metadata_list = []
1061 source_types = []
1063 for point in scroll_results[0]:
1064 if point.payload:
1065 content = point.payload.get("content", "")
1066 metadata = point.payload.get("metadata", {})
1067 source_type = point.payload.get("source_type", "unknown")
1068 documents.append(content)
1069 metadata_list.append(metadata)
1070 source_types.append(source_type)
1072 tokenized_docs = [doc.split() for doc in documents]
1073 bm25 = BM25Okapi(tokenized_docs)
1075 tokenized_query = query.split()
1076 scores = bm25.get_scores(tokenized_query)
1078 top_indices = np.argsort(scores)[-limit:][::-1]
1080 return [
1081 {
1082 "score": float(scores[idx]),
1083 "text": documents[idx],
1084 "metadata": metadata_list[idx],
1085 "source_type": source_types[idx],
1086 }
1087 for idx in top_indices
1088 if scores[idx] > 0
1089 ]
1091 async def _combine_results(
1092 self,
1093 vector_results: list[dict[str, Any]],
1094 keyword_results: list[dict[str, Any]],
1095 query_context: dict[str, Any],
1096 limit: int,
1097 source_types: list[str] | None = None,
1098 project_ids: list[str] | None = None,
1099 ) -> list[HybridSearchResult]:
1100 """Combine and rerank results from vector and keyword search."""
1101 combined_dict = {}
1103 # Process vector results
1104 for result in vector_results:
1105 text = result["text"]
1106 if text not in combined_dict:
1107 metadata = result["metadata"]
1108 combined_dict[text] = {
1109 "text": text,
1110 "metadata": metadata,
1111 "source_type": result["source_type"],
1112 "vector_score": result["score"],
1113 "keyword_score": 0.0,
1114 }
1116 # Process keyword results
1117 for result in keyword_results:
1118 text = result["text"]
1119 if text in combined_dict:
1120 combined_dict[text]["keyword_score"] = result["score"]
1121 else:
1122 metadata = result["metadata"]
1123 combined_dict[text] = {
1124 "text": text,
1125 "metadata": metadata,
1126 "source_type": result["source_type"],
1127 "vector_score": 0.0,
1128 "keyword_score": result["score"],
1129 }
1131 # Calculate combined scores and create results
1132 combined_results = []
1134 # 🔥 NEW: Extract intent-specific filtering configuration
1135 search_intent = query_context.get("search_intent")
1136 adaptive_config = query_context.get("adaptive_config")
1137 result_filters = adaptive_config.result_filters if adaptive_config else {}
1139 for text, info in combined_dict.items():
1140 # Skip if source type doesn't match filter
1141 if source_types and info["source_type"] not in source_types:
1142 continue
1144 metadata = info["metadata"]
1145 metadata_info = self._extract_metadata_info(metadata)
1147 # 🔥 NEW: Apply intent-specific result filtering
1148 if search_intent and result_filters:
1149 should_skip = False
1151 # Content type filtering
1152 if "content_type" in result_filters:
1153 allowed_content_types = result_filters["content_type"]
1154 # Check if any content type indicators match
1155 has_matching_content = False
1157 for content_type in allowed_content_types:
1158 if content_type == "code" and metadata_info.get("has_code_blocks"):
1159 has_matching_content = True
1160 break
1161 elif content_type == "documentation" and not metadata_info.get("has_code_blocks"):
1162 has_matching_content = True
1163 break
1164 elif content_type == "technical" and query_context.get("is_technical"):
1165 has_matching_content = True
1166 break
1167 elif content_type in ["requirements", "business", "strategy"]:
1168 # Check if content mentions business terms
1169 business_indicators = metadata_info.get("business_indicators", 0)
1170 if business_indicators > 0:
1171 has_matching_content = True
1172 break
1173 elif content_type in ["guide", "tutorial", "procedure"]:
1174 # Check for procedural content
1175 section_type = metadata_info.get("section_type", "").lower()
1176 if any(proc_word in section_type for proc_word in ["step", "guide", "procedure", "tutorial"]):
1177 has_matching_content = True
1178 break
1180 if not has_matching_content:
1181 should_skip = True
1183 if should_skip:
1184 continue
1186 combined_score = (
1187 self.vector_weight * info["vector_score"]
1188 + self.keyword_weight * info["keyword_score"]
1189 )
1191 if combined_score >= self.min_score:
1192 # Extract project information
1193 project_info = self._extract_project_info(metadata)
1195 boosted_score = self._boost_score_with_metadata(
1196 combined_score, metadata_info, query_context
1197 )
1199 combined_results.append(
1200 HybridSearchResult(
1201 score=boosted_score,
1202 text=text,
1203 source_type=info["source_type"],
1204 source_title=metadata.get("title", ""),
1205 source_url=metadata.get("url"),
1206 file_path=metadata.get("file_path"),
1207 repo_name=metadata.get("repository_name"),
1208 vector_score=info["vector_score"],
1209 keyword_score=info["keyword_score"],
1211 # Project information
1212 project_id=project_info["project_id"],
1213 project_name=project_info["project_name"],
1214 project_description=project_info["project_description"],
1215 collection_name=project_info["collection_name"],
1217 # Basic hierarchy and attachment (existing)
1218 parent_id=metadata_info["parent_id"],
1219 parent_title=metadata_info["parent_title"],
1220 breadcrumb_text=metadata_info["breadcrumb_text"],
1221 depth=metadata_info["depth"],
1222 children_count=metadata_info["children_count"],
1223 hierarchy_context=metadata_info["hierarchy_context"],
1224 is_attachment=metadata_info["is_attachment"],
1225 parent_document_id=metadata_info["parent_document_id"],
1226 parent_document_title=metadata_info["parent_document_title"],
1227 attachment_id=metadata_info["attachment_id"],
1228 original_filename=metadata_info["original_filename"],
1229 file_size=metadata_info["file_size"],
1230 mime_type=metadata_info["mime_type"],
1231 attachment_author=metadata_info["attachment_author"],
1232 attachment_context=metadata_info["attachment_context"],
1234 # 🔥 NEW: Section-level intelligence
1235 section_title=metadata_info["section_title"],
1236 section_type=metadata_info["section_type"],
1237 section_level=metadata_info["section_level"],
1238 section_anchor=metadata_info["section_anchor"],
1239 section_breadcrumb=metadata_info["section_breadcrumb"],
1240 section_depth=metadata_info["section_depth"],
1242 # 🔥 NEW: Content analysis
1243 has_code_blocks=metadata_info["has_code_blocks"],
1244 has_tables=metadata_info["has_tables"],
1245 has_images=metadata_info["has_images"],
1246 has_links=metadata_info["has_links"],
1247 word_count=metadata_info["word_count"],
1248 char_count=metadata_info["char_count"],
1249 estimated_read_time=metadata_info["estimated_read_time"],
1250 paragraph_count=metadata_info["paragraph_count"],
1252 # 🔥 NEW: Semantic analysis
1253 entities=metadata_info["entities"],
1254 topics=metadata_info["topics"],
1255 key_phrases=metadata_info["key_phrases"],
1256 pos_tags=metadata_info["pos_tags"],
1258 # 🔥 NEW: Navigation context
1259 previous_section=metadata_info["previous_section"],
1260 next_section=metadata_info["next_section"],
1261 sibling_sections=metadata_info["sibling_sections"],
1262 subsections=metadata_info["subsections"],
1263 document_hierarchy=metadata_info["document_hierarchy"],
1265 # 🔥 NEW: Chunking context
1266 chunk_index=metadata_info["chunk_index"],
1267 total_chunks=metadata_info["total_chunks"],
1268 chunking_strategy=metadata_info["chunking_strategy"],
1270 # 🔥 NEW: File conversion intelligence
1271 original_file_type=metadata_info["original_file_type"],
1272 conversion_method=metadata_info["conversion_method"],
1273 is_excel_sheet=metadata_info["is_excel_sheet"],
1274 is_converted=metadata_info["is_converted"],
1276 # 🔥 NEW: Cross-references and enhanced context
1277 cross_references=metadata_info["cross_references"],
1278 topic_analysis=metadata_info["topic_analysis"],
1279 content_type_context=metadata_info["content_type_context"],
1280 )
1281 )
1283 # Sort by combined score
1284 combined_results.sort(key=lambda x: x.score, reverse=True)
1286 # 🔥 NEW: Apply diversity filtering for exploratory intents
1287 if adaptive_config and adaptive_config.diversity_factor > 0.0:
1288 diverse_results = self._apply_diversity_filtering(
1289 combined_results, adaptive_config.diversity_factor, limit
1290 )
1291 self.logger.debug(
1292 "🔥 Applied diversity filtering",
1293 original_count=len(combined_results),
1294 diverse_count=len(diverse_results),
1295 diversity_factor=adaptive_config.diversity_factor,
1296 )
1297 return diverse_results
1299 return combined_results[:limit]
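# Worked example (illustrative): with the default weights (vector 0.6,
# keyword 0.3), a chunk scoring 0.80 on vector search and 0.50 on BM25 gets
# combined_score = 0.6 * 0.80 + 0.3 * 0.50 = 0.63, which clears the default
# min_score of 0.3 and is then passed through _boost_score_with_metadata().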
1301 def _extract_metadata_info(self, metadata: dict) -> dict:
1302 """Extract comprehensive metadata information from document metadata.
1304 Args:
1305 metadata: Document metadata
1307 Returns:
1308 Dictionary with all available metadata information
1309 """
1310 # 🔥 ENHANCED: Extract ALL the rich metadata we store
1312 # Basic hierarchy information (existing)
1313 hierarchy_info = {
1314 "parent_id": metadata.get("parent_id"),
1315 "parent_title": metadata.get("parent_title"),
1316 "breadcrumb_text": metadata.get("breadcrumb_text"),
1317 "depth": metadata.get("depth"),
1318 "children_count": None,
1319 "hierarchy_context": None,
1320 }
1322 # Calculate children count
1323 children = metadata.get("children", [])
1324 if children:
1325 hierarchy_info["children_count"] = len(children)
1327 # Generate hierarchy context for display
1328 if metadata.get("breadcrumb_text") or metadata.get("depth") is not None:
1329 context_parts = []
1331 if metadata.get("breadcrumb_text"):
1332 context_parts.append(f"Path: {metadata.get('breadcrumb_text')}")
1334 if metadata.get("depth") is not None:
1335 context_parts.append(f"Depth: {metadata.get('depth')}")
1337 if (
1338 hierarchy_info["children_count"] is not None
1339 and hierarchy_info["children_count"] > 0
1340 ):
1341 context_parts.append(f"Children: {hierarchy_info['children_count']}")
1343 if context_parts:
1344 hierarchy_info["hierarchy_context"] = " | ".join(context_parts)
1346 # Basic attachment information (existing)
1347 attachment_info = {
1348 "is_attachment": metadata.get("is_attachment", False),
1349 "parent_document_id": metadata.get("parent_document_id"),
1350 "parent_document_title": metadata.get("parent_document_title"),
1351 "attachment_id": metadata.get("attachment_id"),
1352 "original_filename": metadata.get("original_filename"),
1353 "file_size": metadata.get("file_size"),
1354 "mime_type": metadata.get("mime_type"),
1355 "attachment_author": metadata.get("attachment_author") or metadata.get("author"),
1356 "attachment_context": None,
1357 }
1359 # Generate attachment context for display
1360 if attachment_info["is_attachment"]:
1361 context_parts = []
1363 if attachment_info["original_filename"]:
1364 context_parts.append(f"File: {attachment_info['original_filename']}")
1366 if attachment_info["file_size"]:
1367 # Convert bytes to human readable format
1368 size = attachment_info["file_size"]
1369 if size < 1024:
1370 size_str = f"{size} B"
1371 elif size < 1024 * 1024:
1372 size_str = f"{size / 1024:.1f} KB"
1373 elif size < 1024 * 1024 * 1024:
1374 size_str = f"{size / (1024 * 1024):.1f} MB"
1375 else:
1376 size_str = f"{size / (1024 * 1024 * 1024):.1f} GB"
1377 context_parts.append(f"Size: {size_str}")
1379 if attachment_info["mime_type"]:
1380 context_parts.append(f"Type: {attachment_info['mime_type']}")
1382 if attachment_info["attachment_author"]:
1383 context_parts.append(f"Author: {attachment_info['attachment_author']}")
1385 if context_parts:
1386 attachment_info["attachment_context"] = " | ".join(context_parts)
1388 # 🔥 NEW: Section-level intelligence
1389 section_info = {
1390 "section_title": metadata.get("section_title"),
1391 "section_type": metadata.get("section_type"),
1392 "section_level": metadata.get("section_level"),
1393 "section_anchor": metadata.get("section_anchor"),
1394 "section_breadcrumb": metadata.get("section_breadcrumb"),
1395 "section_depth": metadata.get("section_depth"),
1396 }
1398 # 🔥 NEW: Content analysis from content_type_analysis
1399 content_analysis = metadata.get("content_type_analysis", {})
1400 content_info = {
1401 "has_code_blocks": content_analysis.get("has_code_blocks", False),
1402 "has_tables": content_analysis.get("has_tables", False),
1403 "has_images": content_analysis.get("has_images", False),
1404 "has_links": content_analysis.get("has_links", False),
1405 "word_count": content_analysis.get("word_count"),
1406 "char_count": content_analysis.get("char_count"),
1407 "estimated_read_time": content_analysis.get("estimated_read_time"),
1408 "paragraph_count": content_analysis.get("paragraph_count"),
1409 }
1411 # Generate content type context
1412 content_types = []
1413 if content_info["has_code_blocks"]:
1414 content_types.append("Code")
1415 if content_info["has_tables"]:
1416 content_types.append("Tables")
1417 if content_info["has_images"]:
1418 content_types.append("Images")
1419 if content_info["has_links"]:
1420 content_types.append("Links")
1422 content_type_context = None
1423 if content_types:
1424 content_type_context = f"Contains: {', '.join(content_types)}"
1425 if content_info["word_count"]:
1426 content_type_context += f" | {content_info['word_count']} words"
1427 if content_info["estimated_read_time"]:
1428 content_type_context += f" | ~{content_info['estimated_read_time']}min read"
1430 # 🔥 NEW: Semantic analysis (NLP results)
1431 # Convert spaCy tuples to expected formats for Pydantic validation
1432 raw_entities = metadata.get("entities", [])
1433 raw_topics = metadata.get("topics", [])
1434 raw_key_phrases = metadata.get("key_phrases", [])
1435 raw_pos_tags = metadata.get("pos_tags", [])
1437 # Convert entities from tuples [(text, label)] to dicts [{"text": text, "label": label}]
1438 entities = []
1439 for entity in raw_entities:
1440 if isinstance(entity, (list, tuple)) and len(entity) >= 2:
1441 entities.append({"text": str(entity[0]), "label": str(entity[1])})
1442 elif isinstance(entity, str):
1443 entities.append(entity) # Keep strings as-is
1444 elif isinstance(entity, dict):
1445 entities.append(entity) # Keep dicts as-is
1447 # Convert topics from tuples to dicts
1448 topics = []
1449 for topic in raw_topics:
1450 if isinstance(topic, (list, tuple)) and len(topic) >= 2:
1451 topics.append({"text": str(topic[0]), "score": float(topic[1]) if isinstance(topic[1], (int, float)) else str(topic[1])})
1452 elif isinstance(topic, str):
1453 topics.append(topic) # Keep strings as-is
1454 elif isinstance(topic, dict):
1455 topics.append(topic) # Keep dicts as-is
1457 # Convert key_phrases from tuples to dicts
1458 key_phrases = []
1459 for phrase in raw_key_phrases:
1460 if isinstance(phrase, (list, tuple)) and len(phrase) >= 2:
1461 key_phrases.append({"text": str(phrase[0]), "score": float(phrase[1]) if isinstance(phrase[1], (int, float)) else str(phrase[1])})
1462 elif isinstance(phrase, str):
1463 key_phrases.append(phrase) # Keep strings as-is
1464 elif isinstance(phrase, dict):
1465 key_phrases.append(phrase) # Keep dicts as-is
1467 # Convert pos_tags from tuples [(token, tag)] to dicts [{"token": token, "tag": tag}]
1468 pos_tags = []
1469 for pos_tag in raw_pos_tags:
1470 if isinstance(pos_tag, (list, tuple)) and len(pos_tag) >= 2:
1471 pos_tags.append({"token": str(pos_tag[0]), "tag": str(pos_tag[1])})
1472 elif isinstance(pos_tag, dict):
1473 pos_tags.append(pos_tag) # Keep dicts as-is
1475 semantic_info = {
1476 "entities": entities,
1477 "topics": topics,
1478 "key_phrases": key_phrases,
1479 "pos_tags": pos_tags,
1480 "topic_analysis": metadata.get("topic_analysis"),
1481 }
1483 # 🔥 NEW: Navigation context
1484 navigation_info = {
1485 "previous_section": metadata.get("previous_section"),
1486 "next_section": metadata.get("next_section"),
1487 "sibling_sections": metadata.get("sibling_sections", []),
1488 "subsections": metadata.get("subsections", []),
1489 "document_hierarchy": metadata.get("document_hierarchy", []),
1490 }
1492 # 🔥 NEW: Chunking context
1493 chunking_info = {
1494 "chunk_index": metadata.get("chunk_index"),
1495 "total_chunks": metadata.get("total_chunks"),
1496 "chunking_strategy": metadata.get("chunking_strategy"),
1497 }
1499 # 🔥 NEW: File conversion intelligence
1500 conversion_info = {
1501 "original_file_type": metadata.get("original_file_type"),
1502 "conversion_method": metadata.get("conversion_method"),
1503 "is_excel_sheet": metadata.get("is_excel_sheet", False),
1504 "is_converted": metadata.get("is_converted", False),
1505 }
1507 # 🔥 NEW: Cross-references
1508 cross_reference_info = {
1509 "cross_references": metadata.get("cross_references", []),
1510 }
1512 # Combine all metadata
1513 return {
1514 **hierarchy_info,
1515 **attachment_info,
1516 **section_info,
1517 **content_info,
1518 **semantic_info,
1519 **navigation_info,
1520 **chunking_info,
1521 **conversion_info,
1522 **cross_reference_info,
1523 "content_type_context": content_type_context,
1524 }
1526 def _extract_project_info(self, metadata: dict) -> dict:
1527 """Extract project information from document metadata.
1529 Args:
1530 metadata: Document metadata
1532 Returns:
1533 Dictionary with project information
1534 """
1535 return {
1536 "project_id": metadata.get("project_id"),
1537 "project_name": metadata.get("project_name"),
1538 "project_description": metadata.get("project_description"),
1539 "collection_name": metadata.get("collection_name"),
1540 }
1542 def _build_filter(
1543 self, project_ids: list[str] | None = None
1544 ) -> models.Filter | None:
1545 """Build a Qdrant filter based on project IDs."""
1546 if not project_ids:
1547 return None
1549 return models.Filter(
1550 must=[
1551 models.FieldCondition(
1552 key="project_id", match=models.MatchAny(any=project_ids)
1553 )
1554 ]
1555 )
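# Illustrative example (not part of the original file): the project filter built
# by _build_filter() for two hypothetical project IDs:
#
#     engine._build_filter(["proj-a", "proj-b"])
#     # -> models.Filter(must=[models.FieldCondition(
#     #        key="project_id", match=models.MatchAny(any=["proj-a", "proj-b"]))])
#     engine._build_filter(None)   # -> None (no project filtering applied)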
1557 def _apply_diversity_filtering(
1558 self,
1559 results: list[HybridSearchResult],
1560 diversity_factor: float,
1561 limit: int
1562 ) -> list[HybridSearchResult]:
1563 """🔥 NEW: Apply diversity filtering to promote varied result types."""
1564 if diversity_factor <= 0.0 or len(results) <= limit:
1565 return results[:limit]
1567 diverse_results = []
1568 used_source_types = set()
1569 used_section_types = set()
1570 used_sources = set()
1572 # First pass: Take top results while ensuring diversity
1573 for result in results:
1574 if len(diverse_results) >= limit:
1575 break
1577 # Calculate diversity score
1578 diversity_score = 1.0
1580 # Penalize duplicate source types (less diversity)
1581 source_type = result.source_type
1582 if source_type in used_source_types:
1583 diversity_score *= (1.0 - diversity_factor * 0.3)
1585 # Penalize duplicate section types
1586 section_type = result.section_type or "unknown"
1587 if section_type in used_section_types:
1588 diversity_score *= (1.0 - diversity_factor * 0.2)
1590 # Penalize duplicate sources (same document/file)
1591 source_key = f"{result.source_type}:{result.source_title}"
1592 if source_key in used_sources:
1593 diversity_score *= (1.0 - diversity_factor * 0.4)
1595 # Apply diversity penalty to score
1596 adjusted_score = result.score * diversity_score
1598 # Use original score to determine if we should include this result
1599 if len(diverse_results) < limit * 0.7 or adjusted_score >= result.score * 0.6:
1600 diverse_results.append(result)
1601 used_source_types.add(source_type)
1602 used_section_types.add(section_type)
1603 used_sources.add(source_key)
1605 # Second pass: Fill remaining slots with best remaining results
1606 remaining_slots = limit - len(diverse_results)
1607 if remaining_slots > 0:
1608 remaining_results = [r for r in results if r not in diverse_results]
1609 diverse_results.extend(remaining_results[:remaining_slots])
1611 return diverse_results[:limit]
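# Worked example (illustrative): with diversity_factor=0.5, a result that
# repeats an already-seen source type is multiplied by 1 - 0.5 * 0.3 = 0.85,
# a repeated section type by 1 - 0.5 * 0.2 = 0.90, and a repeated document by
# 1 - 0.5 * 0.4 = 0.80; all three penalties together leave
# 0.85 * 0.90 * 0.80 = 0.612 of the original score, which still narrowly clears
# the `adjusted_score >= result.score * 0.6` check once 70% of the result slots
# are filled.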
1613 def get_adaptive_search_stats(self) -> dict[str, Any]:
1614 """🔥 NEW: Get adaptive search statistics for monitoring."""
1615 stats = {
1616 "intent_adaptation_enabled": self.enable_intent_adaptation,
1617 "has_knowledge_graph": self.knowledge_graph is not None,
1618 }
1620 if self.enable_intent_adaptation and self.intent_classifier:
1621 stats.update(self.intent_classifier.get_cache_stats())
1623 if self.adaptive_strategy:
1624 stats.update(self.adaptive_strategy.get_strategy_stats())
1626 return stats
1628 # ============================================================================
1629 # 🔥 Phase 1.3: Dynamic Faceted Search Interface Methods
1630 # ============================================================================
1632 async def search_with_facets(
1633 self,
1634 query: str,
1635 limit: int = 5,
1636 source_types: list[str] | None = None,
1637 project_ids: list[str] | None = None,
1638 facet_filters: list[FacetFilter] | None = None,
1639 generate_facets: bool = True,
1640 session_context: dict[str, Any] | None = None,
1641 behavioral_context: list[str] | None = None,
1642 ) -> FacetedSearchResults:
1643 """
1644 🔥 Phase 1.3: Perform faceted search with dynamic facet generation.
1646 Args:
1647 query: Search query
1648 limit: Maximum number of results
1649 source_types: Optional source type filters
1650 project_ids: Optional project ID filters
1651 facet_filters: Optional facet filters to apply
1652 generate_facets: Whether to generate facets from results
1653 session_context: Optional session context for intent classification
1654 behavioral_context: Optional behavioral context
1656 Returns:
1657 FacetedSearchResults with results and generated facets
1658 """
1659 start_time = datetime.now()
1661 try:
1662 # First, perform regular search (potentially with larger limit for faceting)
1663 search_limit = max(limit * 2, 50) if generate_facets else limit
1665 search_results = await self.search(
1666 query=query,
1667 limit=search_limit,
1668 source_types=source_types,
1669 project_ids=project_ids,
1670 session_context=session_context,
1671 behavioral_context=behavioral_context
1672 )
1674 # Generate faceted results
1675 faceted_results = self.faceted_search_engine.generate_faceted_results(
1676 results=search_results,
1677 applied_filters=facet_filters or []
1678 )
1680 # Limit final results
1681 faceted_results.results = faceted_results.results[:limit]
1682 faceted_results.filtered_count = len(faceted_results.results)
1684 search_time = (datetime.now() - start_time).total_seconds() * 1000
1686 self.logger.info(
1687 "Faceted search completed",
1688 query=query,
1689 total_results=faceted_results.total_results,
1690 filtered_results=faceted_results.filtered_count,
1691 facet_count=len(faceted_results.facets),
1692 active_filters=len(faceted_results.applied_filters),
1693 search_time_ms=round(search_time, 2)
1694 )
1696 return faceted_results
1698 except Exception as e:
1699 self.logger.error("Error in faceted search", query=query, error=str(e))
1700 raise
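# A minimal usage sketch, assuming an engine instance named `engine`; the
# FacetFilter keyword arguments shown are assumptions, not the verified
# constructor signature:
#
#   results = await engine.search_with_facets(
#       query="authentication flow",
#       limit=5,
#       facet_filters=[FacetFilter(facet_type=FacetType.SOURCE_TYPE, values=["confluence"])],
#   )
#   for facet in results.facets:
#       print(facet)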
1702 def apply_facet_filters(
1703 self,
1704 results: list[SearchResult],
1705 filters: list[FacetFilter]
1706 ) -> list[SearchResult]:
1707 """
1708 🔥 Phase 1.3: Apply facet filters to search results.
1710 Args:
1711 results: Search results to filter
1712 filters: Facet filters to apply
1714 Returns:
1715 Filtered search results
1716 """
1717 return self.faceted_search_engine.apply_facet_filters(results, filters)
1719 def generate_facets(
1720 self,
1721 results: list[SearchResult]
1722 ) -> list:
1723 """
1724 🔥 Phase 1.3: Generate dynamic facets from search results.
1726 Args:
1727 results: Search results to analyze
1729 Returns:
1730 List of generated facets
1731 """
1732 return self.faceted_search_engine.facet_generator.generate_facets(results)
1734 def suggest_facet_refinements(
1735 self,
1736 current_results: list[SearchResult],
1737 current_filters: list[FacetFilter]
1738 ) -> list[dict[str, Any]]:
1739 """
1740 🔥 Phase 1.3: Suggest facet refinements based on current results.
1742 Args:
1743 current_results: Current search results
1744 current_filters: Currently applied filters
1746 Returns:
1747 List of suggested refinements with impact estimates
1748 """
1749 return self.faceted_search_engine.suggest_refinements(
1750 current_results,
1751 current_filters
1752 )
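# Hypothetical flow combining the three facet helpers above: generate facets
# from an initial result set, narrow it with filters, then ask for refinement
# suggestions (the `results` and `filters` variables are assumed to exist):
#
#   facets = engine.generate_facets(results)
#   filtered = engine.apply_facet_filters(results, filters)
#   suggestions = engine.suggest_facet_refinements(filtered, filters)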
1754 # 🔥 Phase 2.3: Cross-Document Intelligence Methods
1756 async def analyze_document_relationships(
1757 self,
1758 documents: list[SearchResult]
1759 ) -> dict[str, Any]:
1760 """
1761 🔥 Phase 2.3: Perform comprehensive cross-document relationship analysis.
1763 Args:
1764 documents: Documents to analyze for relationships
1766 Returns:
1767 Comprehensive analysis including clusters, similarities, and conflicts
1768 """
1769 try:
1770 return self.cross_document_engine.analyze_document_relationships(documents)
1771 except Exception as e:
1772 self.logger.error("Error in cross-document analysis", error=str(e))
1773 raise
1775 async def find_similar_documents(
1776 self,
1777 target_document: SearchResult,
1778 documents: list[SearchResult],
1779 similarity_metrics: list[SimilarityMetric] | None = None,
1780 max_similar: int = 5
1781 ) -> list[dict[str, Any]]:
1782 """
1783 🔥 Phase 2.3: Find documents similar to a target document.
1785 Args:
1786 target_document: Document to find similar documents for
1787 documents: Pool of documents to search within
1788 similarity_metrics: Metrics to use for similarity calculation
1789 max_similar: Maximum number of similar documents to return
1791 Returns:
1792 List of similar documents with similarity scores
1793 """
1794 try:
1795 similarity_calculator = self.cross_document_engine.similarity_calculator
1796 similar_docs = []
1798 for doc in documents:
1799 if doc == target_document:
1800 continue
1802 similarity = similarity_calculator.calculate_similarity(
1803 target_document,
1804 doc,
1805 similarity_metrics
1806 )
1808 similar_docs.append({
1809 "document": doc,
1810 "similarity_score": similarity.similarity_score,
1811 "metric_scores": similarity.metric_scores,
1812 "similarity_reasons": [similarity.get_display_explanation()]
1813 })
1815 # Sort by similarity score and return top results
1816 similar_docs.sort(key=lambda x: x["similarity_score"], reverse=True)
1817 return similar_docs[:max_similar]
1819 except Exception as e:
1820 self.logger.error("Error finding similar documents", error=str(e))
1821 raise
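# Usage sketch; `engine` and `docs` are hypothetical. Entries come back sorted
# by "similarity_score", at most `max_similar` of them:
#
#   similar = await engine.find_similar_documents(
#       target_document=docs[0],
#       documents=docs[1:],
#       max_similar=3,
#   )
#   best_match = similar[0]["document"] if similar else None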
1823 async def detect_document_conflicts(
1824 self,
1825 documents: list[SearchResult]
1826 ) -> dict[str, Any]:
1827 """
1828 🔥 Phase 2.3: Detect conflicts between documents.
1830 Args:
1831 documents: Documents to analyze for conflicts
1833 Returns:
1834 Conflict analysis with detected conflicts and resolution suggestions
1835 """
1836 try:
1837 conflict_analysis = self.cross_document_engine.conflict_detector.detect_conflicts(documents)
1838 # Convert ConflictAnalysis object to dictionary format
1839 return {
1840 "conflicting_pairs": conflict_analysis.conflicting_pairs,
1841 "conflict_categories": conflict_analysis.conflict_categories,
1842 "resolution_suggestions": conflict_analysis.resolution_suggestions
1843 }
1844 except Exception as e:
1845 self.logger.error("Error detecting conflicts", error=str(e))
1846 raise
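# Usage sketch; the dictionary keys mirror the return statement above:
#
#   analysis = await engine.detect_document_conflicts(docs)
#   print(len(analysis["conflicting_pairs"]), "conflicting pairs detected")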
1848 async def find_complementary_content(
1849 self,
1850 target_document: SearchResult,
1851 documents: list[SearchResult],
1852 max_recommendations: int = 5
1853 ) -> list[dict[str, Any]]:
1854 """
1855 🔥 Phase 2.3: Find content that complements the target document.
1857 Args:
1858 target_document: Document to find complementary content for
1859 documents: Pool of documents to search within
1860 max_recommendations: Maximum number of recommendations
1862 Returns:
1863 List of complementary documents with recommendation reasons
1864 """
1865 try:
1866 complementary_content = self.cross_document_engine.complementary_finder.find_complementary_content(
1867 target_document,
1868 documents
1869 )
1870 # Get top recommendations and enhance with document objects
1871 recommendations = complementary_content.get_top_recommendations(max_recommendations)
1873 # Create lookup dictionary for documents by ID
1874 doc_lookup = {f"{doc.source_type}:{doc.source_title}": doc for doc in documents}
1876 # Enhance recommendations with full document objects
1877 enhanced_recommendations = []
1878 for rec in recommendations:
1879 doc_id = rec["document_id"]
1880 if doc_id in doc_lookup:
1881 enhanced_rec = {
1882 "document": doc_lookup[doc_id], # Include full document object
1883 "relevance_score": rec["relevance_score"],
1884 "recommendation_reason": rec["recommendation_reason"],
1885 "strategy": rec["strategy"]
1886 }
1887 enhanced_recommendations.append(enhanced_rec)
1889 return enhanced_recommendations
1890 except Exception as e:
1891 self.logger.error("Error finding complementary content", error=str(e))
1892 raise
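# Usage sketch; each entry carries the full document object alongside the
# relevance score, reason, and strategy copied from the recommendation:
#
#   recs = await engine.find_complementary_content(docs[0], docs, max_recommendations=3)
#   top_doc = recs[0]["document"] if recs else None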
1894 async def cluster_documents(
1895 self,
1896 documents: list[SearchResult],
1897 strategy: ClusteringStrategy = ClusteringStrategy.MIXED_FEATURES,
1898 max_clusters: int = 10,
1899 min_cluster_size: int = 2
1900 ) -> dict[str, Any]:
1901 """
1902 🔥 Phase 2.3: Cluster documents based on similarity and relationships.
1904 Args:
1905 documents: Documents to cluster
1906 strategy: Clustering strategy to use
1907 max_clusters: Maximum number of clusters to create
1908 min_cluster_size: Minimum size for a cluster
1910 Returns:
1911 Document clusters with metadata and relationships
1912 """
1913 try:
1914 clusters = self.cross_document_engine.cluster_analyzer.create_clusters(
1915 documents,
1916 strategy,
1917 max_clusters,
1918 min_cluster_size
1919 )
1921 # Convert to serializable format
1922 cluster_data = []
1923 for cluster in clusters:
1924 cluster_data.append({
1925 "id": cluster.cluster_id,
1926 "documents": cluster.documents,
1927 "centroid_topics": cluster.shared_topics,
1928 "shared_entities": cluster.shared_entities,
1929 "coherence_score": cluster.coherence_score,
1930 "cluster_summary": cluster.cluster_description
1931 })
1933 return {
1934 "clusters": cluster_data,
1935 "clustering_metadata": {
1936 "strategy": strategy.value,
1937 "total_clusters": len(clusters),
1938 "total_documents": len(documents)
1939 }
1940 }
1942 except Exception as e:
1943 self.logger.error("Error clustering documents", error=str(e))
1944 raise
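# Usage sketch relying only on names defined in this module; `engine` and
# `docs` are hypothetical:
#
#   grouped = await engine.cluster_documents(
#       docs,
#       strategy=ClusteringStrategy.MIXED_FEATURES,
#       max_clusters=5,
#       min_cluster_size=2,
#   )
#   print(grouped["clustering_metadata"]["total_clusters"])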