Coverage for src/qdrant_loader_mcp_server/search/enhanced/intent_classifier.py: 94%
330 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:38 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:38 +0000
1"""Intent-Aware Adaptive Search for Phase 2.2 Search Enhancement.
3This module implements advanced intent classification and adaptive search strategies
4that leverage Phase 1.0 spaCy analysis and Phase 2.1 knowledge graph capabilities.
5"""
7import logging
8import time
9from dataclasses import dataclass, field
10from enum import Enum
11from typing import Any, Dict, List, Optional, Set, Tuple, Union
12from collections import defaultdict, Counter
13import math
15from ...utils.logging import LoggingConfig
16from ..nlp.spacy_analyzer import SpaCyQueryAnalyzer, QueryAnalysis
17from ..models import SearchResult
18from .knowledge_graph import DocumentKnowledgeGraph, TraversalStrategy
20logger = LoggingConfig.get_logger(__name__)
class IntentType(Enum):
    """Types of search intents for adaptive search strategies.

    Each member has a matching pattern set in ``IntentClassifier.intent_patterns``
    (except GENERAL, the fallback) and a preset ``AdaptiveSearchConfig`` in
    ``AdaptiveSearchStrategy.intent_configs``.
    """
    TECHNICAL_LOOKUP = "technical_lookup"  # API docs, code examples, implementation
    BUSINESS_CONTEXT = "business_context"  # Requirements, objectives, strategy
    VENDOR_EVALUATION = "vendor_evaluation"  # Proposals, comparisons, criteria
    PROCEDURAL = "procedural"  # How-to guides, step-by-step
    INFORMATIONAL = "informational"  # What is, definitions, overviews
    EXPLORATORY = "exploratory"  # Broad discovery, browsing
    TROUBLESHOOTING = "troubleshooting"  # Error solving, debugging
    GENERAL = "general"  # Fallback for unclear intent
@dataclass
class SearchIntent:
    """Container for classified search intent with confidence and context.

    Produced by ``IntentClassifier.classify_intent`` and consumed by
    ``AdaptiveSearchStrategy.adapt_search`` to tune search behavior.
    """

    intent_type: IntentType
    confidence: float  # 0.0 - 1.0 confidence score (normalized share of total intent score)
    secondary_intents: List[Tuple[IntentType, float]] = field(default_factory=list)

    # Linguistic evidence gathered during classification (scores, features)
    supporting_evidence: Dict[str, Any] = field(default_factory=dict)
    linguistic_features: Dict[str, Any] = field(default_factory=dict)

    # Context information derived from spaCy analysis
    query_complexity: float = 0.0  # From spaCy analysis
    is_question: bool = False
    is_technical: bool = False

    # Behavioral context passed through from the caller
    session_context: Dict[str, Any] = field(default_factory=dict)
    previous_intents: List[IntentType] = field(default_factory=list)

    # Processing metadata (wall-clock time spent classifying)
    classification_time_ms: float = 0.0
@dataclass
class AdaptiveSearchConfig:
    """Configuration for adaptive search based on intent.

    Defaults represent a neutral hybrid search; ``AdaptiveSearchStrategy``
    holds one preset instance per ``IntentType`` and further adjusts it
    per query (confidence, secondary intents, session context).
    """

    # Core search parameters
    search_strategy: str = "hybrid"  # hybrid, vector, keyword
    vector_weight: float = 0.7  # Weight for vector search
    keyword_weight: float = 0.3  # Weight for keyword search

    # Knowledge graph integration
    use_knowledge_graph: bool = False
    kg_traversal_strategy: TraversalStrategy = TraversalStrategy.SEMANTIC
    max_graph_hops: int = 2
    kg_expansion_weight: float = 0.2

    # Result filtering and ranking
    result_filters: Dict[str, Any] = field(default_factory=dict)
    ranking_boosts: Dict[str, float] = field(default_factory=dict)
    source_type_preferences: Dict[str, float] = field(default_factory=dict)

    # Query expansion
    expand_query: bool = True
    expansion_aggressiveness: float = 0.3  # 0.0 - 1.0
    semantic_expansion: bool = True
    entity_expansion: bool = True

    # Performance tuning
    max_results: int = 20
    min_score_threshold: float = 0.1
    diversity_factor: float = 0.0  # 0.0 = relevance only, 1.0 = max diversity

    # Contextual parameters
    temporal_bias: float = 0.0  # Bias toward recent content
    authority_bias: float = 0.0  # Bias toward authoritative sources
    personal_bias: float = 0.0  # Bias toward user's previous interests
class IntentClassifier:
    """Advanced intent classification using spaCy analysis and behavioral patterns.

    A query is scored against per-intent pattern sets (keyword vocabulary,
    POS-tag sequences, entity labels, question words, soft linguistic
    indicators), then the scores are adjusted with behavioral history and
    session context before the primary and secondary intents are selected.
    Results are cached per (query, session, behavior) key.
    """

    def __init__(self, spacy_analyzer: SpaCyQueryAnalyzer):
        """Initialize the intent classifier.

        Args:
            spacy_analyzer: Shared analyzer whose ``analyze_query_semantic``
                output drives all pattern scoring.
        """
        self.spacy_analyzer = spacy_analyzer
        self.logger = LoggingConfig.get_logger(__name__)

        # Intent classification patterns using spaCy linguistic features.
        # Per intent: keywords (matched against semantic keywords, 40% of the
        # score), pos_patterns (25%), entity_types (20%), question_words (10%),
        # linguistic_indicators (5%), and a final multiplicative weight.
        self.intent_patterns = {
            IntentType.TECHNICAL_LOOKUP: {
                "keywords": {
                    "api", "apis", "endpoint", "endpoints", "function", "functions",
                    "method", "methods", "class", "classes", "library", "libraries",
                    "framework", "frameworks", "code", "implementation", "syntax",
                    "documentation", "docs", "reference", "specification", "protocol"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "API documentation"
                    ["ADJ", "NOUN"],  # "REST API"
                    ["VERB", "NOUN"],  # "implement authentication"
                    ["NOUN", "VERB"],  # "code example"
                ],
                "entity_types": {"PRODUCT", "ORG", "LANGUAGE"},
                "question_words": {"how", "what"},
                "linguistic_indicators": {
                    "has_code_terms": True,
                    "technical_complexity": 0.6,
                    "verb_imperative": True
                },
                "weight": 1.0
            },

            IntentType.BUSINESS_CONTEXT: {
                "keywords": {
                    "requirements", "requirement", "objectives", "objective", "goals", "goal",
                    "strategy", "strategies", "business", "scope", "stakeholder", "stakeholders",
                    "budget", "timeline", "deliverable", "deliverables", "milestone",
                    "criteria", "specification", "specifications", "priority", "priorities"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "business requirements"
                    ["ADJ", "NOUN"],  # "functional requirements"
                    ["MODAL", "VERB"],  # "should implement"
                    ["DET", "NOUN", "VERB"],  # "the system should"
                ],
                "entity_types": {"ORG", "MONEY", "PERCENT", "CARDINAL"},
                "question_words": {"what", "why", "which"},
                "linguistic_indicators": {
                    "has_business_terms": True,
                    "formal_language": True,
                    "future_tense": True
                },
                "weight": 1.0
            },

            IntentType.VENDOR_EVALUATION: {
                "keywords": {
                    "vendor", "vendors", "supplier", "suppliers", "proposal", "proposals",
                    "bid", "bids", "quote", "quotes", "cost", "costs", "price", "pricing",
                    "comparison", "compare", "evaluate", "evaluation", "criteria",
                    "selection", "recommendation", "assessment", "analysis"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "vendor proposal"
                    ["VERB", "NOUN"],  # "compare vendors"
                    ["ADJ", "NOUN"],  # "best vendor"
                    ["NOUN", "VERB", "ADJ"],  # "vendor is better"
                ],
                "entity_types": {"ORG", "MONEY", "PERSON"},
                "question_words": {"which", "who", "what", "how much"},
                "linguistic_indicators": {
                    "has_comparison": True,
                    "has_evaluation_terms": True,
                    "superlative_forms": True
                },
                "weight": 1.0
            },

            IntentType.PROCEDURAL: {
                "keywords": {
                    "how", "steps", "step", "process", "procedure", "guide", "tutorial",
                    "walkthrough", "instructions", "setup", "configure", "install",
                    "deploy", "implement", "create", "build", "make", "do"
                },
                "pos_patterns": [
                    ["VERB", "NOUN"],  # "install package"
                    ["VERB", "DET", "NOUN"],  # "setup the system"
                    ["ADV", "VERB"],  # "how configure"
                    ["NOUN", "VERB"],  # "steps install"
                ],
                "entity_types": set(),
                "question_words": {"how", "when", "where"},
                "linguistic_indicators": {
                    "imperative_mood": True,
                    "action_oriented": True,
                    "sequential_indicators": True
                },
                "weight": 1.0
            },

            IntentType.INFORMATIONAL: {
                "keywords": {
                    "what", "definition", "meaning", "explain", "overview", "about",
                    "introduction", "basics", "fundamentals", "concept", "concepts",
                    "understand", "learn", "know", "information", "details"
                },
                "pos_patterns": [
                    ["NOUN"],  # "authentication"
                    ["ADJ", "NOUN"],  # "basic concept"
                    ["VERB", "NOUN"],  # "understand API"
                    ["NOUN", "VERB"],  # "concept explains"
                ],
                "entity_types": set(),
                "question_words": {"what", "who", "when", "where"},
                "linguistic_indicators": {
                    "knowledge_seeking": True,
                    "present_tense": True,
                    "general_terms": True
                },
                "weight": 1.0
            },

            IntentType.TROUBLESHOOTING: {
                "keywords": {
                    "error", "errors", "problem", "problems", "issue", "issues",
                    "bug", "bugs", "fix", "fixes", "solve", "solution", "solutions",
                    "troubleshoot", "debug", "debugging", "failed", "failing",
                    "broken", "not working", "doesn't work"
                },
                "pos_patterns": [
                    ["NOUN", "VERB"],  # "error occurs"
                    ["VERB", "NOUN"],  # "fix error"
                    ["ADJ", "NOUN"],  # "broken system"
                    ["NOUN", "ADJ"],  # "system broken"
                ],
                "entity_types": set(),
                "question_words": {"why", "how", "what"},
                "linguistic_indicators": {
                    "negative_sentiment": True,
                    "problem_indicators": True,
                    "past_tense": True
                },
                "weight": 1.0
            },

            IntentType.EXPLORATORY: {
                "keywords": {
                    "explore", "discover", "find", "search", "browse", "look",
                    "see", "show", "list", "available", "options", "alternatives",
                    "similar", "related", "examples", "samples"
                },
                "pos_patterns": [
                    ["VERB"],  # "explore"
                    ["VERB", "NOUN"],  # "find examples"
                    ["ADJ", "NOUN"],  # "similar tools"
                    ["DET", "NOUN"],  # "some options"
                ],
                "entity_types": set(),
                "question_words": {"what", "which"},
                "linguistic_indicators": {
                    "open_ended": True,
                    "discovery_oriented": True,
                    "broad_scope": True
                },
                # Slightly down-weighted: exploratory keywords are generic and
                # would otherwise shadow more specific intents.
                "weight": 0.8
            }
        }

        # Behavioral pattern recognition: named session archetypes matched
        # against recent intent history in _apply_behavioral_weighting.
        self.session_patterns = {
            "technical_session": [IntentType.TECHNICAL_LOOKUP, IntentType.PROCEDURAL],
            "business_session": [IntentType.BUSINESS_CONTEXT, IntentType.VENDOR_EVALUATION],
            "learning_session": [IntentType.INFORMATIONAL, IntentType.EXPLORATORY, IntentType.PROCEDURAL],
            "problem_solving": [IntentType.TROUBLESHOOTING, IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP]
        }

        # Cache for intent classification results, keyed by query + context.
        # NOTE(review): unbounded — callers with long-lived sessions should
        # invoke clear_cache() periodically.
        self._intent_cache: Dict[str, SearchIntent] = {}

        logger.info("Initialized intent classifier with spaCy integration")

    def classify_intent(
        self,
        query: str,
        session_context: Optional[Dict[str, Any]] = None,
        behavioral_context: Optional[List[str]] = None
    ) -> SearchIntent:
        """Classify search intent using comprehensive spaCy analysis.

        Args:
            query: Raw user query text.
            session_context: Optional session metadata (keys seen here:
                "domain", "user_role", "urgency").
            behavioral_context: Optional list of previous intent values
                (strings matching ``IntentType`` values).

        Returns:
            A ``SearchIntent``; on any internal failure a GENERAL intent with
            0.5 confidence is returned instead of raising.
        """

        start_time = time.time()

        # Check cache first (key includes contexts, since they change scoring)
        cache_key = f"{query}:{str(session_context)}:{str(behavioral_context)}"
        if cache_key in self._intent_cache:
            cached = self._intent_cache[cache_key]
            logger.debug(f"Using cached intent classification for: {query[:50]}...")
            return cached

        try:
            # Step 1: Perform spaCy semantic analysis (leveraging Phase 1.0)
            spacy_analysis = self.spacy_analyzer.analyze_query_semantic(query)

            # Step 2: Extract linguistic features for intent classification
            linguistic_features = self._extract_linguistic_features(spacy_analysis, query)

            # Step 3: Score each intent type using pattern matching
            intent_scores = self._score_intent_patterns(
                spacy_analysis, linguistic_features, query
            )

            # Step 4: Apply behavioral context weighting
            if behavioral_context:
                intent_scores = self._apply_behavioral_weighting(
                    intent_scores, behavioral_context
                )

            # Step 5: Apply session context boosting
            if session_context:
                intent_scores = self._apply_session_context(
                    intent_scores, session_context
                )

            # Step 6: Determine primary and secondary intents
            primary_intent, confidence = self._select_primary_intent(intent_scores)
            secondary_intents = self._select_secondary_intents(intent_scores, primary_intent)

            # Step 7: Build supporting evidence
            supporting_evidence = self._build_evidence(
                spacy_analysis, linguistic_features, intent_scores
            )

            # Step 8: Create intent result
            classification_time = (time.time() - start_time) * 1000

            search_intent = SearchIntent(
                intent_type=primary_intent,
                confidence=confidence,
                secondary_intents=secondary_intents,
                supporting_evidence=supporting_evidence,
                linguistic_features=linguistic_features,
                query_complexity=spacy_analysis.complexity_score,
                is_question=spacy_analysis.is_question,
                is_technical=spacy_analysis.is_technical,
                session_context=session_context or {},
                # NOTE(review): stores the raw strings although the field is
                # typed List[IntentType] — confirm downstream consumers expect
                # strings before tightening this.
                previous_intents=behavioral_context or [],
                classification_time_ms=classification_time
            )

            # Cache the result
            self._intent_cache[cache_key] = search_intent

            logger.debug(
                f"Classified intent in {classification_time:.2f}ms",
                query_length=len(query),
                primary_intent=primary_intent.value,
                confidence=confidence,
                secondary_count=len(secondary_intents)
            )

            return search_intent

        except Exception as e:
            logger.error(f"Intent classification failed: {e}")
            # Return fallback intent rather than propagating — search must
            # still proceed with the GENERAL configuration.
            classification_time = (time.time() - start_time) * 1000
            return SearchIntent(
                intent_type=IntentType.GENERAL,
                confidence=0.5,
                classification_time_ms=classification_time
            )

    def _extract_linguistic_features(
        self,
        spacy_analysis: QueryAnalysis,
        query: str
    ) -> Dict[str, Any]:
        """Extract comprehensive linguistic features for intent classification.

        Returns a flat dict of booleans/counters consumed by
        ``_score_intent_patterns`` and exported on the final ``SearchIntent``.
        """

        features = {
            # Basic query characteristics
            "query_length": len(query.split()),
            "has_question_mark": "?" in query,
            "starts_with_question_word": False,
            "has_imperative_verbs": False,
            "has_modal_verbs": False,

            # spaCy-derived features
            "entity_count": len(spacy_analysis.entities),
            "concept_count": len(spacy_analysis.main_concepts),
            "keyword_count": len(spacy_analysis.semantic_keywords),
            "pos_diversity": len(set(spacy_analysis.pos_patterns)),

            # Semantic features (filled in below)
            "technical_indicators": 0,
            "business_indicators": 0,
            "procedural_indicators": 0,
            "problem_indicators": 0,

            # Entity type analysis (entities are (text, label) tuples)
            "entity_types": [ent[1] for ent in spacy_analysis.entities],
            "has_org_entities": any(ent[1] == "ORG" for ent in spacy_analysis.entities),
            "has_product_entities": any(ent[1] == "PRODUCT" for ent in spacy_analysis.entities),
            "has_person_entities": any(ent[1] == "PERSON" for ent in spacy_analysis.entities),
            "has_money_entities": any(ent[1] == "MONEY" for ent in spacy_analysis.entities),
        }

        # Analyze question word patterns
        question_words = {"what", "how", "why", "when", "who", "where", "which", "whose"}
        query_lower = query.lower()
        first_word = query_lower.split()[0] if query_lower.split() else ""
        features["starts_with_question_word"] = first_word in question_words

        # Count technical, business, and procedural indicators
        technical_terms = {"api", "code", "function", "method", "library", "framework", "implementation"}
        business_terms = {"requirements", "objectives", "strategy", "business", "scope", "criteria"}
        procedural_terms = {"how", "steps", "process", "guide", "setup", "install", "configure"}
        problem_terms = {"error", "problem", "issue", "bug", "fix", "solve", "broken", "failed"}

        keywords_lower = [kw.lower() for kw in spacy_analysis.semantic_keywords]
        features["technical_indicators"] = sum(1 for term in technical_terms if term in keywords_lower)
        features["business_indicators"] = sum(1 for term in business_terms if term in keywords_lower)
        features["procedural_indicators"] = sum(1 for term in procedural_terms if term in keywords_lower)
        features["problem_indicators"] = sum(1 for term in problem_terms if term in keywords_lower)

        # POS pattern analysis
        pos_patterns = spacy_analysis.pos_patterns
        # Fix: an imperative query contains a verb and does NOT start with a
        # question word. The previous condition required a question-word start,
        # which describes interrogatives, not imperatives.
        features["has_imperative_verbs"] = "VERB" in pos_patterns and not features["starts_with_question_word"]
        features["has_modal_verbs"] = any(pos in ["MD", "MODAL"] for pos in pos_patterns)

        return features

    def _score_intent_patterns(
        self,
        spacy_analysis: QueryAnalysis,
        linguistic_features: Dict[str, Any],
        query: str
    ) -> Dict[IntentType, float]:
        """Score each intent type using pattern matching.

        Component weights: keywords 40%, POS patterns 25%, entity types 20%,
        question words 10%, linguistic indicators 5%; the sum is scaled by the
        pattern's own "weight".
        """

        intent_scores = {}
        query_words = set(query.lower().split())
        keywords_set = set(kw.lower() for kw in spacy_analysis.semantic_keywords)

        for intent_type, pattern in self.intent_patterns.items():
            score = 0.0

            # 1. Keyword matching (40% weight)
            keyword_matches = len(keywords_set.intersection(pattern["keywords"]))
            keyword_score = keyword_matches / max(len(pattern["keywords"]), 1)
            score += keyword_score * 0.4

            # 2. POS pattern matching (25% weight)
            pos_score = self._match_pos_patterns(
                spacy_analysis.pos_patterns, pattern["pos_patterns"]
            )
            score += pos_score * 0.25

            # 3. Entity type matching (20% weight)
            entity_score = self._match_entity_types(
                spacy_analysis.entities, pattern["entity_types"]
            )
            score += entity_score * 0.20

            # 4. Question word matching (10% weight)
            question_score = self._match_question_words(query, pattern["question_words"])
            score += question_score * 0.10

            # 5. Linguistic indicator bonus (5% weight)
            indicator_score = self._match_linguistic_indicators(
                linguistic_features, pattern.get("linguistic_indicators", {})
            )
            score += indicator_score * 0.05

            # Apply pattern weight
            score *= pattern.get("weight", 1.0)

            intent_scores[intent_type] = score

        return intent_scores

    def _match_pos_patterns(
        self,
        query_pos: List[str],
        target_patterns: List[List[str]]
    ) -> float:
        """Return the fraction of target POS sequences found in the query."""
        if not target_patterns or not query_pos:
            return 0.0

        matches = 0
        total_patterns = len(target_patterns)

        for pattern in target_patterns:
            if self._contains_pos_sequence(query_pos, pattern):
                matches += 1

        return matches / total_patterns

    def _contains_pos_sequence(self, pos_tags: List[str], sequence: List[str]) -> bool:
        """Check if the contiguous POS ``sequence`` exists in ``pos_tags``."""
        if len(sequence) > len(pos_tags):
            return False

        for i in range(len(pos_tags) - len(sequence) + 1):
            if pos_tags[i:i+len(sequence)] == sequence:
                return True

        return False

    def _match_entity_types(
        self,
        query_entities: List[Tuple[str, str]],
        target_types: Set[str]
    ) -> float:
        """Return the fraction of target entity labels present in the query."""
        if not target_types:
            return 0.0

        query_entity_types = set(ent[1] for ent in query_entities)
        matches = len(query_entity_types.intersection(target_types))

        return matches / len(target_types)

    def _match_question_words(self, query: str, target_words: Set[str]) -> float:
        """Return the fraction of target question words present in the query."""
        if not target_words:
            return 0.0

        query_words = set(query.lower().split())
        matches = len(query_words.intersection(target_words))

        return matches / len(target_words)

    def _match_linguistic_indicators(
        self,
        features: Dict[str, Any],
        target_indicators: Dict[str, Any]
    ) -> float:
        """Return the average match score for a pattern's soft indicators.

        Boolean indicators must match exactly; numeric ones score by
        similarity. Indicators absent from ``features`` contribute 0.

        NOTE(review): most pattern indicator keys (e.g. "verb_imperative",
        "imperative_mood", "has_code_terms") are never produced by
        ``_extract_linguistic_features``, so this component usually scores 0.
        Aligning the key names would change scoring — confirm intent before
        fixing.
        """
        if not target_indicators:
            return 0.0

        score = 0.0
        total_indicators = len(target_indicators)

        for indicator, expected_value in target_indicators.items():
            if indicator in features:
                if isinstance(expected_value, bool):
                    if features[indicator] == expected_value:
                        score += 1.0
                elif isinstance(expected_value, (int, float)):
                    # For numeric indicators, use similarity
                    actual_value = features.get(indicator, 0)
                    if isinstance(actual_value, (int, float)):
                        similarity = 1.0 - abs(actual_value - expected_value) / max(expected_value, 1.0)
                        score += max(0.0, similarity)

        return score / max(total_indicators, 1)

    def _apply_behavioral_weighting(
        self,
        intent_scores: Dict[IntentType, float],
        behavioral_context: List[str]
    ) -> Dict[IntentType, float]:
        """Apply behavioral context weighting to intent scores.

        Boosts intents that commonly follow the most recent intent (+20%),
        then boosts intents belonging to any session archetype that more than
        half-matches the recent history. Unrecognized intent strings are
        skipped.
        """

        if not behavioral_context:
            return intent_scores

        # Convert string intents to IntentType
        previous_intents = []
        for intent_str in behavioral_context[-5:]:  # Last 5 intents
            try:
                previous_intents.append(IntentType(intent_str))
            except ValueError:
                continue

        if not previous_intents:
            return intent_scores

        weighted_scores = intent_scores.copy()

        # Boost scores for intents that commonly follow previous intents
        intent_transitions = {
            IntentType.INFORMATIONAL: [IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP],
            IntentType.TECHNICAL_LOOKUP: [IntentType.PROCEDURAL, IntentType.TROUBLESHOOTING],
            IntentType.BUSINESS_CONTEXT: [IntentType.VENDOR_EVALUATION, IntentType.TECHNICAL_LOOKUP],
            IntentType.VENDOR_EVALUATION: [IntentType.BUSINESS_CONTEXT, IntentType.TECHNICAL_LOOKUP],
            IntentType.PROCEDURAL: [IntentType.TROUBLESHOOTING, IntentType.TECHNICAL_LOOKUP],
            IntentType.TROUBLESHOOTING: [IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP]
        }

        most_recent_intent = previous_intents[-1]
        likely_next_intents = intent_transitions.get(most_recent_intent, [])

        for intent_type in likely_next_intents:
            if intent_type in weighted_scores:
                weighted_scores[intent_type] *= 1.2  # 20% boost

        # Apply session pattern recognition
        for pattern_name, pattern_intents in self.session_patterns.items():
            pattern_match_score = sum(
                1 for intent in previous_intents if intent in pattern_intents
            ) / len(pattern_intents)

            if pattern_match_score > 0.5:  # More than half of pattern matched
                for intent_type in pattern_intents:
                    if intent_type in weighted_scores:
                        weighted_scores[intent_type] *= (1.0 + pattern_match_score * 0.3)

        return weighted_scores

    def _apply_session_context(
        self,
        intent_scores: Dict[IntentType, float],
        session_context: Dict[str, Any]
    ) -> Dict[IntentType, float]:
        """Apply session context (domain, user role, urgency) to intent scores."""

        weighted_scores = intent_scores.copy()

        # Apply domain context boosting
        domain = session_context.get("domain", "")
        if domain == "technical":
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.3
            weighted_scores[IntentType.PROCEDURAL] *= 1.2
        elif domain == "business":
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.3
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.2

        # Apply user role context
        user_role = session_context.get("user_role", "")
        if user_role in ["developer", "engineer", "architect"]:
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.2
            weighted_scores[IntentType.PROCEDURAL] *= 1.1
        elif user_role in ["manager", "analyst", "consultant"]:
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.2
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.1

        # Apply urgency context
        urgency = session_context.get("urgency", "normal")
        if urgency == "high":
            weighted_scores[IntentType.TROUBLESHOOTING] *= 1.4
            weighted_scores[IntentType.PROCEDURAL] *= 1.2

        return weighted_scores

    def _select_primary_intent(
        self,
        intent_scores: Dict[IntentType, float]
    ) -> Tuple[IntentType, float]:
        """Select the primary intent with highest confidence.

        Confidence is the winner's share of the total score mass; below a
        0.3 share the result falls back to GENERAL.
        """

        if not intent_scores:
            return IntentType.GENERAL, 0.5

        # Find the highest scoring intent
        primary_intent = max(intent_scores, key=intent_scores.get)
        raw_score = intent_scores[primary_intent]

        # Normalize confidence score
        total_score = sum(intent_scores.values())
        confidence = raw_score / max(total_score, 1.0)

        # Apply confidence threshold
        if confidence < 0.3:
            return IntentType.GENERAL, confidence

        return primary_intent, confidence

    def _select_secondary_intents(
        self,
        intent_scores: Dict[IntentType, float],
        primary_intent: IntentType
    ) -> List[Tuple[IntentType, float]]:
        """Select up to three secondary intents scoring >= 30% of the primary."""

        secondary_intents = []

        # Sort intents by score, excluding primary
        sorted_intents = sorted(
            [(intent, score) for intent, score in intent_scores.items() if intent != primary_intent],
            key=lambda x: x[1],
            reverse=True
        )

        # Include intents with score > 30% of primary intent score
        primary_score = intent_scores[primary_intent]
        threshold = primary_score * 0.3

        for intent, score in sorted_intents[:3]:  # Max 3 secondary intents
            if score >= threshold:
                confidence = score / max(sum(intent_scores.values()), 1.0)
                secondary_intents.append((intent, confidence))

        return secondary_intents

    def _build_evidence(
        self,
        spacy_analysis: QueryAnalysis,
        linguistic_features: Dict[str, Any],
        intent_scores: Dict[IntentType, float]
    ) -> Dict[str, Any]:
        """Build the supporting-evidence dict attached to the result."""

        return {
            "spacy_processing_time": spacy_analysis.processing_time_ms,
            "query_complexity": spacy_analysis.complexity_score,
            "semantic_keywords": spacy_analysis.semantic_keywords[:5],  # Top 5
            "extracted_entities": [ent[0] for ent in spacy_analysis.entities[:3]],  # Top 3
            "main_concepts": spacy_analysis.main_concepts[:3],  # Top 3
            "intent_signals": spacy_analysis.intent_signals,
            "linguistic_features": {
                "technical_indicators": linguistic_features.get("technical_indicators", 0),
                "business_indicators": linguistic_features.get("business_indicators", 0),
                "procedural_indicators": linguistic_features.get("procedural_indicators", 0),
                "problem_indicators": linguistic_features.get("problem_indicators", 0)
            },
            "top_intent_scores": dict(sorted(intent_scores.items(), key=lambda x: x[1], reverse=True)[:3])
        }

    def clear_cache(self):
        """Clear intent classification cache."""
        self._intent_cache.clear()
        logger.debug("Cleared intent classification cache")

    def get_cache_stats(self) -> Dict[str, int]:
        """Get cache statistics (current number of cached classifications)."""
        return {
            "intent_cache_size": len(self._intent_cache),
        }
731class AdaptiveSearchStrategy:
732 """Adaptive search strategy that configures search based on classified intent."""
    def __init__(self, knowledge_graph: Optional[DocumentKnowledgeGraph] = None):
        """Initialize the adaptive search strategy.

        Args:
            knowledge_graph: Optional knowledge graph used by intent configs
                that enable graph traversal.
        """
        self.knowledge_graph = knowledge_graph
        self.logger = LoggingConfig.get_logger(__name__)

        # Define intent-specific search configurations.
        # Each preset balances vector/keyword weights, knowledge-graph usage,
        # filtering/boosting, query expansion, and result-count/threshold
        # tuning for its intent; GENERAL is the fallback.
        self.intent_configs = {
            IntentType.TECHNICAL_LOOKUP: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.8,  # Higher vector weight for semantic similarity
                keyword_weight=0.2,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.SEMANTIC,
                max_graph_hops=2,
                kg_expansion_weight=0.3,
                result_filters={"content_type": ["code", "documentation", "technical"]},
                ranking_boosts={"source_type": {"git": 1.4, "confluence": 1.2}},
                source_type_preferences={"git": 1.5, "documentation": 1.3},
                expand_query=True,
                expansion_aggressiveness=0.4,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=25,
                min_score_threshold=0.15,
                authority_bias=0.3
            ),

            IntentType.BUSINESS_CONTEXT: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.6,  # Balanced approach
                keyword_weight=0.4,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.WEIGHTED,
                max_graph_hops=3,
                kg_expansion_weight=0.2,
                result_filters={"content_type": ["requirements", "business", "strategy"]},
                ranking_boosts={"section_type": {"requirements": 1.5, "objectives": 1.4}},
                source_type_preferences={"confluence": 1.4, "documentation": 1.2},
                expand_query=True,
                expansion_aggressiveness=0.3,
                semantic_expansion=True,
                entity_expansion=False,
                max_results=20,
                min_score_threshold=0.1,
                authority_bias=0.4
            ),

            IntentType.VENDOR_EVALUATION: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.5,  # Equal weight for structured comparison
                keyword_weight=0.5,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.CENTRALITY,
                max_graph_hops=2,
                kg_expansion_weight=0.25,
                result_filters={"content_type": ["proposal", "evaluation", "comparison"]},
                ranking_boosts={"has_money_entities": 1.3, "has_org_entities": 1.2},
                source_type_preferences={"confluence": 1.3, "documentation": 1.1},
                expand_query=True,
                expansion_aggressiveness=0.35,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=15,
                min_score_threshold=0.12,
                diversity_factor=0.3,  # Encourage diverse vendor options
                authority_bias=0.2
            ),

            IntentType.PROCEDURAL: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.7,  # Higher semantic matching for procedures
                keyword_weight=0.3,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.BREADTH_FIRST,
                max_graph_hops=2,
                kg_expansion_weight=0.2,
                result_filters={"content_type": ["guide", "tutorial", "procedure"]},
                ranking_boosts={"section_type": {"steps": 1.5, "procedure": 1.4, "guide": 1.3}},
                source_type_preferences={"documentation": 1.4, "git": 1.2},
                expand_query=True,
                expansion_aggressiveness=0.25,
                semantic_expansion=True,
                entity_expansion=False,
                max_results=15,
                min_score_threshold=0.15,
                temporal_bias=0.2  # Prefer recent procedures
            ),

            IntentType.INFORMATIONAL: AdaptiveSearchConfig(
                search_strategy="vector",  # Vector-first for conceptual understanding
                vector_weight=0.9,
                keyword_weight=0.1,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.SEMANTIC,
                max_graph_hops=3,
                kg_expansion_weight=0.4,  # More expansion for discovery
                result_filters={},
                ranking_boosts={"section_type": {"overview": 1.4, "introduction": 1.3}},
                source_type_preferences={"documentation": 1.3, "confluence": 1.1},
                expand_query=True,
                expansion_aggressiveness=0.5,  # Aggressive expansion for discovery
                semantic_expansion=True,
                entity_expansion=True,
                max_results=30,
                min_score_threshold=0.05,
                diversity_factor=0.4,  # Encourage diverse perspectives
                authority_bias=0.3
            ),

            IntentType.TROUBLESHOOTING: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.6,
                keyword_weight=0.4,  # Higher keyword weight for specific errors
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.WEIGHTED,
                max_graph_hops=2,
                kg_expansion_weight=0.15,
                result_filters={"content_type": ["troubleshooting", "fix", "solution"]},
                ranking_boosts={"has_problem_indicators": 1.4, "section_type": {"solution": 1.5}},
                source_type_preferences={"git": 1.3, "documentation": 1.2},
                expand_query=False,  # Don't expand error-specific queries
                expansion_aggressiveness=0.1,
                semantic_expansion=False,
                entity_expansion=False,
                max_results=10,
                min_score_threshold=0.2,
                temporal_bias=0.3  # Prefer recent solutions
            ),

            IntentType.EXPLORATORY: AdaptiveSearchConfig(
                search_strategy="vector",  # Vector-first for exploration
                vector_weight=0.85,
                keyword_weight=0.15,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.BREADTH_FIRST,
                max_graph_hops=4,  # Deeper exploration
                kg_expansion_weight=0.5,  # Maximum expansion
                result_filters={},
                ranking_boosts={},
                source_type_preferences={},
                expand_query=True,
                expansion_aggressiveness=0.6,  # Very aggressive expansion
                semantic_expansion=True,
                entity_expansion=True,
                max_results=40,  # More results for exploration
                min_score_threshold=0.03,  # Lower threshold
                diversity_factor=0.6,  # Maximum diversity
                authority_bias=0.1
            ),

            # Fallback configuration (no knowledge graph, neutral hybrid)
            IntentType.GENERAL: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.7,
                keyword_weight=0.3,
                use_knowledge_graph=False,
                expand_query=True,
                expansion_aggressiveness=0.3,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=20,
                min_score_threshold=0.1
            )
        }

        logger.info("Initialized adaptive search strategy with intent-specific configurations")
901 def adapt_search(
902 self,
903 search_intent: SearchIntent,
904 query: str,
905 base_results: Optional[List[SearchResult]] = None
906 ) -> AdaptiveSearchConfig:
907 """Adapt search configuration based on classified intent."""
909 try:
910 # Get base configuration for the primary intent
911 config = self._get_base_config(search_intent.intent_type)
913 # Apply confidence-based adjustments
914 config = self._apply_confidence_adjustments(config, search_intent)
916 # Apply secondary intent blending
917 if search_intent.secondary_intents:
918 config = self._blend_secondary_intents(config, search_intent.secondary_intents)
920 # Apply query-specific adaptations
921 config = self._apply_query_adaptations(config, search_intent, query)
923 # Apply session context adaptations
924 if search_intent.session_context:
925 config = self._apply_session_adaptations(config, search_intent.session_context)
927 logger.debug(
928 f"Adapted search configuration for {search_intent.intent_type.value}",
929 confidence=search_intent.confidence,
930 vector_weight=config.vector_weight,
931 use_kg=config.use_knowledge_graph,
932 max_results=config.max_results
933 )
935 return config
937 except Exception as e:
938 logger.error(f"Failed to adapt search configuration: {e}")
939 return self.intent_configs[IntentType.GENERAL]
941 def _get_base_config(self, intent_type: IntentType) -> AdaptiveSearchConfig:
942 """Get base configuration for intent type."""
943 return self.intent_configs.get(intent_type, self.intent_configs[IntentType.GENERAL])
945 def _apply_confidence_adjustments(
946 self,
947 config: AdaptiveSearchConfig,
948 search_intent: SearchIntent
949 ) -> AdaptiveSearchConfig:
950 """Apply confidence-based adjustments to the configuration."""
952 # Low confidence: reduce aggressiveness, increase diversity
953 if search_intent.confidence < 0.5:
954 config.expansion_aggressiveness *= 0.7
955 config.diversity_factor = min(1.0, config.diversity_factor + 0.2)
956 config.min_score_threshold *= 0.8
958 # High confidence: increase precision, reduce diversity
959 elif search_intent.confidence > 0.8:
960 config.expansion_aggressiveness *= 1.3
961 config.diversity_factor *= 0.7
962 config.min_score_threshold *= 1.2
964 return config
966 def _blend_secondary_intents(
967 self,
968 config: AdaptiveSearchConfig,
969 secondary_intents: List[Tuple[IntentType, float]]
970 ) -> AdaptiveSearchConfig:
971 """Blend secondary intent configurations with primary."""
973 for intent_type, confidence in secondary_intents:
974 if confidence > 0.3: # Only blend significant secondary intents
975 secondary_config = self.intent_configs.get(intent_type)
976 if secondary_config:
977 blend_factor = confidence * 0.3 # Max 30% blending
979 # Blend key parameters
980 config.vector_weight = (
981 config.vector_weight * (1 - blend_factor) +
982 secondary_config.vector_weight * blend_factor
983 )
984 config.expansion_aggressiveness = (
985 config.expansion_aggressiveness * (1 - blend_factor) +
986 secondary_config.expansion_aggressiveness * blend_factor
987 )
988 config.diversity_factor = max(
989 config.diversity_factor,
990 secondary_config.diversity_factor * blend_factor
991 )
993 return config
995 def _apply_query_adaptations(
996 self,
997 config: AdaptiveSearchConfig,
998 search_intent: SearchIntent,
999 query: str
1000 ) -> AdaptiveSearchConfig:
1001 """Apply query-specific adaptations."""
1003 # Short queries: increase expansion
1004 if len(query.split()) <= 3:
1005 config.expansion_aggressiveness *= 1.4
1006 config.semantic_expansion = True
1008 # Long queries: reduce expansion, increase precision
1009 elif len(query.split()) >= 8:
1010 config.expansion_aggressiveness *= 0.7
1011 config.min_score_threshold *= 1.2
1013 # Very complex queries: use knowledge graph more aggressively
1014 if search_intent.query_complexity > 0.7:
1015 config.use_knowledge_graph = True
1016 config.kg_expansion_weight *= 1.3
1017 config.max_graph_hops = min(4, config.max_graph_hops + 1)
1019 # Question queries: increase semantic weight
1020 if search_intent.is_question:
1021 config.vector_weight = min(0.9, config.vector_weight + 0.1)
1022 config.semantic_expansion = True
1024 # Technical queries: boost technical sources
1025 if search_intent.is_technical:
1026 config.source_type_preferences["git"] = config.source_type_preferences.get("git", 1.0) * 1.2
1027 config.authority_bias *= 1.2
1029 return config
1031 def _apply_session_adaptations(
1032 self,
1033 config: AdaptiveSearchConfig,
1034 session_context: Dict[str, Any]
1035 ) -> AdaptiveSearchConfig:
1036 """Apply session context adaptations."""
1038 # Time-sensitive sessions: increase temporal bias
1039 if session_context.get("urgency") == "high":
1040 config.temporal_bias = min(1.0, config.temporal_bias + 0.3)
1041 config.max_results = min(15, config.max_results)
1043 # Learning sessions: increase diversity and expansion
1044 session_type = session_context.get("session_type", "")
1045 if session_type == "learning":
1046 config.diversity_factor = min(1.0, config.diversity_factor + 0.2)
1047 config.expansion_aggressiveness *= 1.2
1048 config.max_results = min(30, config.max_results + 5)
1050 # Focused sessions: increase precision
1051 elif session_type == "focused":
1052 config.min_score_threshold *= 1.3
1053 config.expansion_aggressiveness *= 0.8
1054 config.max_results = max(10, config.max_results - 5)
1056 # User experience level
1057 experience_level = session_context.get("experience_level", "intermediate")
1058 if experience_level == "beginner":
1059 config.source_type_preferences["documentation"] = 1.4
1060 config.ranking_boosts["section_type"] = {"introduction": 1.5, "overview": 1.4}
1061 elif experience_level == "expert":
1062 config.source_type_preferences["git"] = 1.3
1063 config.ranking_boosts["section_type"] = {"implementation": 1.4, "advanced": 1.3}
1065 return config
1067 def get_strategy_stats(self) -> Dict[str, Any]:
1068 """Get adaptive search strategy statistics."""
1069 return {
1070 "intent_types_supported": len(self.intent_configs),
1071 "has_knowledge_graph": self.knowledge_graph is not None,
1072 "strategy_types": list(set(config.search_strategy for config in self.intent_configs.values())),
1073 "traversal_strategies": list(set(config.kg_traversal_strategy.value for config in self.intent_configs.values() if config.use_knowledge_graph))
1074 }