Coverage for src/qdrant_loader_mcp_server/search/enhanced/intent_classifier.py: 94%

330 statements  

coverage.py v7.10.0, created at 2025-07-25 11:38 +0000

1"""Intent-Aware Adaptive Search for Phase 2.2 Search Enhancement. 

2 

3This module implements advanced intent classification and adaptive search strategies 

4that leverage Phase 1.0 spaCy analysis and Phase 2.1 knowledge graph capabilities. 

5""" 

6 

7import logging 

8import time 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Any, Dict, List, Optional, Set, Tuple, Union 

12from collections import defaultdict, Counter 

13import math 

14 

15from ...utils.logging import LoggingConfig 

16from ..nlp.spacy_analyzer import SpaCyQueryAnalyzer, QueryAnalysis 

17from ..models import SearchResult 

18from .knowledge_graph import DocumentKnowledgeGraph, TraversalStrategy 

19 

20logger = LoggingConfig.get_logger(__name__) 

21 

22 

23class IntentType(Enum): 

24 """Types of search intents for adaptive search strategies.""" 

25 TECHNICAL_LOOKUP = "technical_lookup" # API docs, code examples, implementation 

26 BUSINESS_CONTEXT = "business_context" # Requirements, objectives, strategy 

27 VENDOR_EVALUATION = "vendor_evaluation" # Proposals, comparisons, criteria 

28 PROCEDURAL = "procedural" # How-to guides, step-by-step 

29 INFORMATIONAL = "informational" # What is, definitions, overviews 

30 EXPLORATORY = "exploratory" # Broad discovery, browsing 

31 TROUBLESHOOTING = "troubleshooting" # Error solving, debugging 

32 GENERAL = "general" # Fallback for unclear intent 

33 

34 

35@dataclass 

36class SearchIntent: 

37 """Container for classified search intent with confidence and context.""" 

38 

39 intent_type: IntentType 

40 confidence: float # 0.0 - 1.0 confidence score 

41 secondary_intents: List[Tuple[IntentType, float]] = field(default_factory=list) 

42 

43 # Linguistic evidence 

44 supporting_evidence: Dict[str, Any] = field(default_factory=dict) 

45 linguistic_features: Dict[str, Any] = field(default_factory=dict) 

46 

47 # Context information 

48 query_complexity: float = 0.0 # From spaCy analysis 

49 is_question: bool = False 

50 is_technical: bool = False 

51 

52 # Behavioral context 

53 session_context: Dict[str, Any] = field(default_factory=dict) 

54 previous_intents: List[IntentType] = field(default_factory=list) 

55 

56 # Processing metadata 

57 classification_time_ms: float = 0.0 

58 

59 

60@dataclass 

61class AdaptiveSearchConfig: 

62 """Configuration for adaptive search based on intent.""" 

63 

64 # Core search parameters 

65 search_strategy: str = "hybrid" # hybrid, vector, keyword 

66 vector_weight: float = 0.7 # Weight for vector search 

67 keyword_weight: float = 0.3 # Weight for keyword search 

68 

69 # Knowledge graph integration 

70 use_knowledge_graph: bool = False 

71 kg_traversal_strategy: TraversalStrategy = TraversalStrategy.SEMANTIC 

72 max_graph_hops: int = 2 

73 kg_expansion_weight: float = 0.2 

74 

75 # Result filtering and ranking 

76 result_filters: Dict[str, Any] = field(default_factory=dict) 

77 ranking_boosts: Dict[str, float] = field(default_factory=dict) 

78 source_type_preferences: Dict[str, float] = field(default_factory=dict) 

79 

80 # Query expansion 

81 expand_query: bool = True 

82 expansion_aggressiveness: float = 0.3 # 0.0 - 1.0 

83 semantic_expansion: bool = True 

84 entity_expansion: bool = True 

85 

86 # Performance tuning 

87 max_results: int = 20 

88 min_score_threshold: float = 0.1 

89 diversity_factor: float = 0.0 # 0.0 = relevance only, 1.0 = max diversity 

90 

91 # Contextual parameters 

92 temporal_bias: float = 0.0 # Bias toward recent content 

93 authority_bias: float = 0.0 # Bias toward authoritative sources 

94 personal_bias: float = 0.0 # Bias toward user's previous interests 

95 

96 
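
# Example (hypothetical usage sketch; `vector_score` and `keyword_score` are
# assumed per-result scores from the vector and keyword backends):
#
#     config = AdaptiveSearchConfig(vector_weight=0.8, keyword_weight=0.2)
#     final_score = (config.vector_weight * vector_score
#                    + config.keyword_weight * keyword_score)
#
# This is how a hybrid searcher would presumably combine the two retrieval
# signals under this configuration.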


class IntentClassifier:
    """Advanced intent classification using spaCy analysis and behavioral patterns."""

    def __init__(self, spacy_analyzer: SpaCyQueryAnalyzer):
        """Initialize the intent classifier."""
        self.spacy_analyzer = spacy_analyzer
        self.logger = LoggingConfig.get_logger(__name__)

        # Intent classification patterns using spaCy linguistic features
        self.intent_patterns = {
            IntentType.TECHNICAL_LOOKUP: {
                "keywords": {
                    "api", "apis", "endpoint", "endpoints", "function", "functions",
                    "method", "methods", "class", "classes", "library", "libraries",
                    "framework", "frameworks", "code", "implementation", "syntax",
                    "documentation", "docs", "reference", "specification", "protocol"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "API documentation"
                    ["ADJ", "NOUN"],   # "REST API"
                    ["VERB", "NOUN"],  # "implement authentication"
                    ["NOUN", "VERB"],  # "code example"
                ],
                "entity_types": {"PRODUCT", "ORG", "LANGUAGE"},
                "question_words": {"how", "what"},
                "linguistic_indicators": {
                    "has_code_terms": True,
                    "technical_complexity": 0.6,
                    "verb_imperative": True
                },
                "weight": 1.0
            },

            IntentType.BUSINESS_CONTEXT: {
                "keywords": {
                    "requirements", "requirement", "objectives", "objective", "goals", "goal",
                    "strategy", "strategies", "business", "scope", "stakeholder", "stakeholders",
                    "budget", "timeline", "deliverable", "deliverables", "milestone",
                    "criteria", "specification", "specifications", "priority", "priorities"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],         # "business requirements"
                    ["ADJ", "NOUN"],          # "functional requirements"
                    ["MODAL", "VERB"],        # "should implement"
                    ["DET", "NOUN", "VERB"],  # "the system should"
                ],
                "entity_types": {"ORG", "MONEY", "PERCENT", "CARDINAL"},
                "question_words": {"what", "why", "which"},
                "linguistic_indicators": {
                    "has_business_terms": True,
                    "formal_language": True,
                    "future_tense": True
                },
                "weight": 1.0
            },

            IntentType.VENDOR_EVALUATION: {
                "keywords": {
                    "vendor", "vendors", "supplier", "suppliers", "proposal", "proposals",
                    "bid", "bids", "quote", "quotes", "cost", "costs", "price", "pricing",
                    "comparison", "compare", "evaluate", "evaluation", "criteria",
                    "selection", "recommendation", "assessment", "analysis"
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],         # "vendor proposal"
                    ["VERB", "NOUN"],         # "compare vendors"
                    ["ADJ", "NOUN"],          # "best vendor"
                    ["NOUN", "VERB", "ADJ"],  # "vendor is better"
                ],
                "entity_types": {"ORG", "MONEY", "PERSON"},
                "question_words": {"which", "who", "what", "how much"},
                "linguistic_indicators": {
                    "has_comparison": True,
                    "has_evaluation_terms": True,
                    "superlative_forms": True
                },
                "weight": 1.0
            },

            IntentType.PROCEDURAL: {
                "keywords": {
                    "how", "steps", "step", "process", "procedure", "guide", "tutorial",
                    "walkthrough", "instructions", "setup", "configure", "install",
                    "deploy", "implement", "create", "build", "make", "do"
                },
                "pos_patterns": [
                    ["VERB", "NOUN"],         # "install package"
                    ["VERB", "DET", "NOUN"],  # "setup the system"
                    ["ADV", "VERB"],          # "how configure"
                    ["NOUN", "VERB"],         # "steps install"
                ],
                "entity_types": set(),
                "question_words": {"how", "when", "where"},
                "linguistic_indicators": {
                    "imperative_mood": True,
                    "action_oriented": True,
                    "sequential_indicators": True
                },
                "weight": 1.0
            },

            IntentType.INFORMATIONAL: {
                "keywords": {
                    "what", "definition", "meaning", "explain", "overview", "about",
                    "introduction", "basics", "fundamentals", "concept", "concepts",
                    "understand", "learn", "know", "information", "details"
                },
                "pos_patterns": [
                    ["NOUN"],          # "authentication"
                    ["ADJ", "NOUN"],   # "basic concept"
                    ["VERB", "NOUN"],  # "understand API"
                    ["NOUN", "VERB"],  # "concept explains"
                ],
                "entity_types": set(),
                "question_words": {"what", "who", "when", "where"},
                "linguistic_indicators": {
                    "knowledge_seeking": True,
                    "present_tense": True,
                    "general_terms": True
                },
                "weight": 1.0
            },

            IntentType.TROUBLESHOOTING: {
                "keywords": {
                    "error", "errors", "problem", "problems", "issue", "issues",
                    "bug", "bugs", "fix", "fixes", "solve", "solution", "solutions",
                    "troubleshoot", "debug", "debugging", "failed", "failing",
                    "broken", "not working", "doesn't work"
                },
                "pos_patterns": [
                    ["NOUN", "VERB"],  # "error occurs"
                    ["VERB", "NOUN"],  # "fix error"
                    ["ADJ", "NOUN"],   # "broken system"
                    ["NOUN", "ADJ"],   # "system broken"
                ],
                "entity_types": set(),
                "question_words": {"why", "how", "what"},
                "linguistic_indicators": {
                    "negative_sentiment": True,
                    "problem_indicators": True,
                    "past_tense": True
                },
                "weight": 1.0
            },

            IntentType.EXPLORATORY: {
                "keywords": {
                    "explore", "discover", "find", "search", "browse", "look",
                    "see", "show", "list", "available", "options", "alternatives",
                    "similar", "related", "examples", "samples"
                },
                "pos_patterns": [
                    ["VERB"],          # "explore"
                    ["VERB", "NOUN"],  # "find examples"
                    ["ADJ", "NOUN"],   # "similar tools"
                    ["DET", "NOUN"],   # "some options"
                ],
                "entity_types": set(),
                "question_words": {"what", "which"},
                "linguistic_indicators": {
                    "open_ended": True,
                    "discovery_oriented": True,
                    "broad_scope": True
                },
                "weight": 0.8
            }
        }

        # Behavioral pattern recognition
        self.session_patterns = {
            "technical_session": [IntentType.TECHNICAL_LOOKUP, IntentType.PROCEDURAL],
            "business_session": [IntentType.BUSINESS_CONTEXT, IntentType.VENDOR_EVALUATION],
            "learning_session": [IntentType.INFORMATIONAL, IntentType.EXPLORATORY, IntentType.PROCEDURAL],
            "problem_solving": [IntentType.TROUBLESHOOTING, IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP]
        }

        # Cache for intent classification results
        self._intent_cache: Dict[str, SearchIntent] = {}

        logger.info("Initialized intent classifier with spaCy integration")

    def classify_intent(
        self,
        query: str,
        session_context: Optional[Dict[str, Any]] = None,
        behavioral_context: Optional[List[str]] = None
    ) -> SearchIntent:
        """Classify search intent using comprehensive spaCy analysis."""

        start_time = time.time()

        # Check cache first
        cache_key = f"{query}:{str(session_context)}:{str(behavioral_context)}"
        if cache_key in self._intent_cache:
            cached = self._intent_cache[cache_key]
            logger.debug(f"Using cached intent classification for: {query[:50]}...")
            return cached

        try:
            # Step 1: Perform spaCy semantic analysis (leveraging Phase 1.0)
            spacy_analysis = self.spacy_analyzer.analyze_query_semantic(query)

            # Step 2: Extract linguistic features for intent classification
            linguistic_features = self._extract_linguistic_features(spacy_analysis, query)

            # Step 3: Score each intent type using pattern matching
            intent_scores = self._score_intent_patterns(
                spacy_analysis, linguistic_features, query
            )

            # Step 4: Apply behavioral context weighting
            if behavioral_context:
                intent_scores = self._apply_behavioral_weighting(
                    intent_scores, behavioral_context
                )

            # Step 5: Apply session context boosting
            if session_context:
                intent_scores = self._apply_session_context(
                    intent_scores, session_context
                )

            # Step 6: Determine primary and secondary intents
            primary_intent, confidence = self._select_primary_intent(intent_scores)
            secondary_intents = self._select_secondary_intents(intent_scores, primary_intent)

            # Step 7: Build supporting evidence
            supporting_evidence = self._build_evidence(
                spacy_analysis, linguistic_features, intent_scores
            )

            # Step 8: Create intent result
            classification_time = (time.time() - start_time) * 1000

            search_intent = SearchIntent(
                intent_type=primary_intent,
                confidence=confidence,
                secondary_intents=secondary_intents,
                supporting_evidence=supporting_evidence,
                linguistic_features=linguistic_features,
                query_complexity=spacy_analysis.complexity_score,
                is_question=spacy_analysis.is_question,
                is_technical=spacy_analysis.is_technical,
                session_context=session_context or {},
                previous_intents=behavioral_context or [],
                classification_time_ms=classification_time
            )

            # Cache the result
            self._intent_cache[cache_key] = search_intent

            logger.debug(
                f"Classified intent in {classification_time:.2f}ms",
                query_length=len(query),
                primary_intent=primary_intent.value,
                confidence=confidence,
                secondary_count=len(secondary_intents)
            )

            return search_intent

        except Exception as e:
            logger.error(f"Intent classification failed: {e}")
            # Return fallback intent
            classification_time = (time.time() - start_time) * 1000
            return SearchIntent(
                intent_type=IntentType.GENERAL,
                confidence=0.5,
                classification_time_ms=classification_time
            )

    def _extract_linguistic_features(
        self,
        spacy_analysis: QueryAnalysis,
        query: str
    ) -> Dict[str, Any]:
        """Extract comprehensive linguistic features for intent classification."""

        features = {
            # Basic query characteristics
            "query_length": len(query.split()),
            "has_question_mark": "?" in query,
            "starts_with_question_word": False,
            "has_imperative_verbs": False,
            "has_modal_verbs": False,

            # spaCy-derived features
            "entity_count": len(spacy_analysis.entities),
            "concept_count": len(spacy_analysis.main_concepts),
            "keyword_count": len(spacy_analysis.semantic_keywords),
            "pos_diversity": len(set(spacy_analysis.pos_patterns)),

            # Semantic features
            "technical_indicators": 0,
            "business_indicators": 0,
            "procedural_indicators": 0,
            "problem_indicators": 0,

            # Entity type analysis
            "entity_types": [ent[1] for ent in spacy_analysis.entities],
            "has_org_entities": any(ent[1] == "ORG" for ent in spacy_analysis.entities),
            "has_product_entities": any(ent[1] == "PRODUCT" for ent in spacy_analysis.entities),
            "has_person_entities": any(ent[1] == "PERSON" for ent in spacy_analysis.entities),
            "has_money_entities": any(ent[1] == "MONEY" for ent in spacy_analysis.entities),
        }

        # Analyze question word patterns
        question_words = {"what", "how", "why", "when", "who", "where", "which", "whose"}
        query_lower = query.lower()
        first_word = query_lower.split()[0] if query_lower.split() else ""
        features["starts_with_question_word"] = first_word in question_words

        # Count technical, business, and procedural indicators
        technical_terms = {"api", "code", "function", "method", "library", "framework", "implementation"}
        business_terms = {"requirements", "objectives", "strategy", "business", "scope", "criteria"}
        procedural_terms = {"how", "steps", "process", "guide", "setup", "install", "configure"}
        problem_terms = {"error", "problem", "issue", "bug", "fix", "solve", "broken", "failed"}

        keywords_lower = [kw.lower() for kw in spacy_analysis.semantic_keywords]
        features["technical_indicators"] = sum(1 for term in technical_terms if term in keywords_lower)
        features["business_indicators"] = sum(1 for term in business_terms if term in keywords_lower)
        features["procedural_indicators"] = sum(1 for term in procedural_terms if term in keywords_lower)
        features["problem_indicators"] = sum(1 for term in problem_terms if term in keywords_lower)

        # POS pattern analysis (a query that opens with a question word is
        # interrogative rather than imperative, so it is excluded here)
        pos_patterns = spacy_analysis.pos_patterns
        features["has_imperative_verbs"] = "VERB" in pos_patterns and not features["starts_with_question_word"]
        features["has_modal_verbs"] = any(pos in ["MD", "MODAL"] for pos in pos_patterns)

        return features


    def _score_intent_patterns(
        self,
        spacy_analysis: QueryAnalysis,
        linguistic_features: Dict[str, Any],
        query: str
    ) -> Dict[IntentType, float]:
        """Score each intent type using pattern matching."""

        intent_scores = {}
        query_words = set(query.lower().split())
        keywords_set = set(kw.lower() for kw in spacy_analysis.semantic_keywords)

        for intent_type, pattern in self.intent_patterns.items():
            score = 0.0

            # 1. Keyword matching (40% weight)
            keyword_matches = len(keywords_set.intersection(pattern["keywords"]))
            keyword_score = keyword_matches / max(len(pattern["keywords"]), 1)
            score += keyword_score * 0.4

            # 2. POS pattern matching (25% weight)
            pos_score = self._match_pos_patterns(
                spacy_analysis.pos_patterns, pattern["pos_patterns"]
            )
            score += pos_score * 0.25

            # 3. Entity type matching (20% weight)
            entity_score = self._match_entity_types(
                spacy_analysis.entities, pattern["entity_types"]
            )
            score += entity_score * 0.20

            # 4. Question word matching (10% weight)
            question_score = self._match_question_words(query, pattern["question_words"])
            score += question_score * 0.10

            # 5. Linguistic indicator bonus (5% weight)
            indicator_score = self._match_linguistic_indicators(
                linguistic_features, pattern.get("linguistic_indicators", {})
            )
            score += indicator_score * 0.05

            # Apply pattern weight
            score *= pattern.get("weight", 1.0)

            intent_scores[intent_type] = score

        return intent_scores
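
    # Worked example (hypothetical numbers): if a query matches 3 of the 21
    # TROUBLESHOOTING keywords, 2 of its 4 POS patterns, none of its entity
    # types, 2 of its 3 question words, and 1 of its 3 linguistic indicators,
    # its score is 3/21 * 0.40 + 2/4 * 0.25 + 0 * 0.20 + 2/3 * 0.10
    # + 1/3 * 0.05 ≈ 0.27, scaled by the pattern weight (1.0 here).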


    def _match_pos_patterns(
        self,
        query_pos: List[str],
        target_patterns: List[List[str]]
    ) -> float:
        """Match POS tag patterns in the query."""
        if not target_patterns or not query_pos:
            return 0.0

        matches = 0
        total_patterns = len(target_patterns)

        for pattern in target_patterns:
            if self._contains_pos_sequence(query_pos, pattern):
                matches += 1

        return matches / total_patterns

    def _contains_pos_sequence(self, pos_tags: List[str], sequence: List[str]) -> bool:
        """Check if POS sequence exists in the query."""
        if len(sequence) > len(pos_tags):
            return False

        for i in range(len(pos_tags) - len(sequence) + 1):
            if pos_tags[i:i + len(sequence)] == sequence:
                return True

        return False

    def _match_entity_types(
        self,
        query_entities: List[Tuple[str, str]],
        target_types: Set[str]
    ) -> float:
        """Match entity types in the query."""
        if not target_types:
            return 0.0

        query_entity_types = set(ent[1] for ent in query_entities)
        matches = len(query_entity_types.intersection(target_types))

        return matches / len(target_types)

    def _match_question_words(self, query: str, target_words: Set[str]) -> float:
        """Match question words in the query."""
        if not target_words:
            return 0.0

        query_words = set(query.lower().split())
        matches = len(query_words.intersection(target_words))

        return matches / len(target_words)

    def _match_linguistic_indicators(
        self,
        features: Dict[str, Any],
        target_indicators: Dict[str, Any]
    ) -> float:
        """Match linguistic indicators."""
        if not target_indicators:
            return 0.0

        score = 0.0
        total_indicators = len(target_indicators)

        for indicator, expected_value in target_indicators.items():
            if indicator in features:
                if isinstance(expected_value, bool):
                    if features[indicator] == expected_value:
                        score += 1.0
                elif isinstance(expected_value, (int, float)):
                    # For numeric indicators, use similarity
                    actual_value = features.get(indicator, 0)
                    if isinstance(actual_value, (int, float)):
                        similarity = 1.0 - abs(actual_value - expected_value) / max(expected_value, 1.0)
                        score += max(0.0, similarity)

        return score / max(total_indicators, 1)
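
    # Worked example (hypothetical numbers): for a numeric indicator such as
    # "technical_complexity" with expected value 0.6, an actual value of 0.45
    # gives similarity = 1.0 - |0.45 - 0.6| / max(0.6, 1.0) = 0.85, which is
    # then averaged over the total number of indicators.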


    def _apply_behavioral_weighting(
        self,
        intent_scores: Dict[IntentType, float],
        behavioral_context: List[str]
    ) -> Dict[IntentType, float]:
        """Apply behavioral context weighting to intent scores."""

        if not behavioral_context:
            return intent_scores

        # Convert string intents to IntentType
        previous_intents = []
        for intent_str in behavioral_context[-5:]:  # Last 5 intents
            try:
                previous_intents.append(IntentType(intent_str))
            except ValueError:
                continue

        if not previous_intents:
            return intent_scores

        weighted_scores = intent_scores.copy()

        # Boost scores for intents that commonly follow previous intents
        intent_transitions = {
            IntentType.INFORMATIONAL: [IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP],
            IntentType.TECHNICAL_LOOKUP: [IntentType.PROCEDURAL, IntentType.TROUBLESHOOTING],
            IntentType.BUSINESS_CONTEXT: [IntentType.VENDOR_EVALUATION, IntentType.TECHNICAL_LOOKUP],
            IntentType.VENDOR_EVALUATION: [IntentType.BUSINESS_CONTEXT, IntentType.TECHNICAL_LOOKUP],
            IntentType.PROCEDURAL: [IntentType.TROUBLESHOOTING, IntentType.TECHNICAL_LOOKUP],
            IntentType.TROUBLESHOOTING: [IntentType.PROCEDURAL, IntentType.TECHNICAL_LOOKUP]
        }

        most_recent_intent = previous_intents[-1]
        likely_next_intents = intent_transitions.get(most_recent_intent, [])

        for intent_type in likely_next_intents:
            if intent_type in weighted_scores:
                weighted_scores[intent_type] *= 1.2  # 20% boost

        # Apply session pattern recognition
        for pattern_name, pattern_intents in self.session_patterns.items():
            pattern_match_score = sum(
                1 for intent in previous_intents if intent in pattern_intents
            ) / len(pattern_intents)

            if pattern_match_score > 0.5:  # More than half of pattern matched
                for intent_type in pattern_intents:
                    if intent_type in weighted_scores:
                        weighted_scores[intent_type] *= (1.0 + pattern_match_score * 0.3)

        return weighted_scores

    def _apply_session_context(
        self,
        intent_scores: Dict[IntentType, float],
        session_context: Dict[str, Any]
    ) -> Dict[IntentType, float]:
        """Apply session context to intent scores."""

        weighted_scores = intent_scores.copy()

        # Apply domain context boosting
        domain = session_context.get("domain", "")
        if domain == "technical":
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.3
            weighted_scores[IntentType.PROCEDURAL] *= 1.2
        elif domain == "business":
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.3
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.2

        # Apply user role context
        user_role = session_context.get("user_role", "")
        if user_role in ["developer", "engineer", "architect"]:
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.2
            weighted_scores[IntentType.PROCEDURAL] *= 1.1
        elif user_role in ["manager", "analyst", "consultant"]:
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.2
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.1

        # Apply urgency context
        urgency = session_context.get("urgency", "normal")
        if urgency == "high":
            weighted_scores[IntentType.TROUBLESHOOTING] *= 1.4
            weighted_scores[IntentType.PROCEDURAL] *= 1.2

        return weighted_scores

    def _select_primary_intent(
        self,
        intent_scores: Dict[IntentType, float]
    ) -> Tuple[IntentType, float]:
        """Select the primary intent with highest confidence."""

        if not intent_scores:
            return IntentType.GENERAL, 0.5

        # Find the highest scoring intent
        primary_intent = max(intent_scores, key=intent_scores.get)
        raw_score = intent_scores[primary_intent]

        # Normalize confidence score
        total_score = sum(intent_scores.values())
        confidence = raw_score / max(total_score, 1.0)

        # Apply confidence threshold
        if confidence < 0.3:
            return IntentType.GENERAL, confidence

        return primary_intent, confidence
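
    # Worked example (hypothetical numbers): with raw scores
    # {TECHNICAL_LOOKUP: 0.40, PROCEDURAL: 0.20, INFORMATIONAL: 0.15} and the
    # remaining intents totaling 0.25, the primary intent is TECHNICAL_LOOKUP
    # with confidence 0.40 / max(1.0, 1.0) = 0.40, clearing the 0.3 threshold.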


    def _select_secondary_intents(
        self,
        intent_scores: Dict[IntentType, float],
        primary_intent: IntentType
    ) -> List[Tuple[IntentType, float]]:
        """Select secondary intents with meaningful confidence."""

        secondary_intents = []

        # Sort intents by score, excluding primary
        sorted_intents = sorted(
            [(intent, score) for intent, score in intent_scores.items() if intent != primary_intent],
            key=lambda x: x[1],
            reverse=True
        )

        # Include intents with score > 30% of primary intent score
        primary_score = intent_scores[primary_intent]
        threshold = primary_score * 0.3

        for intent, score in sorted_intents[:3]:  # Max 3 secondary intents
            if score >= threshold:
                confidence = score / max(sum(intent_scores.values()), 1.0)
                secondary_intents.append((intent, confidence))

        return secondary_intents

    def _build_evidence(
        self,
        spacy_analysis: QueryAnalysis,
        linguistic_features: Dict[str, Any],
        intent_scores: Dict[IntentType, float]
    ) -> Dict[str, Any]:
        """Build supporting evidence for the intent classification."""

        return {
            "spacy_processing_time": spacy_analysis.processing_time_ms,
            "query_complexity": spacy_analysis.complexity_score,
            "semantic_keywords": spacy_analysis.semantic_keywords[:5],  # Top 5
            "extracted_entities": [ent[0] for ent in spacy_analysis.entities[:3]],  # Top 3
            "main_concepts": spacy_analysis.main_concepts[:3],  # Top 3
            "intent_signals": spacy_analysis.intent_signals,
            "linguistic_features": {
                "technical_indicators": linguistic_features.get("technical_indicators", 0),
                "business_indicators": linguistic_features.get("business_indicators", 0),
                "procedural_indicators": linguistic_features.get("procedural_indicators", 0),
                "problem_indicators": linguistic_features.get("problem_indicators", 0)
            },
            "top_intent_scores": dict(sorted(intent_scores.items(), key=lambda x: x[1], reverse=True)[:3])
        }

    def clear_cache(self):
        """Clear intent classification cache."""
        self._intent_cache.clear()
        logger.debug("Cleared intent classification cache")

    def get_cache_stats(self) -> Dict[str, int]:
        """Get cache statistics."""
        return {
            "intent_cache_size": len(self._intent_cache),
        }
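
# Example usage sketch (hypothetical; assumes SpaCyQueryAnalyzer can be
# constructed without arguments and that its spaCy model is available):
#
#     analyzer = SpaCyQueryAnalyzer()
#     classifier = IntentClassifier(analyzer)
#     intent = classifier.classify_intent(
#         "how do I configure the REST API endpoint?",
#         session_context={"domain": "technical", "user_role": "developer"},
#     )
#     print(intent.intent_type, intent.confidence, intent.secondary_intents)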


class AdaptiveSearchStrategy:
    """Adaptive search strategy that configures search based on classified intent."""

    def __init__(self, knowledge_graph: Optional[DocumentKnowledgeGraph] = None):
        """Initialize the adaptive search strategy."""
        self.knowledge_graph = knowledge_graph
        self.logger = LoggingConfig.get_logger(__name__)

        # Define intent-specific search configurations
        self.intent_configs = {
            IntentType.TECHNICAL_LOOKUP: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.8,  # Higher vector weight for semantic similarity
                keyword_weight=0.2,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.SEMANTIC,
                max_graph_hops=2,
                kg_expansion_weight=0.3,
                result_filters={"content_type": ["code", "documentation", "technical"]},
                ranking_boosts={"source_type": {"git": 1.4, "confluence": 1.2}},
                source_type_preferences={"git": 1.5, "documentation": 1.3},
                expand_query=True,
                expansion_aggressiveness=0.4,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=25,
                min_score_threshold=0.15,
                authority_bias=0.3
            ),

            IntentType.BUSINESS_CONTEXT: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.6,  # Balanced approach
                keyword_weight=0.4,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.WEIGHTED,
                max_graph_hops=3,
                kg_expansion_weight=0.2,
                result_filters={"content_type": ["requirements", "business", "strategy"]},
                ranking_boosts={"section_type": {"requirements": 1.5, "objectives": 1.4}},
                source_type_preferences={"confluence": 1.4, "documentation": 1.2},
                expand_query=True,
                expansion_aggressiveness=0.3,
                semantic_expansion=True,
                entity_expansion=False,
                max_results=20,
                min_score_threshold=0.1,
                authority_bias=0.4
            ),

            IntentType.VENDOR_EVALUATION: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.5,  # Equal weight for structured comparison
                keyword_weight=0.5,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.CENTRALITY,
                max_graph_hops=2,
                kg_expansion_weight=0.25,
                result_filters={"content_type": ["proposal", "evaluation", "comparison"]},
                ranking_boosts={"has_money_entities": 1.3, "has_org_entities": 1.2},
                source_type_preferences={"confluence": 1.3, "documentation": 1.1},
                expand_query=True,
                expansion_aggressiveness=0.35,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=15,
                min_score_threshold=0.12,
                diversity_factor=0.3,  # Encourage diverse vendor options
                authority_bias=0.2
            ),

            IntentType.PROCEDURAL: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.7,  # Higher semantic matching for procedures
                keyword_weight=0.3,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.BREADTH_FIRST,
                max_graph_hops=2,
                kg_expansion_weight=0.2,
                result_filters={"content_type": ["guide", "tutorial", "procedure"]},
                ranking_boosts={"section_type": {"steps": 1.5, "procedure": 1.4, "guide": 1.3}},
                source_type_preferences={"documentation": 1.4, "git": 1.2},
                expand_query=True,
                expansion_aggressiveness=0.25,
                semantic_expansion=True,
                entity_expansion=False,
                max_results=15,
                min_score_threshold=0.15,
                temporal_bias=0.2  # Prefer recent procedures
            ),

            IntentType.INFORMATIONAL: AdaptiveSearchConfig(
                search_strategy="vector",  # Vector-first for conceptual understanding
                vector_weight=0.9,
                keyword_weight=0.1,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.SEMANTIC,
                max_graph_hops=3,
                kg_expansion_weight=0.4,  # More expansion for discovery
                result_filters={},
                ranking_boosts={"section_type": {"overview": 1.4, "introduction": 1.3}},
                source_type_preferences={"documentation": 1.3, "confluence": 1.1},
                expand_query=True,
                expansion_aggressiveness=0.5,  # Aggressive expansion for discovery
                semantic_expansion=True,
                entity_expansion=True,
                max_results=30,
                min_score_threshold=0.05,
                diversity_factor=0.4,  # Encourage diverse perspectives
                authority_bias=0.3
            ),

            IntentType.TROUBLESHOOTING: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.6,
                keyword_weight=0.4,  # Higher keyword weight for specific errors
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.WEIGHTED,
                max_graph_hops=2,
                kg_expansion_weight=0.15,
                result_filters={"content_type": ["troubleshooting", "fix", "solution"]},
                ranking_boosts={"has_problem_indicators": 1.4, "section_type": {"solution": 1.5}},
                source_type_preferences={"git": 1.3, "documentation": 1.2},
                expand_query=False,  # Don't expand error-specific queries
                expansion_aggressiveness=0.1,
                semantic_expansion=False,
                entity_expansion=False,
                max_results=10,
                min_score_threshold=0.2,
                temporal_bias=0.3  # Prefer recent solutions
            ),

            IntentType.EXPLORATORY: AdaptiveSearchConfig(
                search_strategy="vector",  # Vector-first for exploration
                vector_weight=0.85,
                keyword_weight=0.15,
                use_knowledge_graph=True,
                kg_traversal_strategy=TraversalStrategy.BREADTH_FIRST,
                max_graph_hops=4,  # Deeper exploration
                kg_expansion_weight=0.5,  # Maximum expansion
                result_filters={},
                ranking_boosts={},
                source_type_preferences={},
                expand_query=True,
                expansion_aggressiveness=0.6,  # Very aggressive expansion
                semantic_expansion=True,
                entity_expansion=True,
                max_results=40,  # More results for exploration
                min_score_threshold=0.03,  # Lower threshold
                diversity_factor=0.6,  # Maximum diversity
                authority_bias=0.1
            ),

            # Fallback configuration
            IntentType.GENERAL: AdaptiveSearchConfig(
                search_strategy="hybrid",
                vector_weight=0.7,
                keyword_weight=0.3,
                use_knowledge_graph=False,
                expand_query=True,
                expansion_aggressiveness=0.3,
                semantic_expansion=True,
                entity_expansion=True,
                max_results=20,
                min_score_threshold=0.1
            )
        }

        logger.info("Initialized adaptive search strategy with intent-specific configurations")

    def adapt_search(
        self,
        search_intent: SearchIntent,
        query: str,
        base_results: Optional[List[SearchResult]] = None
    ) -> AdaptiveSearchConfig:
        """Adapt search configuration based on classified intent."""

        try:
            # Get base configuration for the primary intent
            config = self._get_base_config(search_intent.intent_type)

            # Apply confidence-based adjustments
            config = self._apply_confidence_adjustments(config, search_intent)

            # Apply secondary intent blending
            if search_intent.secondary_intents:
                config = self._blend_secondary_intents(config, search_intent.secondary_intents)

            # Apply query-specific adaptations
            config = self._apply_query_adaptations(config, search_intent, query)

            # Apply session context adaptations
            if search_intent.session_context:
                config = self._apply_session_adaptations(config, search_intent.session_context)

            logger.debug(
                f"Adapted search configuration for {search_intent.intent_type.value}",
                confidence=search_intent.confidence,
                vector_weight=config.vector_weight,
                use_kg=config.use_knowledge_graph,
                max_results=config.max_results
            )

            return config

        except Exception as e:
            logger.error(f"Failed to adapt search configuration: {e}")
            return self.intent_configs[IntentType.GENERAL]

    def _get_base_config(self, intent_type: IntentType) -> AdaptiveSearchConfig:
        """Get base configuration for intent type."""
        return self.intent_configs.get(intent_type, self.intent_configs[IntentType.GENERAL])

    def _apply_confidence_adjustments(
        self,
        config: AdaptiveSearchConfig,
        search_intent: SearchIntent
    ) -> AdaptiveSearchConfig:
        """Apply confidence-based adjustments to the configuration."""

        # Low confidence: reduce aggressiveness, increase diversity
        if search_intent.confidence < 0.5:
            config.expansion_aggressiveness *= 0.7
            config.diversity_factor = min(1.0, config.diversity_factor + 0.2)
            config.min_score_threshold *= 0.8

        # High confidence: increase precision, reduce diversity
        elif search_intent.confidence > 0.8:
            config.expansion_aggressiveness *= 1.3
            config.diversity_factor *= 0.7
            config.min_score_threshold *= 1.2

        return config

    def _blend_secondary_intents(
        self,
        config: AdaptiveSearchConfig,
        secondary_intents: List[Tuple[IntentType, float]]
    ) -> AdaptiveSearchConfig:
        """Blend secondary intent configurations with primary."""

        for intent_type, confidence in secondary_intents:
            if confidence > 0.3:  # Only blend significant secondary intents
                secondary_config = self.intent_configs.get(intent_type)
                if secondary_config:
                    blend_factor = confidence * 0.3  # Max 30% blending

                    # Blend key parameters
                    config.vector_weight = (
                        config.vector_weight * (1 - blend_factor) +
                        secondary_config.vector_weight * blend_factor
                    )
                    config.expansion_aggressiveness = (
                        config.expansion_aggressiveness * (1 - blend_factor) +
                        secondary_config.expansion_aggressiveness * blend_factor
                    )
                    config.diversity_factor = max(
                        config.diversity_factor,
                        secondary_config.diversity_factor * blend_factor
                    )

        return config
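
    # Worked example (hypothetical numbers): a secondary intent at confidence
    # 0.5 gives blend_factor = 0.15, so a primary vector_weight of 0.8 blended
    # with a secondary vector_weight of 0.5 becomes
    # 0.8 * 0.85 + 0.5 * 0.15 = 0.755.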


    def _apply_query_adaptations(
        self,
        config: AdaptiveSearchConfig,
        search_intent: SearchIntent,
        query: str
    ) -> AdaptiveSearchConfig:
        """Apply query-specific adaptations."""

        # Short queries: increase expansion
        if len(query.split()) <= 3:
            config.expansion_aggressiveness *= 1.4
            config.semantic_expansion = True

        # Long queries: reduce expansion, increase precision
        elif len(query.split()) >= 8:
            config.expansion_aggressiveness *= 0.7
            config.min_score_threshold *= 1.2

        # Very complex queries: use knowledge graph more aggressively
        if search_intent.query_complexity > 0.7:
            config.use_knowledge_graph = True
            config.kg_expansion_weight *= 1.3
            config.max_graph_hops = min(4, config.max_graph_hops + 1)

        # Question queries: increase semantic weight
        if search_intent.is_question:
            config.vector_weight = min(0.9, config.vector_weight + 0.1)
            config.semantic_expansion = True

        # Technical queries: boost technical sources
        if search_intent.is_technical:
            config.source_type_preferences["git"] = config.source_type_preferences.get("git", 1.0) * 1.2
            config.authority_bias *= 1.2

        return config

    def _apply_session_adaptations(
        self,
        config: AdaptiveSearchConfig,
        session_context: Dict[str, Any]
    ) -> AdaptiveSearchConfig:
        """Apply session context adaptations."""

        # Time-sensitive sessions: increase temporal bias
        if session_context.get("urgency") == "high":
            config.temporal_bias = min(1.0, config.temporal_bias + 0.3)
            config.max_results = min(15, config.max_results)

        # Learning sessions: increase diversity and expansion
        session_type = session_context.get("session_type", "")
        if session_type == "learning":
            config.diversity_factor = min(1.0, config.diversity_factor + 0.2)
            config.expansion_aggressiveness *= 1.2
            config.max_results = min(30, config.max_results + 5)

        # Focused sessions: increase precision
        elif session_type == "focused":
            config.min_score_threshold *= 1.3
            config.expansion_aggressiveness *= 0.8
            config.max_results = max(10, config.max_results - 5)

        # User experience level
        experience_level = session_context.get("experience_level", "intermediate")
        if experience_level == "beginner":
            config.source_type_preferences["documentation"] = 1.4
            config.ranking_boosts["section_type"] = {"introduction": 1.5, "overview": 1.4}
        elif experience_level == "expert":
            config.source_type_preferences["git"] = 1.3
            config.ranking_boosts["section_type"] = {"implementation": 1.4, "advanced": 1.3}

        return config

    def get_strategy_stats(self) -> Dict[str, Any]:
        """Get adaptive search strategy statistics."""
        return {
            "intent_types_supported": len(self.intent_configs),
            "has_knowledge_graph": self.knowledge_graph is not None,
            "strategy_types": list(set(config.search_strategy for config in self.intent_configs.values())),
            "traversal_strategies": list(set(
                config.kg_traversal_strategy.value
                for config in self.intent_configs.values()
                if config.use_knowledge_graph
            ))
        }
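

# Minimal demo sketch (assumptions: SpaCyQueryAnalyzer takes no required
# constructor arguments and its spaCy model is installed; adjust as needed):
if __name__ == "__main__":
    analyzer = SpaCyQueryAnalyzer()
    classifier = IntentClassifier(analyzer)
    strategy = AdaptiveSearchStrategy()

    demo_query = "why does the deployment keep failing?"
    intent = classifier.classify_intent(demo_query)
    config = strategy.adapt_search(intent, demo_query)

    print(f"intent={intent.intent_type.value} confidence={intent.confidence:.2f}")
    print(f"strategy={config.search_strategy} vector_weight={config.vector_weight}")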