Coverage for src/qdrant_loader_mcp_server/search/enhanced/intent/classifier.py: 91% (211 statements)


1""" 

2Intent Classification Engine for Search Enhancement. 

3 

4This module implements the main IntentClassifier that uses spaCy analysis and 

5behavioral patterns to classify search intents with high accuracy. 

6""" 

7 

8from __future__ import annotations 

9 

10import time 

11from typing import TYPE_CHECKING, Any 

12 

13from ....utils.logging import LoggingConfig 

14from .models import IntentType, SearchIntent 

15 

16_SPACY_IMPORT_ERROR: BaseException | None = None 

17 

18if TYPE_CHECKING: 

19 from ...nlp.spacy_analyzer import QueryAnalysis, SpaCyQueryAnalyzer 

20else: 

21 try: 

22 from ...nlp.spacy_analyzer import QueryAnalysis, SpaCyQueryAnalyzer 

23 except ( 

24 ImportError, 

25 ModuleNotFoundError, 

26 ) as _exc: # pragma: no cover - optional dep 

27 # Provide safe sentinels for runtime to avoid NameErrors in annotations 

28 QueryAnalysis = Any # type: ignore[assignment] 

29 SpaCyQueryAnalyzer = Any # type: ignore[assignment] 

30 _SPACY_IMPORT_ERROR = _exc 

31 

32logger = LoggingConfig.get_logger(__name__) 


class IntentClassifier:
    """Advanced intent classification using spaCy analysis and behavioral patterns."""

    def __init__(self, spacy_analyzer):
        """Initialize the intent classifier.

        The constructor requires an explicitly injected spaCy analyzer; no
        ad-hoc runtime import is attempted. If no analyzer instance is
        provided, it raises an ImportError with actionable guidance so callers
        fail fast rather than encountering None-attribute errors later.
        """
        if spacy_analyzer is None:
            # Do not perform ad-hoc imports here; require explicit injection
            if _SPACY_IMPORT_ERROR is not None:
                raise ImportError(
                    "SpaCyQueryAnalyzer is not available. Install optional NLP deps (spacy and model) "
                    "and provide an initialized analyzer instance to IntentClassifier."
                ) from _SPACY_IMPORT_ERROR
            raise ImportError(
                "A spaCy analyzer instance must be provided to IntentClassifier. "
                "Use SpaCyQueryAnalyzer() and pass it explicitly."
            )
        self.spacy_analyzer = spacy_analyzer

        # Final sanity check to fail fast if analyzer is misconfigured
        if not hasattr(self.spacy_analyzer, "analyze_query_semantic"):
            raise ImportError(
                "Provided spaCy analyzer does not implement 'analyze_query_semantic'. "
                "Pass a compatible analyzer or install the default SpaCyQueryAnalyzer."
            )
        self.logger = LoggingConfig.get_logger(__name__)

        # Intent classification patterns using spaCy linguistic features
        self.intent_patterns = {
            IntentType.TECHNICAL_LOOKUP: {
                "keywords": {
                    "api", "apis", "endpoint", "endpoints", "function",
                    "functions", "method", "methods", "class", "classes",
                    "library", "libraries", "framework", "frameworks",
                    "code", "implementation", "syntax", "documentation",
                    "docs", "reference", "specification", "protocol",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "API documentation"
                    ["ADJ", "NOUN"],  # "REST API"
                    ["VERB", "NOUN"],  # "implement authentication"
                    ["NOUN", "VERB"],  # "code example"
                ],
                "entity_types": {"PRODUCT", "ORG", "LANGUAGE"},
                "question_words": {"how", "what"},
                "linguistic_indicators": {
                    "has_code_terms": True,
                    "technical_complexity": 0.6,
                    "verb_imperative": True,
                },
                "weight": 1.0,
            },
            IntentType.BUSINESS_CONTEXT: {
                "keywords": {
                    "requirements", "requirement", "objectives", "objective",
                    "goals", "goal", "strategy", "strategies", "business",
                    "scope", "stakeholder", "stakeholders", "budget",
                    "timeline", "deliverable", "deliverables", "milestone",
                    "criteria", "specification", "specifications",
                    "priority", "priorities",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "business requirements"
                    ["ADJ", "NOUN"],  # "functional requirements"
                    ["MODAL", "VERB"],  # "should implement"
                    ["DET", "NOUN", "VERB"],  # "the system should"
                ],
                "entity_types": {"ORG", "MONEY", "PERCENT", "CARDINAL"},
                "question_words": {"what", "why", "which"},
                "linguistic_indicators": {
                    "has_business_terms": True,
                    "formal_language": True,
                    "future_tense": True,
                },
                "weight": 1.0,
            },
            IntentType.VENDOR_EVALUATION: {
                "keywords": {
                    "vendor", "vendors", "supplier", "suppliers", "proposal",
                    "proposals", "bid", "bids", "quote", "quotes", "cost",
                    "costs", "price", "pricing", "comparison", "compare",
                    "evaluate", "evaluation", "criteria", "selection",
                    "recommendation", "assessment", "analysis",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "vendor proposal"
                    ["VERB", "NOUN"],  # "compare vendors"
                    ["ADJ", "NOUN"],  # "best vendor"
                    ["NOUN", "VERB", "ADJ"],  # "vendor is better"
                ],
                "entity_types": {"ORG", "MONEY", "PERSON"},
                "question_words": {"which", "who", "what", "how much"},
                "linguistic_indicators": {
                    "has_comparison": True,
                    "has_evaluation_terms": True,
                    "superlative_forms": True,
                },
                "weight": 1.0,
            },
            IntentType.PROCEDURAL: {
                "keywords": {
                    "how", "steps", "step", "process", "procedure", "guide",
                    "tutorial", "walkthrough", "instructions", "setup",
                    "configure", "install", "deploy", "implement", "create",
                    "build", "make", "do",
                },
                "pos_patterns": [
                    ["VERB", "NOUN"],  # "install package"
                    ["VERB", "DET", "NOUN"],  # "setup the system"
                    ["ADV", "VERB"],  # "how configure"
                    ["NOUN", "VERB"],  # "steps install"
                ],
                "entity_types": set(),
                "question_words": {"how", "when", "where"},
                "linguistic_indicators": {
                    "imperative_mood": True,
                    "action_oriented": True,
                    "sequential_indicators": True,
                },
                "weight": 1.0,
            },
            IntentType.INFORMATIONAL: {
                "keywords": {
                    "what", "definition", "meaning", "explain", "overview",
                    "about", "introduction", "basics", "fundamentals",
                    "concept", "concepts", "understand", "learn", "know",
                    "information", "details",
                },
                "pos_patterns": [
                    ["NOUN"],  # "authentication"
                    ["ADJ", "NOUN"],  # "basic concept"
                    ["VERB", "NOUN"],  # "understand API"
                    ["NOUN", "VERB"],  # "concept explains"
                ],
                "entity_types": set(),
                "question_words": {"what", "who", "when", "where"},
                "linguistic_indicators": {
                    "knowledge_seeking": True,
                    "present_tense": True,
                    "general_terms": True,
                },
                "weight": 1.0,
            },
            IntentType.TROUBLESHOOTING: {
                "keywords": {
                    "error", "errors", "problem", "problems", "issue",
                    "issues", "bug", "bugs", "fix", "fixes", "solve",
                    "solution", "solutions", "troubleshoot", "debug",
                    "debugging", "failed", "failing", "broken",
                    "not working", "doesn't work",
                },
                "pos_patterns": [
                    ["NOUN", "VERB"],  # "error occurs"
                    ["VERB", "NOUN"],  # "fix error"
                    ["ADJ", "NOUN"],  # "broken system"
                    ["NOUN", "ADJ"],  # "system broken"
                ],
                "entity_types": set(),
                "question_words": {"why", "how", "what"},
                "linguistic_indicators": {
                    "negative_sentiment": True,
                    "problem_indicators": True,
                    "past_tense": True,
                },
                "weight": 1.0,
            },
            IntentType.EXPLORATORY: {
                "keywords": {
                    "explore", "discover", "find", "search", "browse",
                    "look", "see", "show", "list", "available", "options",
                    "alternatives", "similar", "related", "examples",
                    "samples",
                },
                "pos_patterns": [
                    ["VERB"],  # "explore"
                    ["VERB", "NOUN"],  # "find examples"
                    ["ADJ", "NOUN"],  # "similar tools"
                    ["DET", "NOUN"],  # "some options"
                ],
                "entity_types": set(),
                "question_words": {"what", "which"},
                "linguistic_indicators": {
                    "open_ended": True,
                    "discovery_oriented": True,
                    "broad_scope": True,
                },
                "weight": 0.8,
            },
        }
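
        # Each pattern above feeds _score_intent_patterns below: keywords
        # (40%), POS patterns (25%), entity types (20%), question words (10%),
        # and linguistic indicators (5%), with "weight" applied
        # multiplicatively last; EXPLORATORY is the only intent down-weighted
        # (0.8).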

        # Behavioral pattern recognition
        self.session_patterns = {
            "technical_session": [IntentType.TECHNICAL_LOOKUP, IntentType.PROCEDURAL],
            "business_session": [
                IntentType.BUSINESS_CONTEXT,
                IntentType.VENDOR_EVALUATION,
            ],
            "learning_session": [
                IntentType.INFORMATIONAL,
                IntentType.EXPLORATORY,
                IntentType.PROCEDURAL,
            ],
            "problem_solving": [
                IntentType.TROUBLESHOOTING,
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
        }
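
        # session_patterns are consulted in _apply_behavioral_weighting: once
        # more than half of a pattern's intents appear in the recent intent
        # history, every intent in that pattern receives a proportional boost.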

        # Cache for intent classification results
        self._intent_cache: dict[str, SearchIntent] = {}

        logger.info("Initialized intent classifier with spaCy integration")

    def classify_intent(
        self,
        query: str,
        session_context: dict[str, Any] | None = None,
        behavioral_context: list[str] | None = None,
    ) -> SearchIntent:
        """Classify search intent using comprehensive spaCy analysis."""

        start_time = time.time()

        # Check cache first
        cache_key = f"{query}:{str(session_context)}:{str(behavioral_context)}"
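        # Note: the key serializes both context arguments with str(), so two
        # logically equal session dicts with different key insertion order
        # produce distinct cache entries, and the cache grows without bound
        # until clear_cache() is called.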

        if cache_key in self._intent_cache:
            cached = self._intent_cache[cache_key]
            logger.debug(f"Using cached intent classification for: {query[:50]}...")
            return cached

        # Ensure analyzer is available and valid (extra safety beyond __init__)
        if not hasattr(self.spacy_analyzer, "analyze_query_semantic"):
            raise ImportError(
                "SpaCy analyzer is not initialized correctly. Missing 'analyze_query_semantic'."
            )

        try:
            # Step 1: Perform spaCy semantic analysis
            spacy_analysis = self.spacy_analyzer.analyze_query_semantic(query)

            # Step 2: Extract linguistic features for intent classification
            linguistic_features = self._extract_linguistic_features(
                spacy_analysis, query
            )

            # Step 3: Score each intent type using pattern matching
            intent_scores = self._score_intent_patterns(
                spacy_analysis, linguistic_features, query
            )

            # Step 4: Apply behavioral context weighting
            if behavioral_context:
                intent_scores = self._apply_behavioral_weighting(
                    intent_scores, behavioral_context
                )

            # Step 5: Apply session context boosting
            if session_context:
                intent_scores = self._apply_session_context(
                    intent_scores, session_context
                )

            # Step 6: Determine primary and secondary intents
            primary_intent, confidence = self._select_primary_intent(intent_scores)
            secondary_intents = self._select_secondary_intents(
                intent_scores, primary_intent
            )

            # Step 7: Build supporting evidence
            supporting_evidence = self._build_evidence(
                spacy_analysis, linguistic_features, intent_scores
            )

            # Step 8: Create intent result
            classification_time = (time.time() - start_time) * 1000

            search_intent = SearchIntent(
                intent_type=primary_intent,
                confidence=confidence,
                secondary_intents=secondary_intents,
                supporting_evidence=supporting_evidence,
                linguistic_features=linguistic_features,
                query_complexity=spacy_analysis.complexity_score,
                is_question=spacy_analysis.is_question,
                is_technical=spacy_analysis.is_technical,
                session_context=session_context or {},
                previous_intents=behavioral_context or [],
                classification_time_ms=classification_time,
            )

            # Cache the result
            self._intent_cache[cache_key] = search_intent

            logger.debug(
                f"Classified intent in {classification_time:.2f}ms",
                query_length=len(query),
                primary_intent=primary_intent.value,
                confidence=confidence,
                secondary_count=len(secondary_intents),
            )

            return search_intent

        except Exception as e:
            logger.error(f"Intent classification failed: {e}")
            # Return fallback intent
            classification_time = (time.time() - start_time) * 1000
            return SearchIntent(
                intent_type=IntentType.GENERAL,
                confidence=0.5,
                classification_time_ms=classification_time,
            )

    def _extract_linguistic_features(
        self, spacy_analysis, query: str
    ) -> dict[str, Any]:
        """Extract comprehensive linguistic features for intent classification."""

        features = {
            # Basic query characteristics
            "query_length": len(query.split()),
            "has_question_mark": "?" in query,
            "starts_with_question_word": False,
            "starts_with_verb": False,
            "has_imperative_verbs": False,
            "has_modal_verbs": False,
            # spaCy-derived features
            "entity_count": len(spacy_analysis.entities),
            "concept_count": len(spacy_analysis.main_concepts),
            "keyword_count": len(spacy_analysis.semantic_keywords),
            "pos_diversity": len(set(spacy_analysis.pos_patterns)),
            # Semantic features
            "technical_indicators": 0,
            "business_indicators": 0,
            "procedural_indicators": 0,
            "problem_indicators": 0,
            # Entity type analysis
            "entity_types": [ent[1] for ent in spacy_analysis.entities],
            "has_org_entities": any(ent[1] == "ORG" for ent in spacy_analysis.entities),
            "has_product_entities": any(
                ent[1] == "PRODUCT" for ent in spacy_analysis.entities
            ),
            "has_person_entities": any(
                ent[1] == "PERSON" for ent in spacy_analysis.entities
            ),
            "has_money_entities": any(
                ent[1] == "MONEY" for ent in spacy_analysis.entities
            ),
        }

        # Analyze question word patterns
        question_words = {
            "what", "how", "why", "when", "who", "where", "which", "whose",
        }
        query_lower = query.lower()
        tokens = query_lower.split()
        first_word = tokens[0] if tokens else ""
        features["starts_with_question_word"] = first_word in question_words

        # Count technical, business, and procedural indicators
        technical_terms = {
            "api", "code", "function", "method", "library", "framework",
            "implementation",
        }
        business_terms = {
            "requirements", "objectives", "strategy", "business", "scope",
            "criteria",
        }
        procedural_terms = {
            "how", "steps", "process", "guide", "setup", "install", "configure",
        }
        problem_terms = {
            "error", "problem", "issue", "bug", "fix", "solve", "broken", "failed",
        }

        keywords_lower = [kw.lower() for kw in spacy_analysis.semantic_keywords]
        features["technical_indicators"] = sum(
            1 for term in technical_terms if term in keywords_lower
        )
        features["business_indicators"] = sum(
            1 for term in business_terms if term in keywords_lower
        )
        features["procedural_indicators"] = sum(
            1 for term in procedural_terms if term in keywords_lower
        )
        features["problem_indicators"] = sum(
            1 for term in problem_terms if term in keywords_lower
        )

        # POS pattern analysis
        pos_patterns = spacy_analysis.pos_patterns
        features["starts_with_verb"] = bool(pos_patterns) and pos_patterns[0] == "VERB"
        # Imperative: sentence starts with a verb and does not start with a question word
        features["has_imperative_verbs"] = (
            ("VERB" in pos_patterns)
            and features["starts_with_verb"]
            and not features.get("starts_with_question_word", False)
        )
        features["has_modal_verbs"] = any(
            pos in ["MD", "MODAL"] for pos in pos_patterns
        )

        return features

    def _score_intent_patterns(
        self,
        spacy_analysis,
        linguistic_features: dict[str, Any],
        query: str,
    ) -> dict[IntentType, float]:
        """Score each intent type using pattern matching."""

        intent_scores = {}
        keywords_set = {kw.lower() for kw in spacy_analysis.semantic_keywords}

        for intent_type, pattern in self.intent_patterns.items():
            score = 0.0

            # 1. Keyword matching (40% weight)
            keyword_matches = len(keywords_set.intersection(pattern["keywords"]))
            keyword_score = keyword_matches / max(len(pattern["keywords"]), 1)
            score += keyword_score * 0.4

            # 2. POS pattern matching (25% weight)
            pos_score = self._match_pos_patterns(
                spacy_analysis.pos_patterns, pattern["pos_patterns"]
            )
            score += pos_score * 0.25

            # 3. Entity type matching (20% weight)
            entity_score = self._match_entity_types(
                spacy_analysis.entities, pattern["entity_types"]
            )
            score += entity_score * 0.20

            # 4. Question word matching (10% weight)
            question_score = self._match_question_words(
                query, pattern["question_words"]
            )
            score += question_score * 0.10

            # 5. Linguistic indicator bonus (5% weight)
            indicator_score = self._match_linguistic_indicators(
                linguistic_features, pattern.get("linguistic_indicators", {})
            )
            score += indicator_score * 0.05

            # Apply pattern weight
            score *= pattern.get("weight", 1.0)

            intent_scores[intent_type] = score

        return intent_scores
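
    # The component weights above sum to 1.0, so each per-component score is
    # in [0, 1] and a pattern's raw score is bounded by its "weight"; full
    # keyword overlap alone, for example, contributes at most 0.4.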

    def _match_pos_patterns(
        self, query_pos: list[str], target_patterns: list[list[str]]
    ) -> float:
        """Match POS tag patterns in the query."""
        if not target_patterns or not query_pos:
            return 0.0

        matches = 0
        total_patterns = len(target_patterns)

        for pattern in target_patterns:
            if self._contains_pos_sequence(query_pos, pattern):
                matches += 1

        return matches / total_patterns

    def _contains_pos_sequence(self, pos_tags: list[str], sequence: list[str]) -> bool:
        """Check if POS sequence exists in the query."""
        if len(sequence) > len(pos_tags):
            return False

        for i in range(len(pos_tags) - len(sequence) + 1):
            if pos_tags[i : i + len(sequence)] == sequence:
                return True

        return False
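
    # Example: _contains_pos_sequence(["VERB", "DET", "NOUN"], ["DET", "NOUN"])
    # is True (contiguous match starting at index 1), whereas the reversed
    # sequence ["NOUN", "DET"] would not match.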

    def _match_entity_types(
        self, query_entities: list[tuple[str, str]], target_types: set[str]
    ) -> float:
        """Match entity types in the query."""
        if not target_types:
            return 0.0

        query_entity_types = {ent[1] for ent in query_entities}
        matches = len(query_entity_types.intersection(target_types))

        return matches / len(target_types)

    def _match_question_words(self, query: str, target_words: set[str]) -> float:
        """Match question words (including multi-word phrases) in the query."""
        if not target_words:
            return 0.0

        query_lower = query.lower()
        query_words = set(query_lower.split())
        # Single tokens match whole words; multi-word phrases such as
        # "how much" are matched as substrings, since whitespace splitting
        # can never yield a bigram token.
        matches = sum(
            1
            for word in target_words
            if word in query_words or (" " in word and word in query_lower)
        )

        return matches / len(target_words)

    def _match_linguistic_indicators(
        self, features: dict[str, Any], target_indicators: dict[str, Any]
    ) -> float:
        """Match linguistic indicators."""
        if not target_indicators:
            return 0.0

        score = 0.0
        total_indicators = len(target_indicators)

        for indicator, expected_value in target_indicators.items():
            if indicator in features:
                if isinstance(expected_value, bool):
                    if features[indicator] == expected_value:
                        score += 1.0
                elif isinstance(expected_value, int | float):
                    # For numeric indicators, use magnitude-aware similarity
                    actual_value = features.get(indicator, 0)
                    if isinstance(actual_value, int | float):
                        denom = max(abs(expected_value), abs(actual_value), 1.0)
                        similarity = 1.0 - abs(actual_value - expected_value) / denom
                        score += max(0.0, similarity)

        return score / max(total_indicators, 1)
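
    # Worked example: for an expected "technical_complexity" of 0.6 and an
    # actual value of 0.3, denom = max(0.6, 0.3, 1.0) = 1.0, so the
    # similarity contribution is 1.0 - |0.3 - 0.6| / 1.0 = 0.7.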

    def _apply_behavioral_weighting(
        self, intent_scores: dict[IntentType, float], behavioral_context: list[str]
    ) -> dict[IntentType, float]:
        """Apply behavioral context weighting to intent scores."""

        if not behavioral_context:
            return intent_scores

        # Convert string intents to IntentType
        previous_intents = []
        for intent_str in behavioral_context[-5:]:  # Last 5 intents
            try:
                previous_intents.append(IntentType(intent_str))
            except ValueError:
                continue

        if not previous_intents:
            return intent_scores

        weighted_scores = intent_scores.copy()

        # Boost scores for intents that commonly follow previous intents
        intent_transitions = {
            IntentType.INFORMATIONAL: [
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.TECHNICAL_LOOKUP: [
                IntentType.PROCEDURAL,
                IntentType.TROUBLESHOOTING,
            ],
            IntentType.BUSINESS_CONTEXT: [
                IntentType.VENDOR_EVALUATION,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.VENDOR_EVALUATION: [
                IntentType.BUSINESS_CONTEXT,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.PROCEDURAL: [
                IntentType.TROUBLESHOOTING,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.TROUBLESHOOTING: [
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
        }

        most_recent_intent = previous_intents[-1]
        likely_next_intents = intent_transitions.get(most_recent_intent, [])

        for intent_type in likely_next_intents:
            if intent_type in weighted_scores:
                weighted_scores[intent_type] *= 1.2  # 20% boost

        # Apply session pattern recognition
        for _pattern_name, pattern_intents in self.session_patterns.items():
            pattern_match_score = sum(
                1 for intent in previous_intents if intent in pattern_intents
            ) / len(pattern_intents)

            if pattern_match_score > 0.5:  # More than half of pattern matched
                for intent_type in pattern_intents:
                    if intent_type in weighted_scores:
                        weighted_scores[intent_type] *= 1.0 + pattern_match_score * 0.3

        return weighted_scores
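
    # Example: if the most recent intent was TROUBLESHOOTING, both PROCEDURAL
    # and TECHNICAL_LOOKUP receive the 20% transition boost before the
    # session-pattern boosts are applied.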

    def _apply_session_context(
        self, intent_scores: dict[IntentType, float], session_context: dict[str, Any]
    ) -> dict[IntentType, float]:
        """Apply session context to intent scores."""

        weighted_scores = intent_scores.copy()

        # Apply domain context boosting
        domain = session_context.get("domain", "")
        if domain == "technical":
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.3
            weighted_scores[IntentType.PROCEDURAL] *= 1.2
        elif domain == "business":
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.3
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.2

        # Apply user role context
        user_role = session_context.get("user_role", "")
        if user_role in ["developer", "engineer", "architect"]:
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.2
            weighted_scores[IntentType.PROCEDURAL] *= 1.1
        elif user_role in ["manager", "analyst", "consultant"]:
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.2
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.1

        # Apply urgency context
        urgency = session_context.get("urgency", "normal")
        if urgency == "high":
            weighted_scores[IntentType.TROUBLESHOOTING] *= 1.4
            weighted_scores[IntentType.PROCEDURAL] *= 1.2

        return weighted_scores

    def _select_primary_intent(
        self, intent_scores: dict[IntentType, float]
    ) -> tuple[IntentType, float]:
        """Select the primary intent with highest confidence."""

        if not intent_scores:
            return IntentType.GENERAL, 0.5

        # Find the highest scoring intent
        primary_intent = max(intent_scores, key=intent_scores.get)
        raw_score = intent_scores[primary_intent]

        # Normalize confidence score
        total_score = sum(intent_scores.values())
        confidence = raw_score / max(total_score, 1.0)

        # Apply confidence threshold
        if confidence < 0.3:
            return IntentType.GENERAL, confidence

        return primary_intent, confidence
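
    # Example: with scores {TECHNICAL_LOOKUP: 0.5, PROCEDURAL: 0.3,
    # INFORMATIONAL: 0.2}, the total is 1.0 and TECHNICAL_LOOKUP wins with
    # confidence 0.5; had the normalized confidence fallen below 0.3,
    # GENERAL would be returned instead.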

    def _select_secondary_intents(
        self, intent_scores: dict[IntentType, float], primary_intent: IntentType
    ) -> list[tuple[IntentType, float]]:
        """Select secondary intents with meaningful confidence."""

        secondary_intents = []

        # If primary intent is GENERAL (fallback), don't calculate secondary intents
        if primary_intent == IntentType.GENERAL or primary_intent not in intent_scores:
            return secondary_intents

        # Sort intents by score, excluding primary
        sorted_intents = sorted(
            [
                (intent, score)
                for intent, score in intent_scores.items()
                if intent != primary_intent
            ],
            key=lambda x: x[1],
            reverse=True,
        )

        # Include intents with score > 30% of primary intent score
        primary_score = intent_scores[primary_intent]
        threshold = primary_score * 0.3

        for intent, score in sorted_intents[:3]:  # Max 3 secondary intents
            if score >= threshold:
                confidence = score / max(sum(intent_scores.values()), 1.0)
                secondary_intents.append((intent, confidence))

        return secondary_intents
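
    # Example: with a primary score of 0.5 the threshold is 0.15, so at most
    # three other intents scoring 0.15 or higher are reported, each with a
    # confidence normalized against the sum of all scores.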

    def _build_evidence(
        self,
        spacy_analysis,
        linguistic_features: dict[str, Any],
        intent_scores: dict[IntentType, float],
    ) -> dict[str, Any]:
        """Build supporting evidence for the intent classification."""

        return {
            "spacy_processing_time": spacy_analysis.processing_time_ms,
            "query_complexity": spacy_analysis.complexity_score,
            "semantic_keywords": spacy_analysis.semantic_keywords[:5],  # Top 5
            "extracted_entities": [ent[0] for ent in spacy_analysis.entities[:3]],  # Top 3
            "main_concepts": spacy_analysis.main_concepts[:3],  # Top 3
            "intent_signals": spacy_analysis.intent_signals,
            "linguistic_features": {
                "technical_indicators": linguistic_features.get(
                    "technical_indicators", 0
                ),
                "business_indicators": linguistic_features.get(
                    "business_indicators", 0
                ),
                "procedural_indicators": linguistic_features.get(
                    "procedural_indicators", 0
                ),
                "problem_indicators": linguistic_features.get("problem_indicators", 0),
            },
            "top_intent_scores": dict(
                sorted(intent_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            ),
        }

    def clear_cache(self):
        """Clear intent classification cache."""
        self._intent_cache.clear()
        logger.debug("Cleared intent classification cache")

    def get_cache_stats(self) -> dict[str, int]:
        """Get cache statistics."""
        return {
            "intent_cache_size": len(self._intent_cache),
        }
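
# Usage sketch (illustrative, not part of the module): assumes the optional
# spaCy dependency is installed so the default SpaCyQueryAnalyzer can be
# constructed, as the __init__ error messages suggest.
#
#     analyzer = SpaCyQueryAnalyzer()
#     classifier = IntentClassifier(analyzer)
#     intent = classifier.classify_intent(
#         "how do I configure the search API?",
#         session_context={"domain": "technical", "user_role": "developer"},
#     )
#     print(intent.intent_type, intent.confidence, intent.secondary_intents)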