Coverage for src/qdrant_loader_mcp_server/search/enhanced/intent/classifier.py: 91%
211 statements
coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2Intent Classification Engine for Search Enhancement.
4This module implements the main IntentClassifier that uses spaCy analysis and
5behavioral patterns to classify search intents with high accuracy.
6"""
8from __future__ import annotations
10import time
11from typing import TYPE_CHECKING, Any
13from ....utils.logging import LoggingConfig
14from .models import IntentType, SearchIntent
16_SPACY_IMPORT_ERROR: BaseException | None = None
18if TYPE_CHECKING:
19 from ...nlp.spacy_analyzer import QueryAnalysis, SpaCyQueryAnalyzer
20else:
21 try:
22 from ...nlp.spacy_analyzer import QueryAnalysis, SpaCyQueryAnalyzer
23 except (
24 ImportError,
25 ModuleNotFoundError,
26 ) as _exc: # pragma: no cover - optional dep
27 # Provide safe sentinels for runtime to avoid NameErrors in annotations
28 QueryAnalysis = Any # type: ignore[assignment]
29 SpaCyQueryAnalyzer = Any # type: ignore[assignment]
30 _SPACY_IMPORT_ERROR = _exc
32logger = LoggingConfig.get_logger(__name__)


class IntentClassifier:
    """Advanced intent classification using spaCy analysis and behavioral patterns."""

    def __init__(self, spacy_analyzer):
        """Initialize the intent classifier.

        The constructor validates that the spaCy analyzer dependency is available.
        If a valid analyzer instance is not provided, it raises an ImportError with
        actionable guidance so callers fail fast rather than encountering
        None-attribute errors later.
        """
        if spacy_analyzer is None:
            # Do not perform ad-hoc imports here; require explicit injection
            if _SPACY_IMPORT_ERROR is not None:
                raise ImportError(
                    "SpaCyQueryAnalyzer is not available. Install optional NLP deps (spacy and model) "
                    "and provide an initialized analyzer instance to IntentClassifier."
                ) from _SPACY_IMPORT_ERROR
            raise ImportError(
                "A spaCy analyzer instance must be provided to IntentClassifier. "
                "Use SpaCyQueryAnalyzer() and pass it explicitly."
            )
        self.spacy_analyzer = spacy_analyzer

        # Final sanity check to fail fast if analyzer is misconfigured
        if not hasattr(self.spacy_analyzer, "analyze_query_semantic"):
            raise ImportError(
                "Provided spaCy analyzer does not implement 'analyze_query_semantic'. "
                "Pass a compatible analyzer or install the default SpaCyQueryAnalyzer."
            )
        self.logger = LoggingConfig.get_logger(__name__)

        # Intent classification patterns using spaCy linguistic features
        self.intent_patterns = {
            IntentType.TECHNICAL_LOOKUP: {
                "keywords": {
                    "api", "apis", "endpoint", "endpoints", "function", "functions",
                    "method", "methods", "class", "classes", "library", "libraries",
                    "framework", "frameworks", "code", "implementation", "syntax",
                    "documentation", "docs", "reference", "specification", "protocol",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "API documentation"
                    ["ADJ", "NOUN"],  # "REST API"
                    ["VERB", "NOUN"],  # "implement authentication"
                    ["NOUN", "VERB"],  # "code example"
                ],
                "entity_types": {"PRODUCT", "ORG", "LANGUAGE"},
                "question_words": {"how", "what"},
                "linguistic_indicators": {
                    "has_code_terms": True,
                    "technical_complexity": 0.6,
                    "verb_imperative": True,
                },
                "weight": 1.0,
            },
            IntentType.BUSINESS_CONTEXT: {
                "keywords": {
                    "requirements", "requirement", "objectives", "objective", "goals",
                    "goal", "strategy", "strategies", "business", "scope",
                    "stakeholder", "stakeholders", "budget", "timeline", "deliverable",
                    "deliverables", "milestone", "criteria", "specification",
                    "specifications", "priority", "priorities",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "business requirements"
                    ["ADJ", "NOUN"],  # "functional requirements"
                    ["MODAL", "VERB"],  # "should implement"
                    ["DET", "NOUN", "VERB"],  # "the system should"
                ],
                "entity_types": {"ORG", "MONEY", "PERCENT", "CARDINAL"},
                "question_words": {"what", "why", "which"},
                "linguistic_indicators": {
                    "has_business_terms": True,
                    "formal_language": True,
                    "future_tense": True,
                },
                "weight": 1.0,
            },
            IntentType.VENDOR_EVALUATION: {
                "keywords": {
                    "vendor", "vendors", "supplier", "suppliers", "proposal",
                    "proposals", "bid", "bids", "quote", "quotes", "cost", "costs",
                    "price", "pricing", "comparison", "compare", "evaluate",
                    "evaluation", "criteria", "selection", "recommendation",
                    "assessment", "analysis",
                },
                "pos_patterns": [
                    ["NOUN", "NOUN"],  # "vendor proposal"
                    ["VERB", "NOUN"],  # "compare vendors"
                    ["ADJ", "NOUN"],  # "best vendor"
                    ["NOUN", "VERB", "ADJ"],  # "vendor is better"
                ],
                "entity_types": {"ORG", "MONEY", "PERSON"},
                "question_words": {"which", "who", "what", "how much"},
                "linguistic_indicators": {
                    "has_comparison": True,
                    "has_evaluation_terms": True,
                    "superlative_forms": True,
                },
                "weight": 1.0,
            },
            IntentType.PROCEDURAL: {
                "keywords": {
                    "how", "steps", "step", "process", "procedure", "guide",
                    "tutorial", "walkthrough", "instructions", "setup", "configure",
                    "install", "deploy", "implement", "create", "build", "make", "do",
                },
                "pos_patterns": [
                    ["VERB", "NOUN"],  # "install package"
                    ["VERB", "DET", "NOUN"],  # "setup the system"
                    ["ADV", "VERB"],  # "how configure"
                    ["NOUN", "VERB"],  # "steps install"
                ],
                "entity_types": set(),
                "question_words": {"how", "when", "where"},
                "linguistic_indicators": {
                    "imperative_mood": True,
                    "action_oriented": True,
                    "sequential_indicators": True,
                },
                "weight": 1.0,
            },
            IntentType.INFORMATIONAL: {
                "keywords": {
                    "what", "definition", "meaning", "explain", "overview", "about",
                    "introduction", "basics", "fundamentals", "concept", "concepts",
                    "understand", "learn", "know", "information", "details",
                },
                "pos_patterns": [
                    ["NOUN"],  # "authentication"
                    ["ADJ", "NOUN"],  # "basic concept"
                    ["VERB", "NOUN"],  # "understand API"
                    ["NOUN", "VERB"],  # "concept explains"
                ],
                "entity_types": set(),
                "question_words": {"what", "who", "when", "where"},
                "linguistic_indicators": {
                    "knowledge_seeking": True,
                    "present_tense": True,
                    "general_terms": True,
                },
                "weight": 1.0,
            },
            IntentType.TROUBLESHOOTING: {
                "keywords": {
                    "error", "errors", "problem", "problems", "issue", "issues",
                    "bug", "bugs", "fix", "fixes", "solve", "solution", "solutions",
                    "troubleshoot", "debug", "debugging", "failed", "failing",
                    "broken", "not working", "doesn't work",
                },
                "pos_patterns": [
                    ["NOUN", "VERB"],  # "error occurs"
                    ["VERB", "NOUN"],  # "fix error"
                    ["ADJ", "NOUN"],  # "broken system"
                    ["NOUN", "ADJ"],  # "system broken"
                ],
                "entity_types": set(),
                "question_words": {"why", "how", "what"},
                "linguistic_indicators": {
                    "negative_sentiment": True,
                    "problem_indicators": True,
                    "past_tense": True,
                },
                "weight": 1.0,
            },
            IntentType.EXPLORATORY: {
                "keywords": {
                    "explore", "discover", "find", "search", "browse", "look", "see",
                    "show", "list", "available", "options", "alternatives", "similar",
                    "related", "examples", "samples",
                },
                "pos_patterns": [
                    ["VERB"],  # "explore"
                    ["VERB", "NOUN"],  # "find examples"
                    ["ADJ", "NOUN"],  # "similar tools"
                    ["DET", "NOUN"],  # "some options"
                ],
                "entity_types": set(),
                "question_words": {"what", "which"},
                "linguistic_indicators": {
                    "open_ended": True,
                    "discovery_oriented": True,
                    "broad_scope": True,
                },
                "weight": 0.8,
            },
        }

        # Behavioral pattern recognition
        self.session_patterns = {
            "technical_session": [IntentType.TECHNICAL_LOOKUP, IntentType.PROCEDURAL],
            "business_session": [
                IntentType.BUSINESS_CONTEXT,
                IntentType.VENDOR_EVALUATION,
            ],
            "learning_session": [
                IntentType.INFORMATIONAL,
                IntentType.EXPLORATORY,
                IntentType.PROCEDURAL,
            ],
            "problem_solving": [
                IntentType.TROUBLESHOOTING,
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
        }

        # Cache for intent classification results
        self._intent_cache: dict[str, SearchIntent] = {}

        logger.info("Initialized intent classifier with spaCy integration")
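
    # Illustrative construction (a sketch, not part of this module; assumes
    # the optional spaCy dependency and its model are installed):
    #
    #     from ...nlp.spacy_analyzer import SpaCyQueryAnalyzer
    #     classifier = IntentClassifier(SpaCyQueryAnalyzer())
    #
    # Passing None, or an object without 'analyze_query_semantic', raises
    # ImportError immediately per the fail-fast contract enforced in __init__.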

    def classify_intent(
        self,
        query: str,
        session_context: dict[str, Any] | None = None,
        behavioral_context: list[str] | None = None,
    ) -> SearchIntent:
        """Classify search intent using comprehensive spaCy analysis."""

        start_time = time.time()

        # Check cache first
        cache_key = f"{query}:{str(session_context)}:{str(behavioral_context)}"
        if cache_key in self._intent_cache:
            cached = self._intent_cache[cache_key]
            logger.debug(f"Using cached intent classification for: {query[:50]}...")
            return cached

        # Ensure analyzer is available and valid (extra safety beyond __init__)
        if not hasattr(self.spacy_analyzer, "analyze_query_semantic"):
            raise ImportError(
                "SpaCy analyzer is not initialized correctly. Missing 'analyze_query_semantic'."
            )

        try:
            # Step 1: Perform spaCy semantic analysis
            spacy_analysis = self.spacy_analyzer.analyze_query_semantic(query)

            # Step 2: Extract linguistic features for intent classification
            linguistic_features = self._extract_linguistic_features(
                spacy_analysis, query
            )

            # Step 3: Score each intent type using pattern matching
            intent_scores = self._score_intent_patterns(
                spacy_analysis, linguistic_features, query
            )

            # Step 4: Apply behavioral context weighting
            if behavioral_context:
                intent_scores = self._apply_behavioral_weighting(
                    intent_scores, behavioral_context
                )

            # Step 5: Apply session context boosting
            if session_context:
                intent_scores = self._apply_session_context(
                    intent_scores, session_context
                )

            # Step 6: Determine primary and secondary intents
            primary_intent, confidence = self._select_primary_intent(intent_scores)
            secondary_intents = self._select_secondary_intents(
                intent_scores, primary_intent
            )

            # Step 7: Build supporting evidence
            supporting_evidence = self._build_evidence(
                spacy_analysis, linguistic_features, intent_scores
            )

            # Step 8: Create intent result
            classification_time = (time.time() - start_time) * 1000

            search_intent = SearchIntent(
                intent_type=primary_intent,
                confidence=confidence,
                secondary_intents=secondary_intents,
                supporting_evidence=supporting_evidence,
                linguistic_features=linguistic_features,
                query_complexity=spacy_analysis.complexity_score,
                is_question=spacy_analysis.is_question,
                is_technical=spacy_analysis.is_technical,
                session_context=session_context or {},
                previous_intents=behavioral_context or [],
                classification_time_ms=classification_time,
            )

            # Cache the result
            self._intent_cache[cache_key] = search_intent

            logger.debug(
                f"Classified intent in {classification_time:.2f}ms",
                query_length=len(query),
                primary_intent=primary_intent.value,
                confidence=confidence,
                secondary_count=len(secondary_intents),
            )

            return search_intent

        except Exception as e:
            logger.error(f"Intent classification failed: {e}")
            # Return fallback intent
            classification_time = (time.time() - start_time) * 1000
            return SearchIntent(
                intent_type=IntentType.GENERAL,
                confidence=0.5,
                classification_time_ms=classification_time,
            )
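
    # Example call (hypothetical inputs; assumes IntentType values serialize
    # to lowercase strings such as "informational"):
    #
    #     intent = classifier.classify_intent(
    #         "how do I fix this authentication error?",
    #         session_context={"domain": "technical", "urgency": "high"},
    #         behavioral_context=["informational", "technical_lookup"],
    #     )
    #
    # A troubleshooting-flavored query like this would typically resolve to
    # IntentType.TROUBLESHOOTING, with confidence normalized against the sum
    # of all intent scores (see _select_primary_intent below).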

    def _extract_linguistic_features(
        self, spacy_analysis, query: str
    ) -> dict[str, Any]:
        """Extract comprehensive linguistic features for intent classification."""

        features = {
            # Basic query characteristics
            "query_length": len(query.split()),
            "has_question_mark": "?" in query,
            "starts_with_question_word": False,
            "starts_with_verb": False,
            "has_imperative_verbs": False,
            "has_modal_verbs": False,
            # spaCy-derived features
            "entity_count": len(spacy_analysis.entities),
            "concept_count": len(spacy_analysis.main_concepts),
            "keyword_count": len(spacy_analysis.semantic_keywords),
            "pos_diversity": len(set(spacy_analysis.pos_patterns)),
            # Semantic features
            "technical_indicators": 0,
            "business_indicators": 0,
            "procedural_indicators": 0,
            "problem_indicators": 0,
            # Entity type analysis
            "entity_types": [ent[1] for ent in spacy_analysis.entities],
            "has_org_entities": any(ent[1] == "ORG" for ent in spacy_analysis.entities),
            "has_product_entities": any(
                ent[1] == "PRODUCT" for ent in spacy_analysis.entities
            ),
            "has_person_entities": any(
                ent[1] == "PERSON" for ent in spacy_analysis.entities
            ),
            "has_money_entities": any(
                ent[1] == "MONEY" for ent in spacy_analysis.entities
            ),
        }

        # Analyze question word patterns
        question_words = {"what", "how", "why", "when", "who", "where", "which", "whose"}
        query_lower = query.lower()
        first_word = query_lower.split()[0] if query_lower.split() else ""
        features["starts_with_question_word"] = first_word in question_words

        # Count technical, business, and procedural indicators
        technical_terms = {
            "api", "code", "function", "method", "library", "framework", "implementation",
        }
        business_terms = {
            "requirements", "objectives", "strategy", "business", "scope", "criteria",
        }
        procedural_terms = {
            "how", "steps", "process", "guide", "setup", "install", "configure",
        }
        problem_terms = {
            "error", "problem", "issue", "bug", "fix", "solve", "broken", "failed",
        }

        keywords_lower = [kw.lower() for kw in spacy_analysis.semantic_keywords]
        features["technical_indicators"] = sum(
            1 for term in technical_terms if term in keywords_lower
        )
        features["business_indicators"] = sum(
            1 for term in business_terms if term in keywords_lower
        )
        features["procedural_indicators"] = sum(
            1 for term in procedural_terms if term in keywords_lower
        )
        features["problem_indicators"] = sum(
            1 for term in problem_terms if term in keywords_lower
        )

        # POS pattern analysis
        pos_patterns = spacy_analysis.pos_patterns
        features["starts_with_verb"] = bool(pos_patterns) and pos_patterns[0] == "VERB"
        # Imperative: sentence starts with a verb and does not start with a question word
        features["has_imperative_verbs"] = (
            ("VERB" in pos_patterns)
            and features["starts_with_verb"]
            and not features.get("starts_with_question_word", False)
        )
        features["has_modal_verbs"] = any(pos in ["MD", "MODAL"] for pos in pos_patterns)

        return features
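
    # Worked example: if semantic_keywords were ["api", "error", "fix"], the
    # counters above would be technical_indicators=1 ("api"),
    # problem_indicators=2 ("error", "fix"), and zero for the business and
    # procedural buckets.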

    def _score_intent_patterns(
        self,
        spacy_analysis,
        linguistic_features: dict[str, Any],
        query: str,
    ) -> dict[IntentType, float]:
        """Score each intent type using pattern matching."""

        intent_scores = {}
        keywords_set = {kw.lower() for kw in spacy_analysis.semantic_keywords}

        for intent_type, pattern in self.intent_patterns.items():
            score = 0.0

            # 1. Keyword matching (40% weight)
            keyword_matches = len(keywords_set.intersection(pattern["keywords"]))
            keyword_score = keyword_matches / max(len(pattern["keywords"]), 1)
            score += keyword_score * 0.4

            # 2. POS pattern matching (25% weight)
            pos_score = self._match_pos_patterns(
                spacy_analysis.pos_patterns, pattern["pos_patterns"]
            )
            score += pos_score * 0.25

            # 3. Entity type matching (20% weight)
            entity_score = self._match_entity_types(
                spacy_analysis.entities, pattern["entity_types"]
            )
            score += entity_score * 0.20

            # 4. Question word matching (10% weight)
            question_score = self._match_question_words(query, pattern["question_words"])
            score += question_score * 0.10

            # 5. Linguistic indicator bonus (5% weight)
            indicator_score = self._match_linguistic_indicators(
                linguistic_features, pattern.get("linguistic_indicators", {})
            )
            score += indicator_score * 0.05

            # Apply pattern weight
            score *= pattern.get("weight", 1.0)

            intent_scores[intent_type] = score

        return intent_scores
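
    # Worked example with hypothetical match counts: 3 of 22 TECHNICAL_LOOKUP
    # keywords (~0.136), 2 of 4 POS patterns (0.5), 1 of 3 entity types
    # (~0.333), 1 of 2 question words (0.5), and an indicator score of 0.5
    # combine as:
    #     0.136 * 0.40 + 0.5 * 0.25 + 0.333 * 0.20 + 0.5 * 0.10 + 0.5 * 0.05
    #     ≈ 0.054 + 0.125 + 0.067 + 0.050 + 0.025 ≈ 0.32 (before the 1.0 weight)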

    def _match_pos_patterns(
        self, query_pos: list[str], target_patterns: list[list[str]]
    ) -> float:
        """Match POS tag patterns in the query."""
        if not target_patterns or not query_pos:
            return 0.0

        matches = 0
        total_patterns = len(target_patterns)

        for pattern in target_patterns:
            if self._contains_pos_sequence(query_pos, pattern):
                matches += 1

        return matches / total_patterns

    def _contains_pos_sequence(self, pos_tags: list[str], sequence: list[str]) -> bool:
        """Check if POS sequence exists in the query."""
        if len(sequence) > len(pos_tags):
            return False

        for i in range(len(pos_tags) - len(sequence) + 1):
            if pos_tags[i : i + len(sequence)] == sequence:
                return True

        return False
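
    # For example, _contains_pos_sequence(["ADV", "VERB", "DET", "NOUN"],
    # ["VERB", "DET"]) is True (the window starting at index 1 matches), while
    # ["DET", "VERB"] is False because order matters in the sliding window.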

    def _match_entity_types(
        self, query_entities: list[tuple[str, str]], target_types: set[str]
    ) -> float:
        """Match entity types in the query."""
        if not target_types:
            return 0.0

        query_entity_types = {ent[1] for ent in query_entities}
        matches = len(query_entity_types.intersection(target_types))

        return matches / len(target_types)
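
    # Example (hypothetical entities): [("Qdrant", "ORG")] scored against
    # target types {"ORG", "MONEY", "PERSON"} yields 1/3 ≈ 0.33.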

    def _match_question_words(self, query: str, target_words: set[str]) -> float:
        """Match question words in the query."""
        if not target_words:
            return 0.0

        query_words = set(query.lower().split())
        matches = len(query_words.intersection(target_words))

        return matches / len(target_words)

    def _match_linguistic_indicators(
        self, features: dict[str, Any], target_indicators: dict[str, Any]
    ) -> float:
        """Match linguistic indicators."""
        if not target_indicators:
            return 0.0

        score = 0.0
        total_indicators = len(target_indicators)

        for indicator, expected_value in target_indicators.items():
            if indicator in features:
                if isinstance(expected_value, bool):
                    if features[indicator] == expected_value:
                        score += 1.0
                elif isinstance(expected_value, int | float):
                    # For numeric indicators, use magnitude-aware similarity
                    actual_value = features.get(indicator, 0)
                    if isinstance(actual_value, int | float):
                        denom = max(abs(expected_value), abs(actual_value), 1.0)
                        similarity = 1.0 - abs(actual_value - expected_value) / denom
                        score += max(0.0, similarity)

        return score / max(total_indicators, 1)
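
    # Numeric similarity example: with expected technical_complexity=0.6 and
    # an actual feature value of 0.9, denom = max(0.6, 0.9, 1.0) = 1.0, so
    # this indicator contributes 1.0 - |0.9 - 0.6| / 1.0 = 0.7 before the
    # final average over total_indicators.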

    def _apply_behavioral_weighting(
        self, intent_scores: dict[IntentType, float], behavioral_context: list[str]
    ) -> dict[IntentType, float]:
        """Apply behavioral context weighting to intent scores."""

        if not behavioral_context:
            return intent_scores

        # Convert string intents to IntentType
        previous_intents = []
        for intent_str in behavioral_context[-5:]:  # Last 5 intents
            try:
                previous_intents.append(IntentType(intent_str))
            except ValueError:
                continue

        if not previous_intents:
            return intent_scores

        weighted_scores = intent_scores.copy()

        # Boost scores for intents that commonly follow previous intents
        intent_transitions = {
            IntentType.INFORMATIONAL: [
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.TECHNICAL_LOOKUP: [
                IntentType.PROCEDURAL,
                IntentType.TROUBLESHOOTING,
            ],
            IntentType.BUSINESS_CONTEXT: [
                IntentType.VENDOR_EVALUATION,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.VENDOR_EVALUATION: [
                IntentType.BUSINESS_CONTEXT,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.PROCEDURAL: [
                IntentType.TROUBLESHOOTING,
                IntentType.TECHNICAL_LOOKUP,
            ],
            IntentType.TROUBLESHOOTING: [
                IntentType.PROCEDURAL,
                IntentType.TECHNICAL_LOOKUP,
            ],
        }

        most_recent_intent = previous_intents[-1]
        likely_next_intents = intent_transitions.get(most_recent_intent, [])

        for intent_type in likely_next_intents:
            if intent_type in weighted_scores:
                weighted_scores[intent_type] *= 1.2  # 20% boost

        # Apply session pattern recognition
        for _pattern_name, pattern_intents in self.session_patterns.items():
            pattern_match_score = sum(
                1 for intent in previous_intents if intent in pattern_intents
            ) / len(pattern_intents)

            if pattern_match_score > 0.5:  # More than half of pattern matched
                for intent_type in pattern_intents:
                    if intent_type in weighted_scores:
                        weighted_scores[intent_type] *= 1.0 + pattern_match_score * 0.3

        return weighted_scores
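
    # Example: if the most recent previous intent was TROUBLESHOOTING, then
    # PROCEDURAL and TECHNICAL_LOOKUP each get the 1.2x transition boost; if,
    # in addition, 2 of the 3 "problem_solving" pattern intents appeared
    # recently (match score 2/3 > 0.5), those three intents are further
    # multiplied by 1.0 + (2/3) * 0.3 = 1.2.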

    def _apply_session_context(
        self, intent_scores: dict[IntentType, float], session_context: dict[str, Any]
    ) -> dict[IntentType, float]:
        """Apply session context to intent scores."""

        weighted_scores = intent_scores.copy()

        # Apply domain context boosting
        domain = session_context.get("domain", "")
        if domain == "technical":
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.3
            weighted_scores[IntentType.PROCEDURAL] *= 1.2
        elif domain == "business":
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.3
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.2

        # Apply user role context
        user_role = session_context.get("user_role", "")
        if user_role in ["developer", "engineer", "architect"]:
            weighted_scores[IntentType.TECHNICAL_LOOKUP] *= 1.2
            weighted_scores[IntentType.PROCEDURAL] *= 1.1
        elif user_role in ["manager", "analyst", "consultant"]:
            weighted_scores[IntentType.BUSINESS_CONTEXT] *= 1.2
            weighted_scores[IntentType.VENDOR_EVALUATION] *= 1.1

        # Apply urgency context
        urgency = session_context.get("urgency", "normal")
        if urgency == "high":
            weighted_scores[IntentType.TROUBLESHOOTING] *= 1.4
            weighted_scores[IntentType.PROCEDURAL] *= 1.2

        return weighted_scores
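
    # Example: session_context={"domain": "technical", "user_role":
    # "developer", "urgency": "high"} compounds to 1.3 * 1.2 = 1.56x on
    # TECHNICAL_LOOKUP, 1.2 * 1.1 * 1.2 ≈ 1.58x on PROCEDURAL, and 1.4x on
    # TROUBLESHOOTING.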

    def _select_primary_intent(
        self, intent_scores: dict[IntentType, float]
    ) -> tuple[IntentType, float]:
        """Select the primary intent with highest confidence."""

        if not intent_scores:
            return IntentType.GENERAL, 0.5

        # Find the highest scoring intent
        primary_intent = max(intent_scores, key=intent_scores.get)
        raw_score = intent_scores[primary_intent]

        # Normalize confidence score
        total_score = sum(intent_scores.values())
        confidence = raw_score / max(total_score, 1.0)

        # Apply confidence threshold
        if confidence < 0.3:
            return IntentType.GENERAL, confidence

        return primary_intent, confidence
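
    # Example: with scores {TECHNICAL_LOOKUP: 0.6, PROCEDURAL: 0.4,
    # INFORMATIONAL: 0.2}, the divisor is max(1.2, 1.0) = 1.2, so confidence
    # = 0.6 / 1.2 = 0.5 and TECHNICAL_LOOKUP wins; any result below the 0.3
    # confidence threshold falls back to IntentType.GENERAL instead.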

    def _select_secondary_intents(
        self, intent_scores: dict[IntentType, float], primary_intent: IntentType
    ) -> list[tuple[IntentType, float]]:
        """Select secondary intents with meaningful confidence."""

        secondary_intents = []

        # If primary intent is GENERAL (fallback), don't calculate secondary intents
        if primary_intent == IntentType.GENERAL or primary_intent not in intent_scores:
            return secondary_intents

        # Sort intents by score, excluding primary
        sorted_intents = sorted(
            [
                (intent, score)
                for intent, score in intent_scores.items()
                if intent != primary_intent
            ],
            key=lambda x: x[1],
            reverse=True,
        )

        # Include intents with score > 30% of primary intent score
        primary_score = intent_scores[primary_intent]
        threshold = primary_score * 0.3

        for intent, score in sorted_intents[:3]:  # Max 3 secondary intents
            if score >= threshold:
                confidence = score / max(sum(intent_scores.values()), 1.0)
                secondary_intents.append((intent, confidence))

        return secondary_intents
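
    # Example: with a primary score of 0.6, the inclusion threshold is
    # 0.6 * 0.3 = 0.18; up to three remaining intents scoring at or above
    # 0.18 are kept, each with confidence = score / max(sum(all scores), 1.0).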

    def _build_evidence(
        self,
        spacy_analysis,
        linguistic_features: dict[str, Any],
        intent_scores: dict[IntentType, float],
    ) -> dict[str, Any]:
        """Build supporting evidence for the intent classification."""

        return {
            "spacy_processing_time": spacy_analysis.processing_time_ms,
            "query_complexity": spacy_analysis.complexity_score,
            "semantic_keywords": spacy_analysis.semantic_keywords[:5],  # Top 5
            "extracted_entities": [ent[0] for ent in spacy_analysis.entities[:3]],  # Top 3
            "main_concepts": spacy_analysis.main_concepts[:3],  # Top 3
            "intent_signals": spacy_analysis.intent_signals,
            "linguistic_features": {
                "technical_indicators": linguistic_features.get("technical_indicators", 0),
                "business_indicators": linguistic_features.get("business_indicators", 0),
                "procedural_indicators": linguistic_features.get("procedural_indicators", 0),
                "problem_indicators": linguistic_features.get("problem_indicators", 0),
            },
            "top_intent_scores": dict(
                sorted(intent_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            ),
        }

    def clear_cache(self):
        """Clear intent classification cache."""
        self._intent_cache.clear()
        logger.debug("Cleared intent classification cache")

    def get_cache_stats(self) -> dict[str, int]:
        """Get cache statistics."""
        return {
            "intent_cache_size": len(self._intent_cache),
        }