Coverage for src/qdrant_loader_mcp_server/search/enhanced/faceted_search.py: 84%
336 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2This module provides intelligent faceted search capabilities that leverage the rich
3metadata extracted during document ingestion. It dynamically generates facets from
4HybridSearchResult metadata and provides filtering and refinement capabilities.
6Key Features:
7- Dynamic facet generation from metadata
8- Intelligent facet grouping and sorting
9- Multi-facet filtering with AND/OR logic
10- Real-time facet value counting
11- Smart facet suggestions based on query context
12"""
14import logging
15from collections import Counter
16from dataclasses import dataclass, field
17from datetime import datetime
18from enum import Enum
19from typing import Any
21from ..components.search_result_models import HybridSearchResult
23logger = logging.getLogger(__name__)
class FacetType(Enum):
    """Enumerates the facet dimensions that search results can be filtered by.

    Each member's value is the string identifier used as the facet's name.
    """

    # --- content ---
    CONTENT_TYPE = "content_type"
    SOURCE_TYPE = "source_type"
    FILE_TYPE = "file_type"
    HAS_FEATURES = "has_features"

    # --- position in the document hierarchy ---
    HIERARCHY_DEPTH = "hierarchy_depth"
    SECTION_LEVEL = "section_level"
    SECTION_TYPE = "section_type"

    # --- project / organization ---
    PROJECT = "project"
    COLLECTION = "collection"
    REPOSITORY = "repository"

    # --- semantic metadata ---
    ENTITIES = "entities"
    ENTITY_TYPES = "entity_types"
    TOPICS = "topics"
    KEY_PHRASES = "key_phrases"

    # --- content size ---
    READ_TIME = "read_time"
    WORD_COUNT = "word_count"
    FILE_SIZE = "file_size"

    # --- document structure / processing ---
    ATTACHMENT_TYPE = "attachment_type"
    CONVERSION_TYPE = "conversion_type"
    CHUNKING_STRATEGY = "chunking_strategy"
62@dataclass
63class FacetValue:
64 """A single facet value with count and metadata."""
66 value: str
67 count: int
68 display_name: str
69 description: str | None = None
70 metadata: dict[str, Any] = field(default_factory=dict)
72 def __str__(self) -> str:
73 return f"{self.display_name} ({self.count})"
@dataclass
class Facet:
    """A facet: its type, its values, and how those values are presented."""

    facet_type: FacetType
    name: str
    display_name: str
    values: list[FacetValue]
    description: str | None = None
    is_multi_select: bool = True
    is_hierarchical: bool = False
    sort_by: str = "count"  # "count", "name", "relevance"
    max_visible: int = 10

    def get_top_values(self, limit: int | None = None) -> list[FacetValue]:
        """Return the leading facet values under the configured ordering.

        Args:
            limit: Maximum number of values to return. ``None`` falls back to
                ``max_visible``; an explicit ``0`` yields an empty list.

        Returns:
            A new list, sorted alphabetically (case-insensitive) by display
            name when ``sort_by`` is ``"name"``, otherwise by descending count.
        """
        cap = self.max_visible if limit is None else limit

        if self.sort_by == "name":
            ordered = sorted(self.values, key=lambda item: item.display_name.lower())
        else:
            # "count" and "relevance" (and anything else) share the same
            # ordering for now: highest count first.
            ordered = sorted(self.values, key=lambda item: item.count, reverse=True)

        return ordered[:cap]
@dataclass
class FacetFilter:
    """A filter applied to search results based on facet selections.

    The filter extracts, from a result, the values that belong to
    ``facet_type`` and compares them with the selected ``values``. Extraction
    follows the same rules the facet generator in this module uses to build
    facet values, so any value offered by a generated facet can be filtered on.
    """

    # Facet whose values are inspected on each result.
    facet_type: FacetType
    # Selected facet values; compared verbatim with the extracted values.
    values: list[str]
    # "OR" (case-insensitive): any selected value suffices.
    # Anything else: every selected value must be present ("AND").
    operator: str = "OR"

    def matches(self, result: HybridSearchResult) -> bool:
        """Check if a search result matches this facet filter.

        With an empty ``values`` list an OR filter matches nothing and an AND
        filter matches everything (the semantics of ``any`` / ``all``).

        Args:
            result: Search result to test.

        Returns:
            True when the result satisfies the filter.
        """
        result_values = self._extract_values_from_result(result)

        # Normalize case so "or" is not silently treated as AND.
        if str(self.operator).upper() == "OR":
            return any(value in result_values for value in self.values)
        return all(value in result_values for value in self.values)

    @staticmethod
    def _normalized_texts(items: Any) -> list[str]:
        """Return lower-cased, stripped text of str items or dicts with "text"."""
        texts = []
        for item in items or []:
            text = item.get("text") if isinstance(item, dict) else item
            # Skip entries without usable text instead of raising.
            if isinstance(text, str):
                texts.append(text.lower().strip())
        return texts

    def _extract_values_from_result(self, result: HybridSearchResult) -> list[str]:
        """Extract values for this facet type from a search result.

        Returns an empty list when the result carries no value for the facet
        or the facet type is not handled.
        """
        facet_type = self.facet_type

        # Content type and source type are both read from ``source_type``.
        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            flags = (
                ("code", result.has_code_blocks),
                ("tables", result.has_tables),
                ("images", result.has_images),
                ("links", result.has_links),
                ("attachment", result.is_attachment),
            )
            return [name for name, present in flags if present]

        if facet_type == FacetType.PROJECT:
            # The project facet also lists the collection name when it differs.
            names = []
            if result.project_name:
                names.append(result.project_name)
            if result.collection_name and result.collection_name != result.project_name:
                names.append(result.collection_name)
            return names

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            return self._normalized_texts(result.entities)

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities or []
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            return self._normalized_texts(result.topics)

        if facet_type == FacetType.KEY_PHRASES:
            return self._normalized_texts(result.key_phrases)

        if facet_type == FacetType.HIERARCHY_DEPTH:
            if result.depth is None:
                return []
            if result.depth <= 2:
                return ["shallow"]
            if result.depth <= 4:
                return ["medium"]
            return ["deep"]

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            if result.estimated_read_time is None:
                return []
            if result.estimated_read_time <= 2:
                return ["quick"]
            if result.estimated_read_time <= 10:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.WORD_COUNT:
            if result.word_count is None:
                return []
            if result.word_count <= 100:
                return ["short"]
            if result.word_count <= 500:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        return []
@dataclass
class FacetedSearchResults:
    """Bundle of search results, the facets built for them, and filter state."""

    results: list[HybridSearchResult]
    facets: list[Facet]
    applied_filters: list[FacetFilter]
    total_results: int
    filtered_count: int
    generation_time_ms: float

    def get_facet(self, facet_type: FacetType) -> Facet | None:
        """Return the first facet whose type equals ``facet_type``, else None."""
        for candidate in self.facets:
            if candidate.facet_type == facet_type:
                return candidate
        return None

    def has_active_filters(self) -> bool:
        """Tell whether at least one filter is applied."""
        return len(self.applied_filters) != 0
class DynamicFacetGenerator:
    """
    Dynamic Facet Generator

    Analyzes HybridSearchResult metadata to dynamically generate facets for
    filtering and exploration. For every facet type listed in ``facet_config``
    it extracts the values carried by each result, counts how many results
    carry each value, and wraps the counts in ``Facet`` / ``FacetValue`` objects.
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Configuration for facet generation. Only the facet types listed here
        # are generated. "max_values" caps the values kept per facet and
        # "min_count" is the minimum count a value needs to be kept.
        self.facet_config = {
            FacetType.CONTENT_TYPE: {
                "display_name": "Content Type",
                "description": "Type of content source",
                "max_values": 10,
                "min_count": 1,
            },
            FacetType.HAS_FEATURES: {
                "display_name": "Content Features",
                "description": "Features present in the content",
                "max_values": 8,
                "min_count": 1,
            },
            FacetType.PROJECT: {
                "display_name": "Project",
                "description": "Project or workspace",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.REPOSITORY: {
                "display_name": "Repository",
                "description": "Source repository or code host",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.ENTITIES: {
                "display_name": "Entities",
                "description": "Named entities found in content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.ENTITY_TYPES: {
                "display_name": "Entity Types",
                "description": "Types of named entities",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.TOPICS: {
                "display_name": "Topics",
                "description": "Topics and themes",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.KEY_PHRASES: {
                "display_name": "Key Phrases",
                "description": "Key phrases extracted from content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.HIERARCHY_DEPTH: {
                "display_name": "Content Depth",
                "description": "Hierarchical depth in document structure",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.READ_TIME: {
                "display_name": "Reading Time",
                "description": "Estimated time to read",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.FILE_TYPE: {
                "display_name": "File Type",
                "description": "Original file type or format",
                "max_values": 10,
                "min_count": 1,
            },
        }

    def generate_facets(self, search_results: list[HybridSearchResult]) -> list[Facet]:
        """
        Generate dynamic facets from search results metadata.

        Args:
            search_results: List of search results to analyze

        Returns:
            List of generated facets with counts, ordered by priority. Empty
            when ``search_results`` is empty.
        """
        start_time = datetime.now()

        if not search_results:
            return []

        facets = []

        # Generate each configured facet type; facets without values are dropped.
        for facet_type, config in self.facet_config.items():
            facet = self._generate_facet(facet_type, search_results, config)
            if facet is not None and facet.values:
                facets.append(facet)

        # Sort facets by priority (most useful first)
        facets = self._sort_facets_by_priority(facets, search_results)

        generation_time = (datetime.now() - start_time).total_seconds() * 1000
        # %-style arguments defer string formatting until the record is emitted.
        self.logger.debug(
            "Generated %d facets in %.2fms", len(facets), generation_time
        )

        return facets

    def _generate_facet(
        self,
        facet_type: FacetType,
        search_results: list[HybridSearchResult],
        config: dict[str, Any],
    ) -> Facet | None:
        """Generate a specific facet from search results.

        Args:
            facet_type: Facet to build.
            search_results: Results whose metadata is counted.
            config: Settings for this facet ("display_name" is required;
                "description", "max_values" and "min_count" are optional).

        Returns:
            The facet, or None when no value reaches ``min_count``.
        """
        value_counts: Counter = Counter()

        for result in search_results:
            values = self._extract_facet_values(result, facet_type)
            # dict.fromkeys drops repeats while keeping first-seen order, so a
            # value occurring several times inside one result is counted once
            # and the count equals the number of results carrying the value.
            for value in dict.fromkeys(values):
                if value:  # Skip empty values
                    value_counts[value] += 1

        # Filter by minimum count
        min_count = config.get("min_count", 1)
        filtered_counts = {k: v for k, v in value_counts.items() if v >= min_count}

        if not filtered_counts:
            return None

        facet_values = [
            FacetValue(
                value=value,
                count=count,
                display_name=self._get_display_name(facet_type, value),
                description=self._get_value_description(facet_type, value),
            )
            for value, count in filtered_counts.items()
        ]

        # Keep only the most frequent values; the sort is stable, so ties stay
        # in first-seen order.
        max_values = config.get("max_values", 10)
        facet_values = sorted(facet_values, key=lambda v: v.count, reverse=True)[
            :max_values
        ]

        return Facet(
            facet_type=facet_type,
            name=facet_type.value,
            display_name=config["display_name"],
            description=config.get("description"),
            values=facet_values,
            sort_by="count",
        )

    @staticmethod
    def _normalized_texts(items: Any) -> list[str]:
        """Return lower-cased, stripped text of str items or dicts with "text"."""
        texts = []
        for item in items or []:
            text = item.get("text") if isinstance(item, dict) else item
            # Skip entries without usable text instead of raising.
            if isinstance(text, str):
                texts.append(text.lower().strip())
        return texts

    def _extract_facet_values(
        self, result: HybridSearchResult, facet_type: FacetType
    ) -> list[str]:
        """Extract values for a specific facet type from a search result.

        Returns an empty list when the result carries no value for the facet
        or the facet type is not handled.
        """
        # Content type and source type are both read from ``source_type``.
        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            flags = (
                ("code", result.has_code_blocks),
                ("tables", result.has_tables),
                ("images", result.has_images),
                ("links", result.has_links),
                ("attachment", result.is_attachment),
            )
            return [name for name, present in flags if present]

        if facet_type == FacetType.PROJECT:
            # The collection name is listed too when it differs from the project.
            values = []
            if result.project_name:
                values.append(result.project_name)
            if result.collection_name and result.collection_name != result.project_name:
                values.append(result.collection_name)
            return values

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            # Filter very short entities
            return [e for e in self._normalized_texts(result.entities) if len(e) >= 2]

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities or []
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            # Filter short topics
            return [t for t in self._normalized_texts(result.topics) if len(t) > 2]

        if facet_type == FacetType.KEY_PHRASES:
            # Filter short phrases
            return [p for p in self._normalized_texts(result.key_phrases) if len(p) > 3]

        if facet_type == FacetType.HIERARCHY_DEPTH:
            if result.depth is None:
                return []
            if result.depth <= 2:
                return ["shallow"]
            if result.depth <= 4:
                return ["medium"]
            return ["deep"]

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            if result.estimated_read_time is None:
                return []
            if result.estimated_read_time <= 2:
                return ["quick"]
            if result.estimated_read_time <= 10:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.WORD_COUNT:
            if result.word_count is None:
                return []
            if result.word_count <= 100:
                return ["short"]
            if result.word_count <= 500:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        return []

    def _get_display_name(self, facet_type: FacetType, value: str) -> str:
        """Get a human-readable display name for a facet value."""

        # Custom display names for specific facet types; unknown values of
        # these facets fall back to plain title-casing.
        if facet_type == FacetType.HAS_FEATURES:
            feature_names = {
                "code": "Code Blocks",
                "tables": "Tables",
                "images": "Images",
                "links": "Links",
                "attachment": "Attachments",
            }
            return feature_names.get(value, value.title())

        elif facet_type == FacetType.HIERARCHY_DEPTH:
            depth_names = {
                "shallow": "Shallow (1-2 levels)",
                "medium": "Medium (3-4 levels)",
                "deep": "Deep (5+ levels)",
            }
            return depth_names.get(value, value.title())

        elif facet_type == FacetType.READ_TIME:
            time_names = {
                "quick": "Quick Read (≤2 min)",
                "medium": "Medium Read (3-10 min)",
                "long": "Long Read (10+ min)",
            }
            return time_names.get(value, value.title())

        elif facet_type == FacetType.WORD_COUNT:
            count_names = {
                "short": "Short (≤100 words)",
                "medium": "Medium (101-500 words)",
                "long": "Long (500+ words)",
            }
            return count_names.get(value, value.title())

        # Default: underscores become spaces and every word is title-cased.
        return value.replace("_", " ").title()

    def _get_value_description(self, facet_type: FacetType, value: str) -> str | None:
        """Get a description for a facet value, or None when there is none."""

        if facet_type == FacetType.HAS_FEATURES:
            descriptions = {
                "code": "Contains code blocks or snippets",
                "tables": "Contains structured data tables",
                "images": "Contains images or diagrams",
                "links": "Contains hyperlinks",
                "attachment": "File attachments",
            }
            return descriptions.get(value)

        return None

    def _sort_facets_by_priority(
        self, facets: list[Facet], search_results: list[HybridSearchResult]
    ) -> list[Facet]:
        """Sort facets by priority/usefulness for the current result set.

        ``search_results`` is accepted for interface compatibility but is not
        consulted by the current ordering.
        """

        # Priority order - most useful facets first
        priority_order = [
            FacetType.CONTENT_TYPE,
            FacetType.PROJECT,
            FacetType.HAS_FEATURES,
            FacetType.ENTITIES,
            FacetType.TOPICS,
            FacetType.READ_TIME,
            FacetType.HIERARCHY_DEPTH,
            FacetType.FILE_TYPE,
            FacetType.SECTION_TYPE,
        ]

        priority_map = {facet_type: i for i, facet_type in enumerate(priority_order)}

        # Facet types missing from the priority list sort last (999); ties are
        # broken by the number of values, largest first.
        def facet_sort_key(facet: Facet) -> tuple[int, int]:
            priority = priority_map.get(facet.facet_type, 999)
            return (priority, -len(facet.values))

        return sorted(facets, key=facet_sort_key)
class FacetedSearchEngine:
    """
    Faceted Search Engine

    Filters search results with facet filters, builds facets for a result set,
    and proposes further refinements.
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.facet_generator = DynamicFacetGenerator()

    def apply_facet_filters(
        self, results: list[HybridSearchResult], filters: list[FacetFilter]
    ) -> list[HybridSearchResult]:
        """
        Keep only the results accepted by every filter.

        Args:
            results: Original search results
            filters: Facet filters to apply; they are combined with AND

        Returns:
            ``results`` itself when there are no filters, otherwise a new list
            holding the matching results in their original order
        """
        if not filters:
            return results

        # all() stops at the first filter that rejects a result.
        return [
            candidate
            for candidate in results
            if all(active.matches(candidate) for active in filters)
        ]

    def generate_faceted_results(
        self,
        results: list[HybridSearchResult],
        applied_filters: list[FacetFilter] | None = None,
    ) -> FacetedSearchResults:
        """
        Filter the results and build facets in a single call.

        Args:
            results: Original search results
            applied_filters: Currently applied filters (None means no filters)

        Returns:
            FacetedSearchResults holding the filtered results, the facets and
            the elapsed time in milliseconds
        """
        started = datetime.now()

        active_filters = applied_filters or []
        matching = self.apply_facet_filters(results, active_filters)

        # Facets come from the unfiltered results so every available filter
        # option stays visible after filtering.
        facets = self.facet_generator.generate_facets(results)

        elapsed_ms = (datetime.now() - started).total_seconds() * 1000

        return FacetedSearchResults(
            results=matching,
            facets=facets,
            applied_filters=active_filters,
            total_results=len(results),
            filtered_count=len(matching),
            generation_time_ms=elapsed_ms,
        )

    def create_filter_from_selection(
        self, facet_type: FacetType, selected_values: list[str], operator: str = "OR"
    ) -> FacetFilter:
        """Build a FacetFilter for the given facet, values and operator."""
        return FacetFilter(
            facet_type=facet_type, values=selected_values, operator=operator
        )

    def suggest_refinements(
        self,
        current_results: list[HybridSearchResult],
        current_filters: list[FacetFilter],
    ) -> list[dict[str, Any]]:
        """
        Propose facet values that would noticeably narrow the current results.

        For each facet not already filtered on, the top three values are tried
        as single-value filters. A value is suggested when it keeps at least
        one result and fewer than 80% of the current results.

        Returns:
            Up to five suggestions, largest reduction first
        """
        total = len(current_results)
        proposals = []

        for facet in self.facet_generator.generate_facets(current_results):
            already_filtered = any(
                f.facet_type == facet.facet_type for f in current_filters
            )
            if already_filtered:
                continue

            for candidate in facet.get_top_values(3):
                probe = FacetFilter(facet.facet_type, [candidate.value])
                remaining = len(self.apply_facet_filters(current_results, [probe]))

                # Require a non-empty outcome that removes more than 20%.
                if not (0 < remaining < total * 0.8):
                    continue

                proposals.append(
                    {
                        "facet_type": facet.facet_type.value,
                        "facet_display_name": facet.display_name,
                        "value": candidate.value,
                        "display_name": candidate.display_name,
                        "current_count": total,
                        "filtered_count": remaining,
                        "reduction_percent": round((1 - remaining / total) * 100),
                    }
                )

        # Highest reduction first; the sort is stable for equal reductions.
        proposals.sort(key=lambda item: item["reduction_percent"], reverse=True)

        return proposals[:5]