Coverage for src/qdrant_loader_mcp_server/search/engine/faceted.py: 64%
110 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2Faceted Search Operations.
4This module implements faceted search functionality with dynamic
5facet generation and interactive filtering capabilities.
6"""
8from typing import TYPE_CHECKING
10if TYPE_CHECKING:
11 from .core import SearchEngine
13from ...utils.logging import LoggingConfig
14from ..components.search_result_models import HybridSearchResult
16logger = LoggingConfig.get_logger(__name__)
19class FacetedSearchOperations:
20 """Handles faceted search operations."""
22 def __init__(self, engine: "SearchEngine"):
23 """Initialize with search engine reference."""
24 self.engine = engine
25 self.logger = LoggingConfig.get_logger(__name__)
27 async def search_with_facets(
28 self,
29 query: str,
30 limit: int = 5,
31 source_types: list[str] | None = None,
32 project_ids: list[str] | None = None,
33 facet_filters: list[dict] | None = None,
34 ) -> dict:
35 """
36 Perform faceted search with dynamic facet generation.
38 Returns search results with generated facets for interactive filtering.
40 Args:
41 query: Search query
42 limit: Maximum number of results to return
43 source_types: Optional list of source types to filter by
44 project_ids: Optional list of project IDs to filter by
45 facet_filters: Optional list of facet filters to apply
47 Returns:
48 Dictionary containing:
49 - results: List of search results
50 - facets: List of generated facets with counts
51 - total_results: Total results before facet filtering
52 - filtered_count: Results after facet filtering
53 - applied_filters: Currently applied facet filters
54 """
55 if not self.engine.hybrid_search:
56 raise RuntimeError("Search engine not initialized")
58 try:
59 # Convert facet filter dictionaries to FacetFilter objects if provided
60 filter_objects = []
61 if facet_filters:
62 from ..enhanced.faceted_search import FacetFilter, FacetType
64 for filter_dict in facet_filters:
65 try:
66 facet_type = FacetType(filter_dict["facet_type"])
67 except (ValueError, TypeError) as e:
68 self.logger.warning(
69 "Invalid facet_type provided; skipping facet filter",
70 facet_type=str(filter_dict.get("facet_type")),
71 error=str(e),
72 exc_info=True,
73 )
74 continue
76 # Validate and normalize values
77 values_raw = filter_dict.get("values")
78 if not values_raw:
79 self.logger.warning(
80 "Missing or empty 'values' for facet filter; skipping",
81 facet_type=facet_type.value,
82 )
83 continue
84 if isinstance(values_raw, set | tuple):
85 values = list(values_raw)
86 elif isinstance(values_raw, list):
87 values = values_raw
88 else:
89 values = [values_raw]
91 # Validate operator
92 allowed_operators = {"OR", "AND"}
93 operator = str(filter_dict.get("operator", "OR")).upper()
94 if operator not in allowed_operators:
95 self.logger.warning(
96 "Invalid operator for facet filter; defaulting to 'OR'",
97 operator=str(filter_dict.get("operator")),
98 )
99 operator = "OR"
101 filter_objects.append(
102 FacetFilter(
103 facet_type=facet_type,
104 values=values,
105 operator=operator,
106 )
107 )
109 faceted_results = await self.engine.hybrid_search.search_with_facets(
110 query=query,
111 limit=limit,
112 source_types=source_types,
113 project_ids=project_ids,
114 facet_filters=filter_objects,
115 generate_facets=True,
116 )
118 # Convert to MCP-friendly format
119 return {
120 "results": faceted_results.results,
121 "facets": [
122 {
123 "type": facet.facet_type.value,
124 "name": facet.name,
125 "display_name": facet.display_name,
126 "description": facet.description,
127 "values": [
128 {
129 "value": fv.value,
130 "count": fv.count,
131 "display_name": fv.display_name,
132 "description": fv.description,
133 }
134 for fv in facet.get_top_values(10)
135 ],
136 }
137 for facet in faceted_results.facets
138 ],
139 "total_results": faceted_results.total_results,
140 "filtered_count": faceted_results.filtered_count,
141 "applied_filters": [
142 {
143 "facet_type": f.facet_type.value,
144 "values": f.values,
145 "operator": f.operator,
146 }
147 for f in faceted_results.applied_filters
148 ],
149 "generation_time_ms": faceted_results.generation_time_ms,
150 }
152 except Exception as e:
153 self.logger.error("Faceted search failed", error=str(e), query=query)
154 raise
156 async def get_facet_suggestions(
157 self,
158 documents: list[HybridSearchResult],
159 max_facets_per_type: int = 5,
160 enable_dynamic_generation: bool = True,
161 ) -> dict:
162 """
163 Generate facet suggestions from a collection of documents.
165 Analyzes document metadata to suggest useful facets for filtering.
167 Args:
168 documents: List of documents to analyze
169 max_facets_per_type: Maximum facets to generate per type
170 enable_dynamic_generation: Whether to enable AI-powered facet generation
172 Returns:
173 Dictionary containing:
174 - suggested_facets: List of facet suggestions with metadata
175 - facet_coverage: Coverage statistics for each facet type
176 - generation_metadata: Information about facet generation process
177 """
178 if not self.engine.hybrid_search:
179 raise RuntimeError("Search engine not initialized")
181 try:
182 # Use the hybrid search engine to generate facet suggestions
183 from ..enhanced.faceted_search import DynamicFacetGenerator
185 facet_generator = DynamicFacetGenerator()
187 suggestions = await facet_generator.generate_facets_from_documents(
188 documents=documents,
189 max_facets_per_type=max_facets_per_type,
190 enable_ai_generation=enable_dynamic_generation,
191 )
193 # Calculate coverage statistics
194 coverage_stats = self._calculate_facet_coverage(documents, suggestions)
196 return {
197 "suggested_facets": [
198 {
199 "type": facet.facet_type.value,
200 "name": facet.name,
201 "display_name": facet.display_name,
202 "description": facet.description,
203 "coverage_percentage": coverage_stats.get(facet.name, 0),
204 "unique_values": len(facet.values),
205 "top_values": [
206 {
207 "value": fv.value,
208 "count": fv.count,
209 "display_name": fv.display_name,
210 }
211 for fv in facet.get_top_values(5)
212 ],
213 }
214 for facet in suggestions.facets
215 ],
216 "facet_coverage": coverage_stats,
217 "generation_metadata": {
218 "total_documents_analyzed": len(documents),
219 "facet_types_generated": len(
220 {f.facet_type for f in suggestions.facets}
221 ),
222 "total_facets_generated": len(suggestions.facets),
223 "generation_time_ms": suggestions.generation_time_ms,
224 "ai_generation_enabled": enable_dynamic_generation,
225 },
226 }
228 except Exception as e:
229 self.logger.error("Facet suggestion generation failed", error=str(e))
230 raise
232 def _calculate_facet_coverage(
233 self, documents: list[HybridSearchResult], suggestions
234 ) -> dict[str, float]:
235 """Calculate coverage statistics for generated facets."""
236 if not documents:
237 return {}
239 coverage_stats = {}
240 total_docs = len(documents)
242 for facet in suggestions.facets:
243 # Calculate how many documents have values for this facet
244 covered_count = 0
245 facet_key = str(facet.name).lower()
247 # Explicit normalized facet key to document attribute mapping
248 facet_to_attrs: dict[str, tuple[str, ...]] = {
249 "source": ("source_type", "source"),
250 "project": ("project_id", "project_ids", "project"),
251 "created": ("created_at", "created", "timestamp"),
252 "date": ("created_at", "updated_at", "date"),
253 "content_type": ("content_type", "mime_type", "type"),
254 "topic": ("topics", "tags", "labels"),
255 "entity": ("entities", "named_entities"),
256 }
258 # Normalize a few common variants to our keys
259 normalized_key = facet_key
260 if facet_key in {"source_type", "source types", "sources"}:
261 normalized_key = "source"
262 elif facet_key in {"projects", "project id", "project ids"}:
263 normalized_key = "project"
264 elif facet_key in {"created at", "creation date", "time", "datetime"}:
265 normalized_key = "created"
266 elif facet_key in {"content", "type", "mime"}:
267 normalized_key = "content_type"
268 elif facet_key in {"topics", "labels", "tags"}:
269 normalized_key = "topic"
270 elif facet_key in {"entities", "ner"}:
271 normalized_key = "entity"
273 mapped_attrs = facet_to_attrs.get(normalized_key)
275 for doc in documents:
276 has_value = False
278 if mapped_attrs:
279 for attr in mapped_attrs:
280 value = getattr(doc, attr, None)
281 if value is None and isinstance(doc, dict):
282 value = doc.get(attr)
283 # Treat iterables specially: non-empty list/tuple/set/etc counts
284 if isinstance(value, list | tuple | set):
285 if len(value) > 0:
286 has_value = True
287 break
288 else:
289 if bool(value):
290 has_value = True
291 break
292 else:
293 # Fallback: check metadata mapping if present; otherwise conservative False
294 metadata = getattr(doc, "metadata", None)
295 if metadata is None and isinstance(doc, dict):
296 metadata = doc.get("metadata")
297 if isinstance(metadata, dict):
298 # Try direct key, or normalized variations
299 value = metadata.get(facet_key)
300 if value is None:
301 value = metadata.get(normalized_key)
302 if value is None:
303 # Try common singular/plural variants
304 if facet_key.endswith("s"):
305 value = metadata.get(facet_key[:-1])
306 else:
307 value = metadata.get(f"{facet_key}s")
308 if isinstance(value, list | tuple | set):
309 has_value = len(value) > 0
310 else:
311 has_value = bool(value)
312 else:
313 has_value = False
315 if has_value:
316 covered_count += 1
318 coverage_percentage = (covered_count / total_docs) * 100
319 coverage_stats[facet.name] = round(coverage_percentage, 1)
321 return coverage_stats