Coverage for src/qdrant_loader_mcp_server/search/engine/faceted.py: 64%

110 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1""" 

2Faceted Search Operations. 

3 

4This module implements faceted search functionality with dynamic 

5facet generation and interactive filtering capabilities. 

6""" 

7 

8from typing import TYPE_CHECKING 

9 

10if TYPE_CHECKING: 

11 from .core import SearchEngine 

12 

13from ...utils.logging import LoggingConfig 

14from ..components.search_result_models import HybridSearchResult 

15 

16logger = LoggingConfig.get_logger(__name__) 

17 

18 

19class FacetedSearchOperations: 

20 """Handles faceted search operations.""" 

21 

22 def __init__(self, engine: "SearchEngine"): 

23 """Initialize with search engine reference.""" 

24 self.engine = engine 

25 self.logger = LoggingConfig.get_logger(__name__) 

26 

27 async def search_with_facets( 

28 self, 

29 query: str, 

30 limit: int = 5, 

31 source_types: list[str] | None = None, 

32 project_ids: list[str] | None = None, 

33 facet_filters: list[dict] | None = None, 

34 ) -> dict: 

35 """ 

36 Perform faceted search with dynamic facet generation. 

37 

38 Returns search results with generated facets for interactive filtering. 

39 

40 Args: 

41 query: Search query 

42 limit: Maximum number of results to return 

43 source_types: Optional list of source types to filter by 

44 project_ids: Optional list of project IDs to filter by 

45 facet_filters: Optional list of facet filters to apply 

46 

47 Returns: 

48 Dictionary containing: 

49 - results: List of search results 

50 - facets: List of generated facets with counts 

51 - total_results: Total results before facet filtering 

52 - filtered_count: Results after facet filtering 

53 - applied_filters: Currently applied facet filters 

54 """ 

55 if not self.engine.hybrid_search: 

56 raise RuntimeError("Search engine not initialized") 

57 

58 try: 

59 # Convert facet filter dictionaries to FacetFilter objects if provided 

60 filter_objects = [] 

61 if facet_filters: 

62 from ..enhanced.faceted_search import FacetFilter, FacetType 

63 

64 for filter_dict in facet_filters: 

65 try: 

66 facet_type = FacetType(filter_dict["facet_type"]) 

67 except (ValueError, TypeError) as e: 

68 self.logger.warning( 

69 "Invalid facet_type provided; skipping facet filter", 

70 facet_type=str(filter_dict.get("facet_type")), 

71 error=str(e), 

72 exc_info=True, 

73 ) 

74 continue 

75 

76 # Validate and normalize values 

77 values_raw = filter_dict.get("values") 

78 if not values_raw: 

79 self.logger.warning( 

80 "Missing or empty 'values' for facet filter; skipping", 

81 facet_type=facet_type.value, 

82 ) 

83 continue 

84 if isinstance(values_raw, set | tuple): 

85 values = list(values_raw) 

86 elif isinstance(values_raw, list): 

87 values = values_raw 

88 else: 

89 values = [values_raw] 

90 

91 # Validate operator 

92 allowed_operators = {"OR", "AND"} 

93 operator = str(filter_dict.get("operator", "OR")).upper() 

94 if operator not in allowed_operators: 

95 self.logger.warning( 

96 "Invalid operator for facet filter; defaulting to 'OR'", 

97 operator=str(filter_dict.get("operator")), 

98 ) 

99 operator = "OR" 

100 

101 filter_objects.append( 

102 FacetFilter( 

103 facet_type=facet_type, 

104 values=values, 

105 operator=operator, 

106 ) 

107 ) 

108 

109 faceted_results = await self.engine.hybrid_search.search_with_facets( 

110 query=query, 

111 limit=limit, 

112 source_types=source_types, 

113 project_ids=project_ids, 

114 facet_filters=filter_objects, 

115 generate_facets=True, 

116 ) 

117 

118 # Convert to MCP-friendly format 

119 return { 

120 "results": faceted_results.results, 

121 "facets": [ 

122 { 

123 "type": facet.facet_type.value, 

124 "name": facet.name, 

125 "display_name": facet.display_name, 

126 "description": facet.description, 

127 "values": [ 

128 { 

129 "value": fv.value, 

130 "count": fv.count, 

131 "display_name": fv.display_name, 

132 "description": fv.description, 

133 } 

134 for fv in facet.get_top_values(10) 

135 ], 

136 } 

137 for facet in faceted_results.facets 

138 ], 

139 "total_results": faceted_results.total_results, 

140 "filtered_count": faceted_results.filtered_count, 

141 "applied_filters": [ 

142 { 

143 "facet_type": f.facet_type.value, 

144 "values": f.values, 

145 "operator": f.operator, 

146 } 

147 for f in faceted_results.applied_filters 

148 ], 

149 "generation_time_ms": faceted_results.generation_time_ms, 

150 } 

151 

152 except Exception as e: 

153 self.logger.error("Faceted search failed", error=str(e), query=query) 

154 raise 

155 

156 async def get_facet_suggestions( 

157 self, 

158 documents: list[HybridSearchResult], 

159 max_facets_per_type: int = 5, 

160 enable_dynamic_generation: bool = True, 

161 ) -> dict: 

162 """ 

163 Generate facet suggestions from a collection of documents. 

164 

165 Analyzes document metadata to suggest useful facets for filtering. 

166 

167 Args: 

168 documents: List of documents to analyze 

169 max_facets_per_type: Maximum facets to generate per type 

170 enable_dynamic_generation: Whether to enable AI-powered facet generation 

171 

172 Returns: 

173 Dictionary containing: 

174 - suggested_facets: List of facet suggestions with metadata 

175 - facet_coverage: Coverage statistics for each facet type 

176 - generation_metadata: Information about facet generation process 

177 """ 

178 if not self.engine.hybrid_search: 

179 raise RuntimeError("Search engine not initialized") 

180 

181 try: 

182 # Use the hybrid search engine to generate facet suggestions 

183 from ..enhanced.faceted_search import DynamicFacetGenerator 

184 

185 facet_generator = DynamicFacetGenerator() 

186 

187 suggestions = await facet_generator.generate_facets_from_documents( 

188 documents=documents, 

189 max_facets_per_type=max_facets_per_type, 

190 enable_ai_generation=enable_dynamic_generation, 

191 ) 

192 

193 # Calculate coverage statistics 

194 coverage_stats = self._calculate_facet_coverage(documents, suggestions) 

195 

196 return { 

197 "suggested_facets": [ 

198 { 

199 "type": facet.facet_type.value, 

200 "name": facet.name, 

201 "display_name": facet.display_name, 

202 "description": facet.description, 

203 "coverage_percentage": coverage_stats.get(facet.name, 0), 

204 "unique_values": len(facet.values), 

205 "top_values": [ 

206 { 

207 "value": fv.value, 

208 "count": fv.count, 

209 "display_name": fv.display_name, 

210 } 

211 for fv in facet.get_top_values(5) 

212 ], 

213 } 

214 for facet in suggestions.facets 

215 ], 

216 "facet_coverage": coverage_stats, 

217 "generation_metadata": { 

218 "total_documents_analyzed": len(documents), 

219 "facet_types_generated": len( 

220 {f.facet_type for f in suggestions.facets} 

221 ), 

222 "total_facets_generated": len(suggestions.facets), 

223 "generation_time_ms": suggestions.generation_time_ms, 

224 "ai_generation_enabled": enable_dynamic_generation, 

225 }, 

226 } 

227 

228 except Exception as e: 

229 self.logger.error("Facet suggestion generation failed", error=str(e)) 

230 raise 

231 

232 def _calculate_facet_coverage( 

233 self, documents: list[HybridSearchResult], suggestions 

234 ) -> dict[str, float]: 

235 """Calculate coverage statistics for generated facets.""" 

236 if not documents: 

237 return {} 

238 

239 coverage_stats = {} 

240 total_docs = len(documents) 

241 

242 for facet in suggestions.facets: 

243 # Calculate how many documents have values for this facet 

244 covered_count = 0 

245 facet_key = str(facet.name).lower() 

246 

247 # Explicit normalized facet key to document attribute mapping 

248 facet_to_attrs: dict[str, tuple[str, ...]] = { 

249 "source": ("source_type", "source"), 

250 "project": ("project_id", "project_ids", "project"), 

251 "created": ("created_at", "created", "timestamp"), 

252 "date": ("created_at", "updated_at", "date"), 

253 "content_type": ("content_type", "mime_type", "type"), 

254 "topic": ("topics", "tags", "labels"), 

255 "entity": ("entities", "named_entities"), 

256 } 

257 

258 # Normalize a few common variants to our keys 

259 normalized_key = facet_key 

260 if facet_key in {"source_type", "source types", "sources"}: 

261 normalized_key = "source" 

262 elif facet_key in {"projects", "project id", "project ids"}: 

263 normalized_key = "project" 

264 elif facet_key in {"created at", "creation date", "time", "datetime"}: 

265 normalized_key = "created" 

266 elif facet_key in {"content", "type", "mime"}: 

267 normalized_key = "content_type" 

268 elif facet_key in {"topics", "labels", "tags"}: 

269 normalized_key = "topic" 

270 elif facet_key in {"entities", "ner"}: 

271 normalized_key = "entity" 

272 

273 mapped_attrs = facet_to_attrs.get(normalized_key) 

274 

275 for doc in documents: 

276 has_value = False 

277 

278 if mapped_attrs: 

279 for attr in mapped_attrs: 

280 value = getattr(doc, attr, None) 

281 if value is None and isinstance(doc, dict): 

282 value = doc.get(attr) 

283 # Treat iterables specially: non-empty list/tuple/set/etc counts 

284 if isinstance(value, list | tuple | set): 

285 if len(value) > 0: 

286 has_value = True 

287 break 

288 else: 

289 if bool(value): 

290 has_value = True 

291 break 

292 else: 

293 # Fallback: check metadata mapping if present; otherwise conservative False 

294 metadata = getattr(doc, "metadata", None) 

295 if metadata is None and isinstance(doc, dict): 

296 metadata = doc.get("metadata") 

297 if isinstance(metadata, dict): 

298 # Try direct key, or normalized variations 

299 value = metadata.get(facet_key) 

300 if value is None: 

301 value = metadata.get(normalized_key) 

302 if value is None: 

303 # Try common singular/plural variants 

304 if facet_key.endswith("s"): 

305 value = metadata.get(facet_key[:-1]) 

306 else: 

307 value = metadata.get(f"{facet_key}s") 

308 if isinstance(value, list | tuple | set): 

309 has_value = len(value) > 0 

310 else: 

311 has_value = bool(value) 

312 else: 

313 has_value = False 

314 

315 if has_value: 

316 covered_count += 1 

317 

318 coverage_percentage = (covered_count / total_docs) * 100 

319 coverage_stats[facet.name] = round(coverage_percentage, 1) 

320 

321 return coverage_stats