Coverage for src/qdrant_loader_mcp_server/search/enhanced/faceted_search.py: 84%

336 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1""" 

2This module provides intelligent faceted search capabilities that leverage the rich 

3metadata extracted during document ingestion. It dynamically generates facets from 

4HybridSearchResult metadata and provides filtering and refinement capabilities. 

5 

6Key Features: 

7- Dynamic facet generation from metadata 

8- Intelligent facet grouping and sorting 

9- Multi-facet filtering with AND/OR logic 

10- Real-time facet value counting 

11- Smart facet suggestions based on query context 

12""" 

13 

14import logging 

15from collections import Counter 

16from dataclasses import dataclass, field 

17from datetime import datetime 

18from enum import Enum 

19from typing import Any 

20 

21from ..components.search_result_models import HybridSearchResult 

22 

23logger = logging.getLogger(__name__) 

24 

25 

class FacetType(Enum):
    """Enumeration of every facet kind a result set can be sliced by.

    Each member's value is the stable string identifier used as the facet's
    ``name`` when a facet is generated.
    """

    # --- Facets describing the content itself ---
    CONTENT_TYPE = "content_type"
    SOURCE_TYPE = "source_type"
    FILE_TYPE = "file_type"
    HAS_FEATURES = "has_features"

    # --- Facets describing position in a document hierarchy ---
    HIERARCHY_DEPTH = "hierarchy_depth"
    SECTION_LEVEL = "section_level"
    SECTION_TYPE = "section_type"

    # --- Facets describing project / organisation ownership ---
    PROJECT = "project"
    COLLECTION = "collection"
    REPOSITORY = "repository"

    # --- Facets derived from semantic analysis ---
    ENTITIES = "entities"
    ENTITY_TYPES = "entity_types"
    TOPICS = "topics"
    KEY_PHRASES = "key_phrases"

    # --- Facets describing content size ---
    READ_TIME = "read_time"
    WORD_COUNT = "word_count"
    FILE_SIZE = "file_size"

    # --- Facets describing document structure / processing ---
    ATTACHMENT_TYPE = "attachment_type"
    CONVERSION_TYPE = "conversion_type"
    CHUNKING_STRATEGY = "chunking_strategy"

60 

61 

62@dataclass 

63class FacetValue: 

64 """A single facet value with count and metadata.""" 

65 

66 value: str 

67 count: int 

68 display_name: str 

69 description: str | None = None 

70 metadata: dict[str, Any] = field(default_factory=dict) 

71 

72 def __str__(self) -> str: 

73 return f"{self.display_name} ({self.count})" 

74 

75 

@dataclass
class Facet:
    """A named group of facet values plus the settings used to present them."""

    facet_type: FacetType
    name: str
    display_name: str
    values: list[FacetValue]
    description: str | None = None
    is_multi_select: bool = True
    is_hierarchical: bool = False
    sort_by: str = "count"  # "count", "name", "relevance"
    max_visible: int = 10

    def get_top_values(self, limit: int | None = None) -> list[FacetValue]:
        """Return the leading facet values according to ``sort_by``.

        Args:
            limit: Maximum number of values to return. When ``None`` the
                facet's ``max_visible`` setting is used.

        Returns:
            A new list; ``self.values`` is not modified.
        """
        cap = self.max_visible if limit is None else limit

        if self.sort_by == "name":
            # Alphabetical, case-insensitive.
            ranked = sorted(self.values, key=lambda item: item.display_name.lower())
        else:
            # "count", and "relevance" (which currently falls back to count):
            # highest count first.
            ranked = sorted(self.values, key=lambda item: item.count, reverse=True)

        return ranked[:cap]

101 

102 

@dataclass
class FacetFilter:
    """A filter applied to search results based on facet selections.

    Attributes:
        facet_type: The facet this filter constrains.
        values: The selected facet values.
        operator: ``"OR"`` keeps a result that carries any selected value;
            anything else (conventionally ``"AND"``) requires all of them.
            The comparison is case-insensitive.
    """

    facet_type: FacetType
    values: list[str]
    operator: str = "OR"  # "OR", "AND"

    def matches(self, result: HybridSearchResult) -> bool:
        """Check if a search result matches this facet filter."""
        result_values = self._extract_values_from_result(result)

        # Normalise case so that "or" is not silently treated as AND.
        if self.operator.upper() == "OR":
            return any(value in result_values for value in self.values)
        return all(value in result_values for value in self.values)

    @staticmethod
    def _normalized_texts(items: list[Any]) -> list[str]:
        """Lower-case and strip the text of dict (``"text"`` key) or str items."""
        texts = []
        for item in items:
            if isinstance(item, dict) and "text" in item:
                texts.append(item["text"].lower().strip())
            elif isinstance(item, str):
                texts.append(item.lower().strip())
        return texts

    def _extract_values_from_result(self, result: HybridSearchResult) -> list[str]:
        """Extract values for this facet type from a search result.

        The extraction mirrors ``DynamicFacetGenerator._extract_facet_values``
        so that any value offered as a facet can actually be matched by a
        filter. Unlike the generator, no minimum-length filtering is applied
        to entities/topics/phrases: a filter should match whatever value the
        caller selected.

        Returns:
            The facet values present on ``result``; empty when the result has
            none or the facet type is not supported.
        """
        facet_type = self.facet_type

        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            # Both facets are backed by the result's source_type.
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            features = []
            if result.has_code_blocks:
                features.append("code")
            if result.has_tables:
                features.append("tables")
            if result.has_images:
                features.append("images")
            if result.has_links:
                features.append("links")
            if result.is_attachment:
                features.append("attachment")
            return features

        if facet_type == FacetType.PROJECT:
            # The generated PROJECT facet also lists the collection name when
            # it differs from the project name, so it must be matchable too.
            values = []
            if result.project_name:
                values.append(result.project_name)
            if result.collection_name and result.collection_name != result.project_name:
                values.append(result.collection_name)
            return values

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            return self._normalized_texts(result.entities)

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            return self._normalized_texts(result.topics)

        if facet_type == FacetType.KEY_PHRASES:
            return self._normalized_texts(result.key_phrases)

        if facet_type == FacetType.HIERARCHY_DEPTH:
            if result.depth is None:
                return []
            if result.depth <= 2:
                return ["shallow"]
            if result.depth <= 4:
                return ["medium"]
            return ["deep"]

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            if result.estimated_read_time is None:
                return []
            if result.estimated_read_time <= 2:
                return ["quick"]
            if result.estimated_read_time <= 10:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.WORD_COUNT:
            if result.word_count is None:
                return []
            if result.word_count <= 100:
                return ["short"]
            if result.word_count <= 500:
                return ["medium"]
            return ["long"]

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        # Facet types without an extraction rule (e.g. COLLECTION, FILE_SIZE).
        return []

178 

179 

@dataclass
class FacetedSearchResults:
    """Bundle of filtered results, the generated facets and filter bookkeeping."""

    # Results that remain after the applied filters.
    results: list[HybridSearchResult]
    # Facets available for further refinement.
    facets: list[Facet]
    # Filters that produced ``results``.
    applied_filters: list[FacetFilter]
    # Size of the result set before filtering.
    total_results: int
    # Size of the result set after filtering.
    filtered_count: int
    # Time spent producing this object, in milliseconds.
    generation_time_ms: float

    def get_facet(self, facet_type: FacetType) -> Facet | None:
        """Return the first facet of the given type, or ``None`` if absent."""
        for candidate in self.facets:
            if candidate.facet_type == facet_type:
                return candidate
        return None

    def has_active_filters(self) -> bool:
        """Report whether at least one filter is currently applied."""
        return len(self.applied_filters) != 0

198 

199 

class DynamicFacetGenerator:
    """
    Dynamic Facet Generator

    Analyzes HybridSearchResult metadata to dynamically generate relevant facets
    for filtering and exploration.

    Only the facet types listed in ``self.facet_config`` are generated. Each
    entry provides the facet's display name, description, the maximum number
    of values kept (``max_values``) and the minimum count a value needs in
    order to be kept (``min_count``).
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Configuration for facet generation. Insertion order matters: it is
        # the tie-break order for facets that share a sort key.
        self.facet_config = {
            FacetType.CONTENT_TYPE: {
                "display_name": "Content Type",
                "description": "Type of content source",
                "max_values": 10,
                "min_count": 1,
            },
            FacetType.HAS_FEATURES: {
                "display_name": "Content Features",
                "description": "Features present in the content",
                "max_values": 8,
                "min_count": 1,
            },
            FacetType.PROJECT: {
                "display_name": "Project",
                "description": "Project or workspace",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.REPOSITORY: {
                "display_name": "Repository",
                "description": "Source repository or code host",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.ENTITIES: {
                "display_name": "Entities",
                "description": "Named entities found in content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.ENTITY_TYPES: {
                "display_name": "Entity Types",
                "description": "Types of named entities",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.TOPICS: {
                "display_name": "Topics",
                "description": "Topics and themes",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.KEY_PHRASES: {
                "display_name": "Key Phrases",
                "description": "Key phrases extracted from content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.HIERARCHY_DEPTH: {
                "display_name": "Content Depth",
                "description": "Hierarchical depth in document structure",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.READ_TIME: {
                "display_name": "Reading Time",
                "description": "Estimated time to read",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.FILE_TYPE: {
                "display_name": "File Type",
                "description": "Original file type or format",
                "max_values": 10,
                "min_count": 1,
            },
        }

    def generate_facets(self, search_results: list[HybridSearchResult]) -> list[Facet]:
        """
        Generate dynamic facets from search results metadata.

        Args:
            search_results: List of search results to analyze

        Returns:
            List of generated facets with counts, ordered by priority.
            Empty when ``search_results`` is empty.
        """
        start_time = datetime.now()

        if not search_results:
            return []

        facets = []

        # Generate each configured facet type, dropping facets with no values
        for facet_type, config in self.facet_config.items():
            facet = self._generate_facet(facet_type, search_results, config)
            if facet and len(facet.values) > 0:
                facets.append(facet)

        # Sort facets by priority (most useful first)
        facets = self._sort_facets_by_priority(facets, search_results)

        generation_time = (datetime.now() - start_time).total_seconds() * 1000
        # Lazy %-formatting: the message is only built when DEBUG is enabled.
        self.logger.debug(
            "Generated %d facets in %.2fms", len(facets), generation_time
        )

        return facets

    def _generate_facet(
        self,
        facet_type: FacetType,
        search_results: list[HybridSearchResult],
        config: dict[str, Any],
    ) -> Facet | None:
        """Generate a specific facet from search results.

        Each facet value's count is the number of results carrying that
        value: a value repeated within one result is counted once, so the
        count agrees with how many results a filter on that value can match.

        Returns:
            The facet, or ``None`` when no value reaches ``min_count``.
        """
        value_counts = Counter()

        for result in search_results:
            values = self._extract_facet_values(result, facet_type)
            # Drop empty values, then de-duplicate per result while keeping
            # first-seen order (dict.fromkeys) so tie ordering stays stable.
            for value in dict.fromkeys(v for v in values if v):
                value_counts[value] += 1

        # Filter by minimum count
        min_count = config.get("min_count", 1)
        filtered_counts = {k: v for k, v in value_counts.items() if v >= min_count}

        if not filtered_counts:
            return None

        # Create facet values
        facet_values = [
            FacetValue(
                value=value,
                count=count,
                display_name=self._get_display_name(facet_type, value),
                description=self._get_value_description(facet_type, value),
            )
            for value, count in filtered_counts.items()
        ]

        # Keep only the most frequent values (stable sort: ties keep
        # first-seen order)
        max_values = config.get("max_values", 10)
        facet_values = sorted(facet_values, key=lambda v: v.count, reverse=True)[
            :max_values
        ]

        return Facet(
            facet_type=facet_type,
            name=facet_type.value,
            display_name=config["display_name"],
            description=config.get("description"),
            values=facet_values,
            sort_by="count",
        )

    @staticmethod
    def _normalized_texts(items: list[Any], min_length: int) -> list[str]:
        """Lower-case/strip dict (``"text"`` key) or str items, dropping short ones."""
        texts = []
        for item in items:
            if isinstance(item, dict) and "text" in item:
                texts.append(item["text"].lower().strip())
            elif isinstance(item, str):
                texts.append(item.lower().strip())
        return [text for text in texts if len(text) >= min_length]

    @staticmethod
    def _bucket(amount: Any, low_max: int, mid_max: int, labels: tuple) -> list[str]:
        """Map ``amount`` to one of three labels; empty list when it is None."""
        if amount is None:
            return []
        if amount <= low_max:
            return [labels[0]]
        if amount <= mid_max:
            return [labels[1]]
        return [labels[2]]

    def _extract_facet_values(
        self, result: HybridSearchResult, facet_type: FacetType
    ) -> list[str]:
        """Extract values for a specific facet type from a search result.

        Returns:
            The raw facet values found on ``result`` (possibly with
            duplicates); empty for facet types without an extraction rule.
        """
        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            # Both facets are backed by the result's source_type.
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            features = []
            if result.has_code_blocks:
                features.append("code")
            if result.has_tables:
                features.append("tables")
            if result.has_images:
                features.append("images")
            if result.has_links:
                features.append("links")
            if result.is_attachment:
                features.append("attachment")
            return features

        if facet_type == FacetType.PROJECT:
            values = []
            if result.project_name:
                values.append(result.project_name)
            if result.collection_name and result.collection_name != result.project_name:
                values.append(result.collection_name)
            return values

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            # Keep entities of at least 2 characters
            return self._normalized_texts(result.entities, 2)

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            # Keep topics longer than 2 characters
            return self._normalized_texts(result.topics, 3)

        if facet_type == FacetType.KEY_PHRASES:
            # Keep phrases longer than 3 characters
            return self._normalized_texts(result.key_phrases, 4)

        if facet_type == FacetType.HIERARCHY_DEPTH:
            return self._bucket(result.depth, 2, 4, ("shallow", "medium", "deep"))

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            return self._bucket(
                result.estimated_read_time, 2, 10, ("quick", "medium", "long")
            )

        if facet_type == FacetType.WORD_COUNT:
            return self._bucket(
                result.word_count, 100, 500, ("short", "medium", "long")
            )

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        return []

    def _get_display_name(self, facet_type: FacetType, value: str) -> str:
        """Get a human-readable display name for a facet value."""

        # Custom display names for specific facet types; unknown values fall
        # back to title case.
        if facet_type == FacetType.HAS_FEATURES:
            feature_names = {
                "code": "Code Blocks",
                "tables": "Tables",
                "images": "Images",
                "links": "Links",
                "attachment": "Attachments",
            }
            return feature_names.get(value, value.title())

        elif facet_type == FacetType.HIERARCHY_DEPTH:
            depth_names = {
                "shallow": "Shallow (1-2 levels)",
                "medium": "Medium (3-4 levels)",
                "deep": "Deep (5+ levels)",
            }
            return depth_names.get(value, value.title())

        elif facet_type == FacetType.READ_TIME:
            time_names = {
                "quick": "Quick Read (≤2 min)",
                "medium": "Medium Read (3-10 min)",
                "long": "Long Read (10+ min)",
            }
            return time_names.get(value, value.title())

        elif facet_type == FacetType.WORD_COUNT:
            count_names = {
                "short": "Short (≤100 words)",
                "medium": "Medium (101-500 words)",
                "long": "Long (500+ words)",
            }
            return count_names.get(value, value.title())

        # Default: underscores become spaces, words are title-cased
        return value.replace("_", " ").title()

    def _get_value_description(self, facet_type: FacetType, value: str) -> str | None:
        """Get a description for a facet value, or ``None`` when there is none."""

        if facet_type == FacetType.HAS_FEATURES:
            descriptions = {
                "code": "Contains code blocks or snippets",
                "tables": "Contains structured data tables",
                "images": "Contains images or diagrams",
                "links": "Contains hyperlinks",
                "attachment": "File attachments",
            }
            return descriptions.get(value)

        return None

    def _sort_facets_by_priority(
        self, facets: list[Facet], search_results: list[HybridSearchResult]
    ) -> list[Facet]:
        """Sort facets by priority/usefulness for the current result set.

        Facet types missing from the priority list sort last. Within the same
        priority, facets with more values come first. ``search_results`` is
        currently not consulted.
        """

        # Priority order - most useful facets first
        priority_order = [
            FacetType.CONTENT_TYPE,
            FacetType.PROJECT,
            FacetType.HAS_FEATURES,
            FacetType.ENTITIES,
            FacetType.TOPICS,
            FacetType.READ_TIME,
            FacetType.HIERARCHY_DEPTH,
            FacetType.FILE_TYPE,
            FacetType.SECTION_TYPE,
        ]

        priority_map = {facet_type: i for i, facet_type in enumerate(priority_order)}

        def facet_sort_key(facet: Facet) -> tuple[int, int]:
            priority = priority_map.get(facet.facet_type, 999)
            # Negative so that larger facets sort first within a priority
            return (priority, -len(facet.values))

        return sorted(facets, key=facet_sort_key)

578 

579 

class FacetedSearchEngine:
    """
    Faceted Search Engine

    Adds a faceting layer on top of a list of search results: generates
    facets, applies facet filters and proposes further refinements.
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.facet_generator = DynamicFacetGenerator()

    def apply_facet_filters(
        self, results: list[HybridSearchResult], filters: list[FacetFilter]
    ) -> list[HybridSearchResult]:
        """
        Keep only the results accepted by every filter.

        Args:
            results: Original search results
            filters: Facet filters to apply; different filters are combined
                with AND logic

        Returns:
            ``results`` itself when there are no filters, otherwise a new list
            with the matching results in their original order
        """
        if not filters:
            return results

        # all() stops at the first rejecting filter, like an early break.
        return [
            candidate
            for candidate in results
            if all(active.matches(candidate) for active in filters)
        ]

    def generate_faceted_results(
        self,
        results: list[HybridSearchResult],
        applied_filters: list[FacetFilter] | None = None,
    ) -> FacetedSearchResults:
        """
        Build a FacetedSearchResults for the given results and filters.

        Args:
            results: Original search results
            applied_filters: Currently applied filters (``None`` means none)

        Returns:
            FacetedSearchResults holding the filtered results, the facets and
            the counts before and after filtering
        """
        started_at = datetime.now()

        active_filters = applied_filters or []
        kept = self.apply_facet_filters(results, active_filters)

        # Facets are computed from the unfiltered results so that every
        # available filter option stays visible.
        available_facets = self.facet_generator.generate_facets(results)

        elapsed_ms = (datetime.now() - started_at).total_seconds() * 1000

        return FacetedSearchResults(
            results=kept,
            facets=available_facets,
            applied_filters=active_filters,
            total_results=len(results),
            filtered_count=len(kept),
            generation_time_ms=elapsed_ms,
        )

    def create_filter_from_selection(
        self, facet_type: FacetType, selected_values: list[str], operator: str = "OR"
    ) -> FacetFilter:
        """Wrap a user's facet selection in a FacetFilter."""
        new_filter = FacetFilter(
            facet_type=facet_type, values=selected_values, operator=operator
        )
        return new_filter

    def suggest_refinements(
        self,
        current_results: list[HybridSearchResult],
        current_filters: list[FacetFilter],
    ) -> list[dict[str, Any]]:
        """
        Propose facet values that would meaningfully narrow the current results.

        For each facet that is not already filtered, the top three values are
        tried as single-value filters. A value is suggested when it keeps at
        least one result and fewer than 80% of the current results.

        Returns:
            Up to five suggestions, largest reduction first
        """
        total = len(current_results)
        upper_bound = total * 0.8  # a suggestion must remove more than 20%
        proposals = []

        for facet in self.facet_generator.generate_facets(current_results):
            # Facets that already have a filter are not suggested again.
            already_filtered = any(
                active.facet_type == facet.facet_type for active in current_filters
            )
            if already_filtered:
                continue

            for candidate in facet.get_top_values(3):
                # Measure the impact by actually applying a one-value filter.
                probe = FacetFilter(facet.facet_type, [candidate.value])
                remaining = len(self.apply_facet_filters(current_results, [probe]))

                if not (0 < remaining < upper_bound):
                    continue

                proposals.append(
                    {
                        "facet_type": facet.facet_type.value,
                        "facet_display_name": facet.display_name,
                        "value": candidate.value,
                        "display_name": candidate.display_name,
                        "current_count": total,
                        "filtered_count": remaining,
                        "reduction_percent": round((1 - remaining / total) * 100),
                    }
                )

        # Highest reduction first; the sort is stable for equal reductions.
        proposals.sort(key=lambda item: item["reduction_percent"], reverse=True)

        return proposals[:5]