Coverage for src/qdrant_loader_mcp_server/search/enhanced/faceted

1"""

2This module provides intelligent faceted search capabilities that leverage the rich

3metadata extracted during document ingestion. It dynamically generates facets from

4HybridSearchResult metadata and provides filtering and refinement capabilities.

6Key Features:

7- Dynamic facet generation from metadata

8- Intelligent facet grouping and sorting

9- Multi-facet filtering with AND/OR logic

10- Real-time facet value counting

11- Smart facet suggestions based on query context

12"""

14import logging

15from collections import Counter

16from dataclasses import dataclass, field

17from datetime import datetime

18from enum import Enum

19from typing import Any

21from ..components.search_result_models import HybridSearchResult

23logger = logging.getLogger(__name__)

class FacetType(Enum):
    """Enumeration of the facet categories that results can be filtered by.

    Each member's value is the string identifier used for that facet
    (for example as the ``name`` of a generated facet).
    """

    # --- Facets describing the content itself ---
    CONTENT_TYPE = "content_type"
    SOURCE_TYPE = "source_type"
    FILE_TYPE = "file_type"
    HAS_FEATURES = "has_features"

    # --- Facets describing position in a document hierarchy ---
    HIERARCHY_DEPTH = "hierarchy_depth"
    SECTION_LEVEL = "section_level"
    SECTION_TYPE = "section_type"

    # --- Facets describing project / organization ownership ---
    PROJECT = "project"
    COLLECTION = "collection"
    REPOSITORY = "repository"

    # --- Facets derived from semantic analysis ---
    ENTITIES = "entities"
    ENTITY_TYPES = "entity_types"
    TOPICS = "topics"
    KEY_PHRASES = "key_phrases"

    # --- Facets describing content size ---
    READ_TIME = "read_time"
    WORD_COUNT = "word_count"
    FILE_SIZE = "file_size"

    # --- Facets describing document structure / processing ---
    ATTACHMENT_TYPE = "attachment_type"
    CONVERSION_TYPE = "conversion_type"
    CHUNKING_STRATEGY = "chunking_strategy"

62@dataclass

63class FacetValue:

64 """A single facet value with count and metadata."""

66 value: str

67 count: int

68 display_name: str

69 description: str | None = None

70 metadata: dict[str, Any] = field(default_factory=dict)

72 def __str__(self) -> str:

73 return f"{self.display_name} ({self.count})"

@dataclass
class Facet:
    """A facet: its type, its candidate values, and presentation settings."""

    facet_type: FacetType
    name: str
    display_name: str
    values: list[FacetValue]
    description: str | None = None
    is_multi_select: bool = True
    is_hierarchical: bool = False
    sort_by: str = "count"  # "count", "name", "relevance"
    # Default cap used by get_top_values() when no explicit limit is given.
    max_visible: int = 10

    def get_top_values(self, limit: int | None = None) -> list[FacetValue]:
        """Return the leading facet values ordered according to ``sort_by``.

        Args:
            limit: Maximum number of values to return. ``None`` falls back
                to ``self.max_visible``.

        Returns:
            A new list, sorted alphabetically by lower-cased display name when
            ``sort_by`` is ``"name"``, otherwise by descending count, and
            truncated to the limit.
        """
        cap = self.max_visible if limit is None else limit

        if self.sort_by == "name":
            ordered = sorted(self.values, key=lambda item: item.display_name.lower())
        else:
            # "count" and every other setting (including "relevance", which is
            # not yet distinguished from count) order by descending count.
            ordered = sorted(self.values, key=lambda item: item.count, reverse=True)

        return ordered[:cap]

101

102

@dataclass
class FacetFilter:
    """A filter applied to search results based on facet selections.

    The values extracted from a result mirror what
    ``DynamicFacetGenerator._extract_facet_values`` emits for the same facet
    type (same attributes, same lower-casing/stripping, same bucket labels),
    so that any value offered by a generated facet can actually be matched.
    The generator's minimum-length filtering of entity/topic/phrase text is
    intentionally not applied here: it only removes candidates, so skipping
    it cannot prevent a generated value from matching.
    """

    facet_type: FacetType
    values: list[str]
    operator: str = "OR"  # "OR", "AND"

    def matches(self, result: HybridSearchResult) -> bool:
        """Check if a search result matches this facet filter.

        Args:
            result: The search result to test.

        Returns:
            With operator ``"OR"``: True when at least one selected value is
            present on the result. With any other operator (treated as AND):
            True when every selected value is present.
        """
        result_values = self._extract_values_from_result(result)

        if self.operator == "OR":
            return any(value in result_values for value in self.values)
        # Anything other than "OR" is handled as "AND".
        return all(value in result_values for value in self.values)

    @staticmethod
    def _normalized_texts(items) -> list[str]:
        """Lower-case and strip text from a list of strings or ``{"text": ...}`` dicts."""
        texts = []
        for item in items or []:
            if isinstance(item, dict) and "text" in item:
                texts.append(item["text"].lower().strip())
            elif isinstance(item, str):
                texts.append(item.lower().strip())
        return texts

    @staticmethod
    def _bucket(amount, low_limit, high_limit, labels) -> list[str]:
        """Map a number onto one of three labels; ``None`` yields no label."""
        if amount is None:
            return []
        low_label, mid_label, high_label = labels
        if amount <= low_limit:
            return [low_label]
        if amount <= high_limit:
            return [mid_label]
        return [high_label]

    def _extract_values_from_result(self, result: HybridSearchResult) -> list[str]:
        """Extract values for this facet type from a search result.

        Returns an empty list when the result carries no value for the facet
        type, or when the facet type has no extraction rule.
        """
        facet_type = self.facet_type

        # CONTENT_TYPE and SOURCE_TYPE both read ``source_type``.
        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            flags = (
                ("code", result.has_code_blocks),
                ("tables", result.has_tables),
                ("images", result.has_images),
                ("links", result.has_links),
                ("attachment", result.is_attachment),
            )
            return [label for label, present in flags if present]

        if facet_type == FacetType.PROJECT:
            projects = []
            if result.project_name:
                projects.append(result.project_name)
            # The collection name is offered as a project value as well,
            # unless it merely repeats the project name.
            if result.collection_name and result.collection_name != result.project_name:
                projects.append(result.collection_name)
            return projects

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            return self._normalized_texts(result.entities)

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities or []
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            return self._normalized_texts(result.topics)

        if facet_type == FacetType.KEY_PHRASES:
            return self._normalized_texts(result.key_phrases)

        if facet_type == FacetType.HIERARCHY_DEPTH:
            return self._bucket(result.depth, 2, 4, ("shallow", "medium", "deep"))

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            return self._bucket(
                result.estimated_read_time, 2, 10, ("quick", "medium", "long")
            )

        if facet_type == FacetType.WORD_COUNT:
            return self._bucket(result.word_count, 100, 500, ("short", "medium", "long"))

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        # No extraction rule for this facet type (e.g. COLLECTION, FILE_SIZE).
        return []

178

179

@dataclass
class FacetedSearchResults:
    """Bundle of search results, their facets, and the filters in effect."""

    results: list[HybridSearchResult]
    facets: list[Facet]
    applied_filters: list[FacetFilter]
    total_results: int
    filtered_count: int
    generation_time_ms: float

    def get_facet(self, facet_type: FacetType) -> Facet | None:
        """Return the first facet whose type equals ``facet_type``, else ``None``."""
        for candidate in self.facets:
            if candidate.facet_type == facet_type:
                return candidate
        return None

    def has_active_filters(self) -> bool:
        """Return True when at least one filter is applied."""
        return bool(self.applied_filters)

198

199

class DynamicFacetGenerator:
    """
    Dynamic Facet Generator

    Analyzes HybridSearchResult metadata to dynamically generate relevant facets
    for filtering and exploration. Only facet types listed in ``facet_config``
    are generated.
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Configuration for facet generation. Per facet type:
        #   display_name / description - copied onto the generated Facet
        #   max_values - maximum number of values kept (highest counts first)
        #   min_count  - values seen in fewer results than this are dropped
        self.facet_config = {
            FacetType.CONTENT_TYPE: {
                "display_name": "Content Type",
                "description": "Type of content source",
                "max_values": 10,
                "min_count": 1,
            },
            FacetType.HAS_FEATURES: {
                "display_name": "Content Features",
                "description": "Features present in the content",
                "max_values": 8,
                "min_count": 1,
            },
            FacetType.PROJECT: {
                "display_name": "Project",
                "description": "Project or workspace",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.REPOSITORY: {
                "display_name": "Repository",
                "description": "Source repository or code host",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.ENTITIES: {
                "display_name": "Entities",
                "description": "Named entities found in content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.ENTITY_TYPES: {
                "display_name": "Entity Types",
                "description": "Types of named entities",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.TOPICS: {
                "display_name": "Topics",
                "description": "Topics and themes",
                "max_values": 15,
                "min_count": 1,
            },
            FacetType.KEY_PHRASES: {
                "display_name": "Key Phrases",
                "description": "Key phrases extracted from content",
                "max_values": 20,
                "min_count": 1,
            },
            FacetType.HIERARCHY_DEPTH: {
                "display_name": "Content Depth",
                "description": "Hierarchical depth in document structure",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.READ_TIME: {
                "display_name": "Reading Time",
                "description": "Estimated time to read",
                "max_values": 5,
                "min_count": 1,
            },
            FacetType.FILE_TYPE: {
                "display_name": "File Type",
                "description": "Original file type or format",
                "max_values": 10,
                "min_count": 1,
            },
        }

    def generate_facets(self, search_results: list[HybridSearchResult]) -> list[Facet]:
        """
        Generate dynamic facets from search results metadata.

        Args:
            search_results: List of search results to analyze

        Returns:
            List of generated facets with counts, ordered by priority.
            Empty when ``search_results`` is empty.
        """
        start_time = datetime.now()

        if not search_results:
            return []

        facets = []

        # Generate each configured facet type, keeping only non-empty facets
        for facet_type, config in self.facet_config.items():
            facet = self._generate_facet(facet_type, search_results, config)
            if facet and facet.values:
                facets.append(facet)

        # Sort facets by priority (most useful first)
        facets = self._sort_facets_by_priority(facets, search_results)

        generation_time = (datetime.now() - start_time).total_seconds() * 1000
        # Lazy %-formatting: the message is only built if DEBUG is enabled.
        self.logger.debug(
            "Generated %d facets in %.2fms", len(facets), generation_time
        )

        return facets

    def _generate_facet(
        self,
        facet_type: FacetType,
        search_results: list[HybridSearchResult],
        config: dict[str, Any],
    ) -> Facet | None:
        """Generate a specific facet from search results.

        Args:
            facet_type: The facet to build.
            search_results: Results whose metadata is scanned.
            config: Settings for this facet; reads ``display_name`` (required),
                ``description``, ``min_count`` (default 1) and ``max_values``
                (default 10).

        Returns:
            The facet, or ``None`` when no value reaches ``min_count``.
        """
        # Count, for every value, the number of results that carry it.
        value_counts: Counter = Counter()

        for result in search_results:
            values = self._extract_facet_values(result, facet_type)
            # dict.fromkeys de-duplicates while preserving first-seen order, so
            # a value repeated within one result (e.g. an entity mentioned
            # twice) contributes 1 to the count instead of inflating it.
            for value in dict.fromkeys(values):
                if value:  # Skip empty values
                    value_counts[value] += 1

        # Filter by minimum count
        min_count = config.get("min_count", 1)
        filtered_counts = {k: v for k, v in value_counts.items() if v >= min_count}

        if not filtered_counts:
            return None

        # Create facet values
        facet_values = [
            FacetValue(
                value=value,
                count=count,
                display_name=self._get_display_name(facet_type, value),
                description=self._get_value_description(facet_type, value),
            )
            for value, count in filtered_counts.items()
        ]

        # Keep only the max_values most frequent values (stable for ties)
        max_values = config.get("max_values", 10)
        facet_values = sorted(facet_values, key=lambda v: v.count, reverse=True)[
            :max_values
        ]

        return Facet(
            facet_type=facet_type,
            name=facet_type.value,
            display_name=config["display_name"],
            description=config.get("description"),
            values=facet_values,
            sort_by="count",
        )

    @staticmethod
    def _clean_texts(items, min_length: int) -> list[str]:
        """Lower-case/strip text from strings or ``{"text": ...}`` dicts.

        Items of any other shape are ignored; cleaned texts shorter than
        ``min_length`` characters are dropped.
        """
        cleaned = []
        for item in items:
            if isinstance(item, dict) and "text" in item:
                cleaned.append(item["text"].lower().strip())
            elif isinstance(item, str):
                cleaned.append(item.lower().strip())
        return [text for text in cleaned if len(text) >= min_length]

    @staticmethod
    def _bucket(amount, low_limit, high_limit, labels) -> list[str]:
        """Map a number onto one of three labels; ``None`` yields no label."""
        if amount is None:
            return []
        low_label, mid_label, high_label = labels
        if amount <= low_limit:
            return [low_label]
        if amount <= high_limit:
            return [mid_label]
        return [high_label]

    def _extract_facet_values(
        self, result: HybridSearchResult, facet_type: FacetType
    ) -> list[str]:
        """Extract values for a specific facet type from a search result.

        Returns an empty list when the result has no value for the facet type
        or the facet type has no extraction rule.
        """
        # NOTE(review): CONTENT_TYPE reads the same attribute as SOURCE_TYPE
        # (``source_type``) - confirm this is intended.
        if facet_type in (FacetType.CONTENT_TYPE, FacetType.SOURCE_TYPE):
            return [result.source_type] if result.source_type else []

        if facet_type == FacetType.FILE_TYPE:
            file_type = result.get_file_type()
            return [file_type] if file_type else []

        if facet_type == FacetType.HAS_FEATURES:
            flags = (
                ("code", result.has_code_blocks),
                ("tables", result.has_tables),
                ("images", result.has_images),
                ("links", result.has_links),
                ("attachment", result.is_attachment),
            )
            return [label for label, present in flags if present]

        if facet_type == FacetType.PROJECT:
            values = []
            if result.project_name:
                values.append(result.project_name)
            # Collection name is offered too unless it repeats the project name
            if result.collection_name and result.collection_name != result.project_name:
                values.append(result.collection_name)
            return values

        if facet_type == FacetType.REPOSITORY:
            return [result.repo_name] if result.repo_name else []

        if facet_type == FacetType.ENTITIES:
            # Filter very short entities (fewer than 2 characters)
            return self._clean_texts(result.entities, 2)

        if facet_type == FacetType.ENTITY_TYPES:
            return [
                entity["label"]
                for entity in result.entities
                if isinstance(entity, dict) and "label" in entity
            ]

        if facet_type == FacetType.TOPICS:
            # Filter short topics (2 characters or fewer)
            return self._clean_texts(result.topics, 3)

        if facet_type == FacetType.KEY_PHRASES:
            # Filter short phrases (3 characters or fewer)
            return self._clean_texts(result.key_phrases, 4)

        if facet_type == FacetType.HIERARCHY_DEPTH:
            return self._bucket(result.depth, 2, 4, ("shallow", "medium", "deep"))

        if facet_type == FacetType.SECTION_LEVEL:
            if result.section_level is not None:
                return [f"level_{result.section_level}"]
            return []

        if facet_type == FacetType.SECTION_TYPE:
            return [result.section_type] if result.section_type else []

        if facet_type == FacetType.READ_TIME:
            return self._bucket(
                result.estimated_read_time, 2, 10, ("quick", "medium", "long")
            )

        if facet_type == FacetType.WORD_COUNT:
            return self._bucket(result.word_count, 100, 500, ("short", "medium", "long"))

        if facet_type == FacetType.ATTACHMENT_TYPE:
            if result.is_attachment and result.mime_type:
                return [result.mime_type]
            return []

        if facet_type == FacetType.CONVERSION_TYPE:
            if result.is_converted and result.conversion_method:
                return [result.conversion_method]
            return []

        if facet_type == FacetType.CHUNKING_STRATEGY:
            return [result.chunking_strategy] if result.chunking_strategy else []

        return []

    def _get_display_name(self, facet_type: FacetType, value: str) -> str:
        """Get a human-readable display name for a facet value.

        Facet types with a fixed vocabulary use a lookup table and fall back
        to ``value.title()``; all other facet types replace underscores with
        spaces and title-case the value.
        """
        # Custom display names for specific facet types
        if facet_type == FacetType.HAS_FEATURES:
            feature_names = {
                "code": "Code Blocks",
                "tables": "Tables",
                "images": "Images",
                "links": "Links",
                "attachment": "Attachments",
            }
            return feature_names.get(value, value.title())

        if facet_type == FacetType.HIERARCHY_DEPTH:
            depth_names = {
                "shallow": "Shallow (1-2 levels)",
                "medium": "Medium (3-4 levels)",
                "deep": "Deep (5+ levels)",
            }
            return depth_names.get(value, value.title())

        if facet_type == FacetType.READ_TIME:
            time_names = {
                "quick": "Quick Read (≤2 min)",
                "medium": "Medium Read (3-10 min)",
                "long": "Long Read (10+ min)",
            }
            return time_names.get(value, value.title())

        if facet_type == FacetType.WORD_COUNT:
            count_names = {
                "short": "Short (≤100 words)",
                "medium": "Medium (101-500 words)",
                "long": "Long (500+ words)",
            }
            return count_names.get(value, value.title())

        # Default: underscores become spaces, then title-case
        return value.replace("_", " ").title()

    def _get_value_description(self, facet_type: FacetType, value: str) -> str | None:
        """Get a description for a facet value.

        Only HAS_FEATURES values have descriptions; everything else (and any
        unknown feature value) yields ``None``.
        """
        if facet_type == FacetType.HAS_FEATURES:
            descriptions = {
                "code": "Contains code blocks or snippets",
                "tables": "Contains structured data tables",
                "images": "Contains images or diagrams",
                "links": "Contains hyperlinks",
                "attachment": "File attachments",
            }
            return descriptions.get(value)

        return None

    def _sort_facets_by_priority(
        self, facets: list[Facet], search_results: list[HybridSearchResult]
    ) -> list[Facet]:
        """Sort facets by priority/usefulness for the current result set.

        ``search_results`` is accepted for interface compatibility but is not
        consulted by the current ordering.
        """
        # Priority order - most useful facets first
        priority_order = [
            FacetType.CONTENT_TYPE,
            FacetType.PROJECT,
            FacetType.HAS_FEATURES,
            FacetType.ENTITIES,
            FacetType.TOPICS,
            FacetType.READ_TIME,
            FacetType.HIERARCHY_DEPTH,
            FacetType.FILE_TYPE,
            FacetType.SECTION_TYPE,
        ]

        # Create priority map
        priority_map = {facet_type: i for i, facet_type in enumerate(priority_order)}

        # Sort facets by priority, then by value count
        def facet_sort_key(facet: Facet) -> tuple[int, int]:
            # Facet types absent from priority_order sort after all listed ones
            priority = priority_map.get(facet.facet_type, 999)
            value_count = len(facet.values)
            return (priority, -value_count)  # Negative for descending count

        return sorted(facets, key=facet_sort_key)

578

579

class FacetedSearchEngine:
    """
    Faceted Search Engine

    Adds a faceting layer on top of a list of search results: filtering by
    facet selections, building facets, and proposing further refinements.
    """

    def __init__(self):
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.facet_generator = DynamicFacetGenerator()

    def apply_facet_filters(
        self, results: list[HybridSearchResult], filters: list[FacetFilter]
    ) -> list[HybridSearchResult]:
        """
        Keep only the results accepted by every supplied filter.

        Args:
            results: Original search results
            filters: List of facet filters to apply

        Returns:
            ``results`` itself when ``filters`` is empty; otherwise a new list
            with the results that match all filters, in their original order.
        """
        if not filters:
            return results

        # Different facets combine with AND: one rejecting filter is enough
        # to drop the result (all() stops at the first rejection).
        return [
            candidate
            for candidate in results
            if all(active.matches(candidate) for active in filters)
        ]

    def generate_faceted_results(
        self,
        results: list[HybridSearchResult],
        applied_filters: list[FacetFilter] | None = None,
    ) -> FacetedSearchResults:
        """
        Build the full faceted view for a result set.

        Args:
            results: Original search results
            applied_filters: Currently applied filters

        Returns:
            FacetedSearchResults holding the filtered results, the facets, the
            filters used, both counts, and the elapsed time in milliseconds
        """
        started_at = datetime.now()

        active_filters = applied_filters if applied_filters else []

        # Narrow the results with whatever filters are active
        narrowed = self.apply_facet_filters(results, active_filters)

        # Facets are computed over the unfiltered results so that every
        # available filter option stays visible to the user
        available_facets = self.facet_generator.generate_facets(results)

        elapsed_ms = (datetime.now() - started_at).total_seconds() * 1000

        return FacetedSearchResults(
            results=narrowed,
            facets=available_facets,
            applied_filters=active_filters,
            total_results=len(results),
            filtered_count=len(narrowed),
            generation_time_ms=elapsed_ms,
        )

    def create_filter_from_selection(
        self, facet_type: FacetType, selected_values: list[str], operator: str = "OR"
    ) -> FacetFilter:
        """Wrap a user's facet selection in a FacetFilter."""
        return FacetFilter(
            facet_type=facet_type, values=selected_values, operator=operator
        )

    def suggest_refinements(
        self,
        current_results: list[HybridSearchResult],
        current_filters: list[FacetFilter],
    ) -> list[dict[str, Any]]:
        """
        Propose single-value facet filters that would narrow the results.

        For each facet not already filtered, its top three values are tried;
        a value is suggested when filtering by it leaves at least one result
        but fewer than 80% of the current ones.

        Returns:
            Up to five suggestions, largest reduction first
        """
        proposals = []
        total = len(current_results)

        for facet in self.facet_generator.generate_facets(current_results):
            # A facet that already has a filter is not suggested again
            already_filtered = any(
                existing.facet_type == facet.facet_type for existing in current_filters
            )
            if already_filtered:
                continue

            for candidate in facet.get_top_values(3):
                # Measure the impact by actually applying a trial filter
                trial = FacetFilter(facet.facet_type, [candidate.value])
                remaining = len(self.apply_facet_filters(current_results, [trial]))

                # Require a non-empty outcome and a 20%+ reduction
                if not (0 < remaining < total * 0.8):
                    continue

                proposals.append(
                    {
                        "facet_type": facet.facet_type.value,
                        "facet_display_name": facet.display_name,
                        "value": candidate.value,
                        "display_name": candidate.display_name,
                        "current_count": total,
                        "filtered_count": remaining,
                        "reduction_percent": round((1 - remaining / total) * 100),
                    }
                )

        # Highest reduction first; the sort is stable for equal reductions
        proposals.sort(key=lambda entry: entry["reduction_percent"], reverse=True)

        return proposals[:5]  # Top 5 suggestions

Coverage for src/qdrant_loader_mcp_server/search/enhanced/faceted_search.py: 84%

336 statements