Coverage for src/qdrant_loader_mcp_server/search/components/models/hybrid.py: 81%

377 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3import os 

4from dataclasses import dataclass 

5from pathlib import PurePosixPath, PureWindowsPath 

6 

7from .attachment import AttachmentInfo 

8from .base import BaseSearchResult 

9from .chunking import ChunkingContext 

10from .content import ContentAnalysis 

11from .conversion import ConversionInfo 

12from .cross_reference import CrossReferenceInfo 

13from .hierarchy import HierarchyInfo 

14from .navigation import NavigationContext 

15from .project import ProjectInfo 

16from .section import SectionInfo 

17from .semantic import SemanticAnalysis 

18 

19 

@dataclass
class HybridSearchResult:
    """A search hit made of a mandatory base result plus optional facets.

    ``base`` carries the core fields (score, text, source, paths, timestamps).
    Every other field is an optional facet object; when a facet is ``None``
    the matching convenience properties fall back to a neutral default
    (``None``, ``False`` or an empty list) instead of raising.
    """

    base: BaseSearchResult
    project: ProjectInfo | None = None
    hierarchy: HierarchyInfo | None = None
    attachment: AttachmentInfo | None = None
    section: SectionInfo | None = None
    content: ContentAnalysis | None = None
    semantic: SemanticAnalysis | None = None
    navigation: NavigationContext | None = None
    chunking: ChunkingContext | None = None
    conversion: ConversionInfo | None = None
    cross_reference: CrossReferenceInfo | None = None

    # ------------------------------------------------------------------
    # Base result passthroughs: ``base`` is required, so no fallback needed.
    # ------------------------------------------------------------------
    @property
    def score(self) -> float:  # pragma: no cover - simple passthrough
        return self.base.score

    @property
    def text(self) -> str:  # pragma: no cover
        return self.base.text

    @property
    def source_type(self) -> str:  # pragma: no cover
        return self.base.source_type

    @property
    def source_title(self) -> str:  # pragma: no cover
        return self.base.source_title

    @property
    def document_id(self) -> str | None:  # pragma: no cover
        return self.base.document_id

    @property
    def source_url(self) -> str | None:
        return self.base.source_url

    @property
    def file_path(self) -> str | None:
        return self.base.file_path

    @property
    def repo_name(self) -> str | None:
        return self.base.repo_name

    @property
    def vector_score(self) -> float:
        return self.base.vector_score

    @property
    def keyword_score(self) -> float:
        return self.base.keyword_score

    @property
    def created_at(self) -> str | None:
        return self.base.created_at

    @property
    def last_modified(self) -> str | None:
        return self.base.last_modified

    # ------------------------------------------------------------------
    # Project facet: ``None`` when no project facet is attached.
    # ------------------------------------------------------------------
    @property
    def project_id(self) -> str | None:
        return self.project.project_id if self.project else None

    @property
    def project_name(self) -> str | None:
        return self.project.project_name if self.project else None

    @property
    def project_description(self) -> str | None:
        return self.project.project_description if self.project else None

    @property
    def collection_name(self) -> str | None:
        return self.project.collection_name if self.project else None

    # ------------------------------------------------------------------
    # Hierarchy facet: ``None`` when no hierarchy facet is attached.
    # ------------------------------------------------------------------
    @property
    def parent_id(self) -> str | None:
        return self.hierarchy.parent_id if self.hierarchy else None

    @property
    def parent_title(self) -> str | None:
        return self.hierarchy.parent_title if self.hierarchy else None

    @property
    def breadcrumb_text(self) -> str | None:
        return self.hierarchy.breadcrumb_text if self.hierarchy else None

    @property
    def depth(self) -> int | None:
        return self.hierarchy.depth if self.hierarchy else None

    @property
    def children_count(self) -> int | None:
        return self.hierarchy.children_count if self.hierarchy else None

    @property
    def hierarchy_context(self) -> str | None:
        return self.hierarchy.hierarchy_context if self.hierarchy else None

    # ------------------------------------------------------------------
    # Attachment facet: ``is_attachment`` defaults to False, the rest to None.
    # ------------------------------------------------------------------
    @property
    def is_attachment(self) -> bool:
        return self.attachment.is_attachment if self.attachment else False

    @property
    def parent_document_id(self) -> str | None:
        return self.attachment.parent_document_id if self.attachment else None

    @property
    def parent_document_title(self) -> str | None:
        return self.attachment.parent_document_title if self.attachment else None

    @property
    def attachment_id(self) -> str | None:
        return self.attachment.attachment_id if self.attachment else None

    @property
    def original_filename(self) -> str | None:
        return self.attachment.original_filename if self.attachment else None

    @property
    def file_size(self) -> int | None:
        return self.attachment.file_size if self.attachment else None

    @property
    def mime_type(self) -> str | None:
        return self.attachment.mime_type if self.attachment else None

    @property
    def attachment_author(self) -> str | None:
        return self.attachment.attachment_author if self.attachment else None

    @property
    def attachment_context(self) -> str | None:
        return self.attachment.attachment_context if self.attachment else None

    # ------------------------------------------------------------------
    # Section facet: ``None`` when no section facet is attached.
    # ------------------------------------------------------------------
    @property
    def section_title(self) -> str | None:
        return self.section.section_title if self.section else None

    @property
    def section_type(self) -> str | None:
        return self.section.section_type if self.section else None

    @property
    def section_level(self) -> int | None:
        return self.section.section_level if self.section else None

    @property
    def section_anchor(self) -> str | None:
        return self.section.section_anchor if self.section else None

    @property
    def section_breadcrumb(self) -> str | None:
        return self.section.section_breadcrumb if self.section else None

    @property
    def section_depth(self) -> int | None:
        return self.section.section_depth if self.section else None

    # ------------------------------------------------------------------
    # Content analysis facet: flags default to False, counters to None.
    # ------------------------------------------------------------------
    @property
    def has_code_blocks(self) -> bool:
        return self.content.has_code_blocks if self.content else False

    @property
    def has_tables(self) -> bool:
        return self.content.has_tables if self.content else False

    @property
    def has_images(self) -> bool:
        return self.content.has_images if self.content else False

    @property
    def has_links(self) -> bool:
        return self.content.has_links if self.content else False

    @property
    def word_count(self) -> int | None:
        return self.content.word_count if self.content else None

    @property
    def char_count(self) -> int | None:
        return self.content.char_count if self.content else None

    @property
    def estimated_read_time(self) -> int | None:
        return self.content.estimated_read_time if self.content else None

    @property
    def paragraph_count(self) -> int | None:
        return self.content.paragraph_count if self.content else None

    # ------------------------------------------------------------------
    # Semantic facet: a fresh empty list when no semantic facet is attached.
    # ------------------------------------------------------------------
    @property
    def entities(self) -> list[dict | str]:
        return self.semantic.entities if self.semantic else []

    @property
    def topics(self) -> list[dict | str]:
        return self.semantic.topics if self.semantic else []

    @property
    def key_phrases(self) -> list[dict | str]:
        return self.semantic.key_phrases if self.semantic else []

    @property
    def pos_tags(self) -> list[dict]:
        return self.semantic.pos_tags if self.semantic else []

    # ------------------------------------------------------------------
    # Navigation facet: scalars default to None, collections to empty lists.
    # ------------------------------------------------------------------
    @property
    def previous_section(self) -> str | None:
        return self.navigation.previous_section if self.navigation else None

    @property
    def next_section(self) -> str | None:
        return self.navigation.next_section if self.navigation else None

    @property
    def sibling_sections(self) -> list[str]:
        return self.navigation.sibling_sections if self.navigation else []

    @property
    def subsections(self) -> list[str]:
        return self.navigation.subsections if self.navigation else []

    @property
    def document_hierarchy(self) -> list[str]:
        return self.navigation.document_hierarchy if self.navigation else []

    # ------------------------------------------------------------------
    # Chunking facet: ``None`` when no chunking facet is attached.
    # ------------------------------------------------------------------
    @property
    def chunk_index(self) -> int | None:
        return self.chunking.chunk_index if self.chunking else None

    @property
    def total_chunks(self) -> int | None:
        return self.chunking.total_chunks if self.chunking else None

    @property
    def chunking_strategy(self) -> str | None:
        return self.chunking.chunking_strategy if self.chunking else None

    # ------------------------------------------------------------------
    # Conversion facet: flags default to False, the rest to None.
    # ------------------------------------------------------------------
    @property
    def original_file_type(self) -> str | None:
        return self.conversion.original_file_type if self.conversion else None

    @property
    def conversion_method(self) -> str | None:
        return self.conversion.conversion_method if self.conversion else None

    @property
    def is_excel_sheet(self) -> bool:
        return self.conversion.is_excel_sheet if self.conversion else False

    @property
    def is_converted(self) -> bool:
        return self.conversion.is_converted if self.conversion else False

    # ------------------------------------------------------------------
    # Cross-reference facet.
    # ------------------------------------------------------------------
    @property
    def cross_references(self) -> list[dict]:
        return self.cross_reference.cross_references if self.cross_reference else []

    @property
    def topic_analysis(self) -> dict | None:
        return self.cross_reference.topic_analysis if self.cross_reference else None

    @property
    def content_type_context(self) -> str | None:
        return (
            self.cross_reference.content_type_context if self.cross_reference else None
        )

    # ------------------------------------------------------------------
    # Helper methods for display/compatibility
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_path(raw: str) -> PurePosixPath | PureWindowsPath:
        """Parse ``raw`` as a pure path, guessing the flavour from separators.

        Windows parsing is used when backslashes are present and at least as
        frequent as forward slashes. Otherwise any stray backslashes are
        normalised to ``/`` and the path is parsed as POSIX. Pure paths never
        touch the filesystem, so the result is host-independent.
        """
        if "\\" in raw and raw.count("\\") >= raw.count("/"):
            return PureWindowsPath(raw)
        return PurePosixPath(raw.replace("\\", "/"))

    def get_display_title(self) -> str:
        """Return a human-readable title for this result.

        The base title is ``source_title``; when that is blank it falls back
        to the final component of ``file_path``, then ``repo_name``, then
        ``"Untitled"``. The base title is then decorated, in priority order,
        with the section breadcrumb, the Confluence breadcrumb, or the
        section title.
        """
        title = self.source_title
        if not title or not title.strip():
            file_name = ""
            if self.file_path:
                # Use the separator-aware parser rather than os.path.basename
                # so Windows-style paths resolve on any host and a trailing
                # separator does not yield an empty title.
                file_name = self._parse_path(str(self.file_path)).name
            title = file_name or self.repo_name or "Untitled"

        if self.section_breadcrumb:
            return f"{self.section_title or title} ({self.section_breadcrumb})"
        if self.breadcrumb_text and self.source_type == "confluence":
            return f"{title} ({self.breadcrumb_text})"
        if self.section_title and self.section_title != title:
            return f"{title} > {self.section_title}"
        return title

    def get_project_info(self) -> str | None:
        """Return a one-line project summary, or ``None`` without a project id."""
        if not self.project_id:
            return None
        summary = f"Project: {self.project_name or self.project_id}"
        if self.project_description:
            summary += f" - {self.project_description}"
        if self.collection_name:
            summary += f" (Collection: {self.collection_name})"
        return summary

    def get_hierarchy_info(self) -> str | None:
        """Return ``" | "``-joined hierarchy details for Confluence results.

        Returns ``None`` for any other source type or when there is nothing
        to report.
        """
        if self.source_type != "confluence":
            return None
        parts: list[str] = []
        if self.hierarchy_context:
            parts.append(self.hierarchy_context)
        if self.section_breadcrumb:
            parts.append(f"Section: {self.section_breadcrumb}")
        if self.chunk_index is not None and self.total_chunks is not None:
            # chunk_index is shown one-based.
            parts.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")
        return " | ".join(parts) if parts else None

    def get_content_info(self) -> str | None:
        """Summarise which content kinds are present, plus size hints.

        Returns ``None`` when none of the code/table/image/link flags is set,
        even if word counts are available.
        """
        labels = [
            label
            for present, label in (
                (self.has_code_blocks, "Code"),
                (self.has_tables, "Tables"),
                (self.has_images, "Images"),
                (self.has_links, "Links"),
            )
            if present
        ]
        if not labels:
            return None
        summary = f"Contains: {', '.join(labels)}"
        if self.word_count:
            summary += f" | {self.word_count} words"
        if self.estimated_read_time:
            summary += f" | ~{self.estimated_read_time}min read"
        return summary

    def get_semantic_info(self) -> str | None:
        """Summarise entities, the first three topics and key phrases.

        Returns ``None`` when all three collections are empty.
        """
        parts: list[str] = []
        if self.entities:
            parts.append(f"{len(self.entities)} entities")
        if self.topics:
            shown: list[str] = []
            for topic in self.topics[:3]:
                if isinstance(topic, dict):
                    # Coerce to str so a non-string "text" value cannot make
                    # the join below raise TypeError.
                    shown.append(str(topic.get("text", topic)))
                else:
                    shown.append(str(topic))
            topic_list = ", ".join(shown)
            if len(self.topics) > 3:
                topic_list += f" (+{len(self.topics) - 3} more)"
            parts.append(f"Topics: {topic_list}")
        if self.key_phrases:
            parts.append(f"{len(self.key_phrases)} key phrases")
        return " | ".join(parts) if parts else None

    def get_section_context(self) -> str | None:
        """Return the section title decorated with its type tag and anchor.

        The ``[TYPE]`` prefix is added only when both ``section_type`` and
        ``section_level`` are truthy. Returns ``None`` without a section title.
        """
        if not self.section_title:
            return None
        context = self.section_title
        if self.section_type and self.section_level:
            context = f"[{self.section_type.upper()}] {context}"
        if self.section_anchor:
            context += f" (#{self.section_anchor})"
        return context

    def get_attachment_info(self) -> str | None:
        """Return the attachment context for attachments, otherwise ``None``."""
        if not self.is_attachment or not self.attachment_context:
            return None
        return self.attachment_context

    def get_file_type(self) -> str | None:
        """Return the best available file-type description.

        Preference order: the original file type (annotated with the
        conversion method when the file was converted), the MIME type, then
        the lower-cased extension of the original filename. Returns ``None``
        when nothing usable is available.
        """
        if self.original_file_type:
            file_type = self.original_file_type
            if self.is_converted and self.conversion_method:
                file_type += f" (converted via {self.conversion_method})"
            return file_type
        if self.mime_type:
            return self.mime_type
        if self.original_filename:
            _, ext = os.path.splitext(self.original_filename)
            # A bare trailing dot ("notes.") yields "." and would otherwise
            # produce an empty string; report that as "unknown".
            return ext.lower().lstrip(".") or None
        return None

    def is_root_document(self) -> bool:
        """Return whether this result sits at the top of its hierarchy.

        For ``localfile`` sources the decision is based on ``file_path``: after
        dropping the drive/root anchor and a leading ``repo_name`` component,
        at most one path component may remain. A missing or blank path is not
        a root. All other sources are roots when they have neither a parent
        id nor a parent document id.
        """
        if self.source_type != "localfile":
            return self.parent_id is None and self.parent_document_id is None

        raw = self.file_path
        if not isinstance(raw, str) or not raw.strip():
            return False
        try:
            parsed = self._parse_path(raw)
        except (TypeError, ValueError):
            # Best effort: an unparseable path is treated as "not a root".
            return False

        # Drop drive/root anchors (e.g. 'C:\\', '/' or '\\\\server\\share\\').
        anchor = parsed.anchor
        names = [
            part
            for part in parsed.parts
            if part and part != anchor and part not in ("/", "\\")
        ]

        # A leading repository directory does not count towards depth.
        repo = self.repo_name or ""
        if repo and names and names[0] == repo:
            names = names[1:]

        return len(names) <= 1

    def has_children(self) -> bool:
        """Return True for a positive child count or any subsections."""
        return (self.children_count is not None and self.children_count > 0) or bool(
            self.subsections
        )

    def is_file_attachment(self) -> bool:
        """Return the ``is_attachment`` flag (compatibility alias)."""
        return self.is_attachment

    def belongs_to_project(self, project_id: str) -> bool:
        """Return True when this result's project id equals ``project_id``."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Return True when this result has a project id listed in ``project_ids``."""
        return self.project_id is not None and self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Return True for results with code blocks or a ``"code"`` section type."""
        return self.has_code_blocks or self.section_type == "code"

    def is_documentation(self) -> bool:
        """Return True for Confluence/local-file results without code blocks."""
        return (
            self.source_type in ["confluence", "localfile"] and not self.has_code_blocks
        )

    def is_structured_data(self) -> bool:
        """Return True for results containing tables or derived from an Excel sheet."""
        return self.has_tables or self.is_excel_sheet

475 

def create_hybrid_search_result(
    score: float,
    text: str,
    source_type: str,
    source_title: str,
    vector_score: float = 0.0,
    keyword_score: float = 0.0,
    **kwargs,
) -> HybridSearchResult:
    """Build a ``HybridSearchResult`` from flat keyword arguments.

    The positional arguments and the base-level keys in ``kwargs``
    (``source_url``, ``file_path``, ``repo_name``, ``document_id``,
    ``created_at``, ``last_modified``) always populate the base result.
    Each optional facet is created only when at least one of its field names
    is present in ``kwargs``; otherwise the facet stays ``None``.

    Args:
        score: Combined relevance score.
        text: Matched text.
        source_type: Source identifier (for example ``"confluence"``).
        source_title: Title of the source document.
        vector_score: Vector-similarity component of the score.
        keyword_score: Keyword-match component of the score.
        **kwargs: Flat facet fields; unrecognised keys are ignored.

    Returns:
        The assembled ``HybridSearchResult``.
    """

    def supplied(*names: str) -> bool:
        """Return True when any of ``names`` was passed by the caller."""
        return any(name in kwargs for name in names)

    def flag(name: str):
        """Fetch a boolean field, treating a missing or ``None`` value as False."""
        value = kwargs.get(name)
        return False if value is None else value

    def items(name: str):
        """Fetch a list field, treating a missing or ``None`` value as empty."""
        value = kwargs.get(name)
        return [] if value is None else value

    base = BaseSearchResult(
        score=score,
        text=text,
        source_type=source_type,
        source_title=source_title,
        source_url=kwargs.get("source_url"),
        file_path=kwargs.get("file_path"),
        repo_name=kwargs.get("repo_name"),
        vector_score=vector_score,
        keyword_score=keyword_score,
        document_id=kwargs.get("document_id"),
        created_at=kwargs.get("created_at"),
        last_modified=kwargs.get("last_modified"),
    )

    project = None
    # ``collection_name`` is a ProjectInfo field without the ``project_``
    # prefix; check it explicitly so it is not dropped when passed alone.
    if supplied("collection_name") or any(
        key.startswith("project_") for key in kwargs
    ):
        project = ProjectInfo(
            project_id=kwargs.get("project_id"),
            project_name=kwargs.get("project_name"),
            project_description=kwargs.get("project_description"),
            collection_name=kwargs.get("collection_name"),
        )

    hierarchy = None
    if supplied(
        "parent_id",
        "parent_title",
        "breadcrumb_text",
        "depth",
        "children_count",
        "hierarchy_context",
    ):
        hierarchy = HierarchyInfo(
            parent_id=kwargs.get("parent_id"),
            parent_title=kwargs.get("parent_title"),
            breadcrumb_text=kwargs.get("breadcrumb_text"),
            depth=kwargs.get("depth"),
            children_count=kwargs.get("children_count"),
            hierarchy_context=kwargs.get("hierarchy_context"),
        )

    attachment = None
    if supplied(
        "is_attachment",
        "parent_document_id",
        "parent_document_title",
        "attachment_id",
        "original_filename",
        "file_size",
        "mime_type",
        "attachment_author",
        "attachment_context",
    ):
        attachment = AttachmentInfo(
            is_attachment=flag("is_attachment"),
            parent_document_id=kwargs.get("parent_document_id"),
            parent_document_title=kwargs.get("parent_document_title"),
            attachment_id=kwargs.get("attachment_id"),
            original_filename=kwargs.get("original_filename"),
            file_size=kwargs.get("file_size"),
            mime_type=kwargs.get("mime_type"),
            attachment_author=kwargs.get("attachment_author"),
            attachment_context=kwargs.get("attachment_context"),
        )

    section = None
    if supplied(
        "section_title",
        "section_type",
        "section_level",
        "section_anchor",
        "section_breadcrumb",
        "section_depth",
    ):
        section = SectionInfo(
            section_title=kwargs.get("section_title"),
            section_type=kwargs.get("section_type"),
            section_level=kwargs.get("section_level"),
            section_anchor=kwargs.get("section_anchor"),
            section_breadcrumb=kwargs.get("section_breadcrumb"),
            section_depth=kwargs.get("section_depth"),
        )

    content = None
    if supplied(
        "has_code_blocks",
        "has_tables",
        "has_images",
        "has_links",
        "word_count",
        "char_count",
        "estimated_read_time",
        "paragraph_count",
    ):
        content = ContentAnalysis(
            has_code_blocks=flag("has_code_blocks"),
            has_tables=flag("has_tables"),
            has_images=flag("has_images"),
            has_links=flag("has_links"),
            word_count=kwargs.get("word_count"),
            char_count=kwargs.get("char_count"),
            estimated_read_time=kwargs.get("estimated_read_time"),
            paragraph_count=kwargs.get("paragraph_count"),
        )

    semantic = None
    if supplied("entities", "topics", "key_phrases", "pos_tags"):
        semantic = SemanticAnalysis(
            entities=items("entities"),
            topics=items("topics"),
            key_phrases=items("key_phrases"),
            pos_tags=items("pos_tags"),
        )

    navigation = None
    if supplied(
        "previous_section",
        "next_section",
        "sibling_sections",
        "subsections",
        "document_hierarchy",
    ):
        navigation = NavigationContext(
            previous_section=kwargs.get("previous_section"),
            next_section=kwargs.get("next_section"),
            sibling_sections=items("sibling_sections"),
            subsections=items("subsections"),
            document_hierarchy=items("document_hierarchy"),
        )

    chunking = None
    if supplied("chunk_index", "total_chunks", "chunking_strategy"):
        chunking = ChunkingContext(
            chunk_index=kwargs.get("chunk_index"),
            total_chunks=kwargs.get("total_chunks"),
            chunking_strategy=kwargs.get("chunking_strategy"),
        )

    conversion = None
    if supplied(
        "original_file_type",
        "conversion_method",
        "is_excel_sheet",
        "is_converted",
    ):
        conversion = ConversionInfo(
            original_file_type=kwargs.get("original_file_type"),
            conversion_method=kwargs.get("conversion_method"),
            is_excel_sheet=flag("is_excel_sheet"),
            is_converted=flag("is_converted"),
        )

    cross_reference = None
    if supplied("cross_references", "topic_analysis", "content_type_context"):
        cross_reference = CrossReferenceInfo(
            cross_references=items("cross_references"),
            topic_analysis=kwargs.get("topic_analysis"),
            content_type_context=kwargs.get("content_type_context"),
        )

    return HybridSearchResult(
        base=base,
        project=project,
        hierarchy=hierarchy,
        attachment=attachment,
        section=section,
        content=content,
        semantic=semantic,
        navigation=navigation,
        chunking=chunking,
        conversion=conversion,
        cross_reference=cross_reference,
    )