Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/document_lookup.py: 68%
87 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1from __future__ import annotations
3import re
4from typing import Any
6from ...components.search_result_models import HybridSearchResult
def build_document_lookup(
    documents: list[HybridSearchResult],
    robust: bool = False,
    logger: Any | None = None,
) -> dict[str, HybridSearchResult]:
    """Index `HybridSearchResult` documents under several alternative keys.

    Each document is registered, in this order, under:

    1. the composite key ``source_type:source_title`` (verbatim),
    2. its stripped ``document_id`` when the attribute is truthy,
    3. its stripped ``source_title`` when non-empty,
    4. (robust mode only) a composite key built from the stripped parts.

    The first document registered under a key wins; later documents that
    collide on a key are not stored under it, and for keys 1-3 a warning is
    emitted when a logger is supplied. With ``robust=True`` a falsy
    ``source_type`` falls back to ``"unknown"`` and a falsy ``source_title``
    falls back to ``""``. Items that are not `HybridSearchResult` instances
    are skipped. Every logger call is best-effort: logging failures are
    swallowed.
    """
    lookup: dict[str, HybridSearchResult] = {}

    # Fallbacks applied to falsy source fields; None disables the composite key.
    fallback_type = "unknown" if robust else None
    fallback_title = "" if robust else None

    def _clean(raw: Any) -> str | None:
        """Return ``str(raw)`` stripped, or None when unusable or empty."""
        if raw is None:
            return None
        try:
            stripped = str(raw).strip()
        except Exception:
            return None
        return stripped or None

    def _register(key: str | None, candidate: HybridSearchResult) -> None:
        """Store `candidate` under `key` unless the key is already taken."""
        if key is None:
            return
        holder = lookup.get(key)
        if holder is None:
            lookup[key] = candidate
        elif holder is not candidate and logger is not None:
            # Never overwrite silently: the first entry stays, the clash is reported.
            try:
                logger.warning(
                    "Duplicate key detected in document lookup; keeping existing",
                    extra={
                        "key": key,
                        "existing_document_id": str(
                            getattr(holder, "document_id", "")
                        ),
                        "new_document_id": str(
                            getattr(candidate, "document_id", "")
                        ),
                    },
                )
            except Exception:
                pass

    for item in documents:
        if not isinstance(item, HybridSearchResult):
            # Foreign objects are ignored rather than aborting the whole build.
            if logger is not None:
                try:
                    logger.warning(
                        "Skipping non-HybridSearchResult in build_document_lookup"
                    )
                except Exception:
                    pass
            continue

        kind = item.source_type or fallback_type
        title = item.source_title or fallback_title

        # 1) Composite key, kept verbatim (may contain surrounding spaces).
        if kind is not None and title is not None:
            _register(f"{kind}:{title}", item)

        # 2) Document id, when the document carries one.
        identifier = getattr(item, "document_id", None)
        if identifier:
            _register(_clean(identifier), item)

        # 3) Bare title as a fallback key.
        if title:
            _register(_clean(title), item)

        # 4) Whitespace-trimmed composite key; added quietly, never overwriting.
        if robust and isinstance(kind, str) and isinstance(title, str):
            trimmed_key = _clean(f"{kind.strip()}:{title.strip()}")
            if trimmed_key is not None and trimmed_key not in lookup:
                lookup[trimmed_key] = item

    if logger is not None:
        try:
            logger.debug(
                f"Built{' robust' if robust else ''} document lookup with {len(lookup)} keys for {len(documents)} documents"
            )
        except Exception:
            pass

    return lookup
def find_document_by_id(
    doc_id: str,
    doc_lookup: dict[str, HybridSearchResult],
    logger: Any | None = None,
) -> HybridSearchResult | None:
    """Find a document by ID using progressively looser lookup strategies.

    Strategies are tried in order and the first hit is returned:

    1. exact key match on ``doc_id``;
    2. exact key match on ``doc_id`` with surrounding whitespace stripped;
    3. equality after normalization (lower-cased, runs of whitespace,
       ``:``, ``_`` and ``-`` collapsed to a single space);
    4. the normalized query appearing as a whole token inside a normalized
       key (first matching key in dict iteration order);
    5. for IDs containing ``:``, the text after the first colon used as a
       key, first verbatim and then with surrounding whitespace stripped.

    Args:
        doc_id: Identifier to resolve; a falsy value yields ``None``.
        doc_lookup: Mapping of lookup keys to documents.
        logger: Optional logger; only used for a best-effort debug message
            when strategy 4 succeeds.

    Returns:
        The matching document, or ``None`` when no strategy finds one.
    """
    if not doc_id:
        return None

    # 1) Direct lookup.
    if doc_id in doc_lookup:
        return doc_lookup[doc_id]

    # 2) Lookup ignoring surrounding whitespace.
    sanitized_id = doc_id.strip()
    if sanitized_id in doc_lookup:
        return doc_lookup[sanitized_id]

    def _normalize_for_match(value: str) -> str:
        """Lower-case `value` and collapse separator runs to one space."""
        return re.sub(r"[\s:_\-]+", " ", value.strip().lower())

    normalized_query = _normalize_for_match(doc_id)

    # 3) Exact match on normalized keys. Normalized forms are cached as we go
    # so the partial-match pass below does not recompute them.
    normalized_entries: list[tuple[str, str, HybridSearchResult]] = []
    for lookup_key, doc in doc_lookup.items():
        normalized_key = _normalize_for_match(lookup_key)
        if normalized_key == normalized_query:
            return doc
        normalized_entries.append((normalized_key, lookup_key, doc))

    # 4) Delimiter/word-boundary aware partial matching: the query must show
    # up as a whole token in the key, not as an arbitrary substring.
    if normalized_query:
        token_pattern = re.compile(
            rf"(^|\b|[\s:_\-]){re.escape(normalized_query)}($|\b|[\s:_\-])"
        )
        for normalized_key, lookup_key, doc in normalized_entries:
            if token_pattern.search(normalized_key):
                if logger is not None:
                    try:
                        logger.debug(
                            f"Found document via boundary-aware match: {doc_id} -> {lookup_key}"
                        )
                    except Exception:
                        # Logging is best-effort and must not break the lookup.
                        pass
                return doc

    # 5) Composite "source_type:title" IDs: fall back to the title part.
    # The stripped variant covers IDs such as "git: My Title", where the raw
    # title part carries whitespace that stored title keys do not have.
    if ":" in doc_id:
        _, _, title_part = doc_id.partition(":")
        for candidate in (title_part, title_part.strip()):
            if candidate in doc_lookup:
                return doc_lookup[candidate]

    return None