Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/document_lookup.py

1from __future__ import annotations

3import re

4from typing import Any

6from ...components.search_result_models import HybridSearchResult

def build_document_lookup(
    documents: list[HybridSearchResult],
    robust: bool = False,
    logger: Any | None = None,
) -> dict[str, HybridSearchResult]:
    """Build a multi-key lookup for `HybridSearchResult` documents.

    Each document is registered under up to four keys, in this order:

    1. the verbatim composite ``source_type:source_title``;
    2. its ``document_id`` (stringified and stripped), when truthy;
    3. its ``source_title`` (stripped), when truthy;
    4. in robust mode only, a composite built from the stripped parts.

    An existing key is never overwritten: the first document registered under
    a key wins, and a warning is logged for conflicts on keys 1-3.

    Args:
        documents: Documents to index. ``None`` is treated as an empty list.
            Items that are not ``HybridSearchResult`` instances are skipped.
        robust: When True, a falsy ``source_type`` becomes ``"unknown"`` and a
            falsy ``source_title`` becomes ``""`` so the composite key is still
            produced. When False, a falsy part suppresses the composite key.
        logger: Optional logger-like object exposing ``warning`` and ``debug``.
            Logging is best-effort; failures inside the logger are ignored.

    Returns:
        Mapping from lookup key to the first document registered under it.
    """
    lookup: dict[str, HybridSearchResult] = {}
    # Treat a missing collection like an empty one instead of raising TypeError.
    items = documents if documents is not None else []

    def _normalize_key(value: Any) -> str | None:
        """Return ``str(value)`` stripped, or None if it is missing or blank."""
        if value is None:
            return None
        try:
            text = str(value)
        except Exception:
            # An object with a broken __str__ simply yields no key.
            return None
        text = text.strip()
        return text if text else None

    def _set_key(key: str | None, new_doc: HybridSearchResult) -> None:
        """Register ``new_doc`` under ``key`` unless the key is already taken."""
        if key is None:
            return
        existing = lookup.get(key)
        if existing is None:
            lookup[key] = new_doc
            return
        # Prevent silent overwrite: keep existing, warn about conflict.
        # Re-registering the very same object is not a conflict.
        if existing is not new_doc and logger is not None:
            try:
                logger.warning(
                    "Duplicate key detected in document lookup; keeping existing",
                    extra={
                        "key": key,
                        "existing_document_id": str(
                            getattr(existing, "document_id", "")
                        ),
                        "new_document_id": str(getattr(new_doc, "document_id", "")),
                    },
                )
            except Exception:
                # Logging is best-effort and must not abort the build.
                pass

    for doc in items:
        # Validate item type
        if not isinstance(doc, HybridSearchResult):
            if logger is not None:
                try:
                    logger.warning(
                        "Skipping non-HybridSearchResult in build_document_lookup"
                    )
                except Exception:
                    pass
            continue

        # In robust mode falsy parts get placeholders; otherwise they become
        # None, which suppresses the composite key below.
        source_type = doc.source_type or ("unknown" if robust else None)
        source_title = doc.source_title or ("" if robust else None)

        # Primary lookup by composite key when both parts are present
        if source_type is not None and source_title is not None:
            composite_key = f"{source_type}:{source_title}"
            # Preserve original composite key verbatim (may include spaces)
            _set_key(composite_key, doc)

        # Secondary lookup by document_id if available
        if getattr(doc, "document_id", None):
            _set_key(_normalize_key(doc.document_id), doc)

        # Tertiary lookup by source_title only (fallback)
        if source_title:
            _set_key(_normalize_key(source_title), doc)

        # Quaternary lookup: sanitized composite key (robust mode only).
        # Added silently and only when the key is free, since it usually
        # duplicates the verbatim composite key.
        if robust and isinstance(source_type, str) and isinstance(source_title, str):
            sanitized_key = f"{source_type.strip()}:{source_title.strip()}"
            normalized_sanitized = _normalize_key(sanitized_key)
            if normalized_sanitized and normalized_sanitized not in lookup:
                lookup[normalized_sanitized] = doc

    if logger is not None:
        try:
            logger.debug(
                f"Built{' robust' if robust else ''} document lookup with {len(lookup)} keys for {len(items)} documents"
            )
        except Exception:
            pass

    return lookup

100

101

def find_document_by_id(
    doc_id: str,
    doc_lookup: dict[str, HybridSearchResult],
    logger: Any | None = None,
) -> HybridSearchResult | None:
    """Find a document by ID using progressively looser lookup strategies.

    Strategies are tried in order and the first hit is returned:

    1. ``doc_id`` verbatim;
    2. ``doc_id`` with surrounding whitespace stripped;
    3. exact match after normalization (lower-cased, runs of whitespace,
       ``:``, ``_`` and ``-`` collapsed to one space, edges trimmed);
    4. the normalized ``doc_id`` occurring as a whole token inside a
       normalized lookup key;
    5. for composite IDs (``type:title``), the text after the first ``:``,
       tried verbatim and then stripped.

    Args:
        doc_id: Identifier to resolve. A falsy value returns None.
        doc_lookup: Mapping of lookup key to document.
        logger: Optional logger-like object exposing ``debug``. Logging is
            best-effort; failures inside the logger are ignored.

    Returns:
        The matching document, or None when no strategy matches.
    """
    if not doc_id:
        return None

    # Direct lookup
    if doc_id in doc_lookup:
        return doc_lookup[doc_id]

    # Try sanitized lookup
    sanitized_id = doc_id.strip()
    if sanitized_id in doc_lookup:
        return doc_lookup[sanitized_id]

    def _normalize_for_match(value: str) -> str:
        """Lower-case ``value`` and collapse separator runs to single spaces."""
        # Standardize common separators to a single space
        text = re.sub(r"[\s:_\-]+", " ", value.lower())
        # Strip after collapsing so a leading or trailing separator (for
        # example "doc-123-") does not leave an edge space that defeats the
        # comparisons below.
        return text.strip()

    normalized_query = _normalize_for_match(doc_id)

    # Normalize every key once and reuse the result for both passes.
    normalized_entries = [
        (_normalize_for_match(lookup_key), lookup_key, doc)
        for lookup_key, doc in doc_lookup.items()
    ]

    # Exact match on normalized keys
    for normalized_key, _lookup_key, doc in normalized_entries:
        if normalized_key == normalized_query:
            return doc

    # Delimiter/word-boundary aware partial matching
    # Build a regex that matches the normalized query as a whole token
    if normalized_query:
        token_pattern = re.compile(
            rf"(^|\b|[\s:_\-]){re.escape(normalized_query)}($|\b|[\s:_\-])"
        )
        for normalized_key, lookup_key, doc in normalized_entries:
            if token_pattern.search(normalized_key):
                if logger is not None:
                    try:
                        logger.debug(
                            f"Found document via boundary-aware match: {doc_id} -> {lookup_key}"
                        )
                    except Exception:
                        # Logging is best-effort and must not affect the result.
                        pass
                return doc

    # Try by source title extraction (handle composite keys)
    _prefix, separator, title_part = doc_id.partition(":")
    if separator:
        # Verbatim first, then stripped, so "type: Title" resolves to the
        # key "Title".
        for candidate in (title_part, title_part.strip()):
            if candidate in doc_lookup:
                return doc_lookup[candidate]

    return None

Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/document_lookup.py: 68%

87 statements