Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/document_lookup.py: 68%

87 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3import re 

4from typing import Any 

5 

6from ...components.search_result_models import HybridSearchResult 

7 

8 

def build_document_lookup(
    documents: list[HybridSearchResult],
    robust: bool = False,
    logger: Any | None = None,
) -> dict[str, HybridSearchResult]:
    """Index `HybridSearchResult` documents under several alternative keys.

    Each document is registered, in this order, under:

    1. the verbatim composite ``source_type:source_title`` (when both parts
       are available),
    2. its stripped ``document_id`` (when truthy),
    3. its stripped ``source_title`` (when truthy),
    4. in robust mode only, a whitespace-stripped composite key.

    The first document registered under a key wins; later documents that
    collide with it are ignored and, when a logger is supplied, reported via
    ``logger.warning``. Items that are not `HybridSearchResult` instances are
    skipped. All logging is best-effort: exceptions raised by the logger are
    swallowed.

    Args:
        documents: Documents to index.
        robust: When True, a missing source type falls back to ``"unknown"``
            and a missing title to ``""``, and the sanitized composite key is
            added as well.
        logger: Optional object exposing ``warning`` and ``debug``.

    Returns:
        Mapping from lookup key to document.
    """
    index: dict[str, HybridSearchResult] = {}

    def _clean(raw: Any) -> str | None:
        # Stripped string form of `raw`; None when absent, blank, or not
        # convertible to text.
        if raw is None:
            return None
        try:
            as_text = str(raw)
        except Exception:
            return None
        return as_text.strip() or None

    def _register(key: str | None, candidate: HybridSearchResult) -> None:
        # First writer wins: an occupied key is never overwritten.
        if key is None:
            return
        current = index.get(key)
        if current is None:
            index[key] = candidate
        elif current is not candidate and logger is not None:
            try:
                logger.warning(
                    "Duplicate key detected in document lookup; keeping existing",
                    extra={
                        "key": key,
                        "existing_document_id": str(
                            getattr(current, "document_id", "")
                        ),
                        "new_document_id": str(
                            getattr(candidate, "document_id", "")
                        ),
                    },
                )
            except Exception:
                pass

    # Placeholders substituted for falsy fields; None disables the fallback.
    missing_type = "unknown" if robust else None
    missing_title = "" if robust else None

    for entry in documents:
        if not isinstance(entry, HybridSearchResult):
            if logger is not None:
                try:
                    logger.warning(
                        "Skipping non-HybridSearchResult in build_document_lookup"
                    )
                except Exception:
                    pass
            continue

        kind = entry.source_type or missing_type
        title = entry.source_title or missing_title

        # 1) Composite key, kept verbatim (inner whitespace is preserved).
        if kind is not None and title is not None:
            _register(f"{kind}:{title}", entry)

        # 2) Document identifier.
        identifier = getattr(entry, "document_id", None)
        if identifier:
            _register(_clean(identifier), entry)

        # 3) Title on its own, as a fallback.
        if title:
            _register(_clean(title), entry)

        # 4) Robust mode: composite key with each part stripped. Added only
        #    when the key is free; collisions here are not reported.
        if robust and isinstance(kind, str) and isinstance(title, str):
            sanitized = _clean(f"{kind.strip()}:{title.strip()}")
            if sanitized and sanitized not in index:
                index[sanitized] = entry

    if logger is not None:
        try:
            logger.debug(
                f"Built{' robust' if robust else ''} document lookup with {len(index)} keys for {len(documents)} documents"
            )
        except Exception:
            pass

    return index

100 

101 

def find_document_by_id(
    doc_id: str,
    doc_lookup: dict[str, HybridSearchResult],
    logger: Any | None = None,
) -> HybridSearchResult | None:
    """Find a document by ID using progressively looser lookup strategies.

    Strategies, in priority order:

    1. Exact key match on ``doc_id``.
    2. Exact key match on ``doc_id`` with surrounding whitespace stripped.
    3. Normalized match: case-insensitive, with runs of whitespace, ``:``,
       ``_`` and ``-`` treated as a single space and trimmed at both ends.
    4. Token match: the normalized ID occurs inside a normalized key,
       delimited by word boundaries or separators. The first matching key in
       ``doc_lookup`` iteration order wins.
    5. Title fallback: the text after the first ``:`` in ``doc_id`` is looked
       up as a key, verbatim and then with surrounding whitespace stripped.

    Strategies 3 and 4 are skipped when the ID contains nothing but
    separators (for example ``"-"``), since such an ID would otherwise match
    any multi-word key.

    Args:
        doc_id: Identifier to resolve; may be a composite ``type:title`` key.
        doc_lookup: Mapping from lookup key to document.
        logger: Optional object exposing ``debug``; logging is best-effort
            and exceptions raised by it are swallowed.

    Returns:
        The matching document, or None when ``doc_id`` is empty or no
        strategy finds a match.
    """
    if not doc_id:
        return None

    # Direct lookup
    if doc_id in doc_lookup:
        return doc_lookup[doc_id]

    # Try sanitized lookup
    sanitized_id = doc_id.strip()
    if sanitized_id in doc_lookup:
        return doc_lookup[sanitized_id]

    def _normalize_for_match(value: str) -> str:
        # Collapse separator runs to one space, then trim, so leading or
        # trailing separators ("-foo", "foo:") never leave a stray space.
        collapsed = re.sub(r"[\s:_\-]+", " ", value.lower())
        return collapsed.strip()

    normalized_query = _normalize_for_match(doc_id)

    # An ID made only of separators normalizes to "" and carries no
    # identifying information, so fuzzy matching is skipped for it.
    if normalized_query:
        # Normalize every key once; both passes below reuse the result.
        normalized_entries = [
            (_normalize_for_match(lookup_key), lookup_key, doc)
            for lookup_key, doc in doc_lookup.items()
        ]

        # Exact match on normalized keys takes priority over partial matches.
        for normalized_key, _, doc in normalized_entries:
            if normalized_key == normalized_query:
                return doc

        # Delimiter/word-boundary aware partial matching: the query must
        # appear as a whole token inside the key.
        token_pattern = re.compile(
            rf"(^|\b|[\s:_\-]){re.escape(normalized_query)}($|\b|[\s:_\-])"
        )
        for normalized_key, lookup_key, doc in normalized_entries:
            if token_pattern.search(normalized_key):
                if logger is not None:
                    try:
                        logger.debug(
                            f"Found document via boundary-aware match: {doc_id} -> {lookup_key}"
                        )
                    except Exception:
                        pass
                return doc

    # Try by source title extraction (handle composite keys). The stripped
    # form covers IDs written as "type: title".
    if ":" in doc_id:
        title_part = doc_id.partition(":")[2]
        for candidate in (title_part, title_part.strip()):
            if candidate in doc_lookup:
                return doc_lookup[candidate]

    return None

160 

161 return None