Coverage for src/qdrant_loader_mcp_server/search/enhanced/cdi/calculators.py: 93%

95 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1""" 

2Document Similarity Calculation for Cross-Document Intelligence. 

3 

4This module implements comprehensive document similarity calculation using multiple 

5metrics including entity overlap, topic overlap, metadata similarity, content features, 

6hierarchical distance, and semantic similarity with spaCy integration. 

7""" 

8 

9from __future__ import annotations 

10 

11import time 

12import warnings 

13 

14from ....utils.logging import LoggingConfig 

15from ...models import SearchResult 

16from ...nlp.spacy_analyzer import SpaCyQueryAnalyzer 

17from .extractors.similarity_helpers import ( 

18 calculate_content_features_similarity as cdi_calc_content_features_similarity, 

19) 

20from .extractors.similarity_helpers import ( 

21 calculate_entity_overlap as cdi_calculate_entity_overlap, 

22) 

23from .extractors.similarity_helpers import ( 

24 calculate_metadata_similarity as cdi_calc_metadata_similarity, 

25) 

26from .extractors.similarity_helpers import ( 

27 calculate_semantic_similarity_spacy as cdi_calc_semantic_spacy, 

28) 

29from .extractors.similarity_helpers import ( 

30 calculate_topic_overlap as cdi_calculate_topic_overlap, 

31) 

32from .extractors.similarity_helpers import ( 

33 combine_metric_scores as cdi_combine_metric_scores, 

34) 

35from .extractors.similarity_helpers import ( 

36 get_shared_entities as cdi_get_shared_entities, 

37) 

38from .extractors.similarity_helpers import get_shared_topics as cdi_get_shared_topics 

39from .models import DocumentSimilarity, RelationshipType, SimilarityMetric 

40from .utils import extract_texts_from_mixed, hierarchical_distance_from_breadcrumbs 

41 

# Module-level logger obtained from the project's LoggingConfig for this module name.
# DocumentSimilarityCalculator additionally stores its own logger on each instance.
42logger = LoggingConfig.get_logger(__name__) 

43 

44 

class DocumentSimilarityCalculator:
    """Calculate similarity between two documents using multiple metrics.

    Most of the scoring work is delegated to the CDI helper functions imported
    at module level; this class selects which metrics to run, combines their
    scores and classifies the relationship between the two documents.
    """

    def __init__(self, spacy_analyzer: SpaCyQueryAnalyzer):
        """Initialize the similarity calculator.

        Args:
            spacy_analyzer: Analyzer handed to the semantic-similarity helper.
        """
        self.spacy_analyzer = spacy_analyzer
        self.logger = LoggingConfig.get_logger(__name__)

    @staticmethod
    def _document_id(doc: SearchResult) -> str:
        """Return the ``"<source_type>:<source_title>"`` identifier of ``doc``."""
        return f"{doc.source_type}:{doc.source_title}"

    def calculate_similarity(
        self,
        doc1: SearchResult,
        doc2: SearchResult,
        metrics: list[SimilarityMetric] | None = None,
    ) -> DocumentSimilarity:
        """Calculate comprehensive similarity between two documents.

        Args:
            doc1: First document.
            doc2: Second document.
            metrics: Metrics to evaluate. When ``None``, entity overlap, topic
                overlap, metadata similarity and content features are used.
                Metrics that match none of the known members are skipped.

        Returns:
            A ``DocumentSimilarity`` holding the combined score, the per-metric
            scores, the shared entities/topics and the relationship type.
        """
        if metrics is None:
            metrics = [
                SimilarityMetric.ENTITY_OVERLAP,
                SimilarityMetric.TOPIC_OVERLAP,
                SimilarityMetric.METADATA_SIMILARITY,
                SimilarityMetric.CONTENT_FEATURES,
            ]

        # perf_counter() is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        metric_scores: dict[SimilarityMetric, float] = {}

        # Calculate individual metric scores
        for metric in metrics:
            if metric == SimilarityMetric.ENTITY_OVERLAP:
                # Use CDI extractors directly to avoid deprecated wrappers
                metric_scores[metric] = cdi_calculate_entity_overlap(doc1, doc2)
            elif metric == SimilarityMetric.TOPIC_OVERLAP:
                # Use CDI extractors directly to avoid deprecated wrappers
                metric_scores[metric] = cdi_calculate_topic_overlap(doc1, doc2)
            elif metric == SimilarityMetric.METADATA_SIMILARITY:
                metric_scores[metric] = self._calculate_metadata_similarity(doc1, doc2)
            elif metric == SimilarityMetric.CONTENT_FEATURES:
                metric_scores[metric] = self._calculate_content_features_similarity(
                    doc1, doc2
                )
            elif metric == SimilarityMetric.HIERARCHICAL_DISTANCE:
                metric_scores[metric] = self._calculate_hierarchical_similarity(
                    doc1, doc2
                )
            elif metric == SimilarityMetric.SEMANTIC_SIMILARITY:
                metric_scores[metric] = self._calculate_semantic_similarity(doc1, doc2)

        # Calculate combined similarity score
        combined_score = self._combine_metric_scores(metric_scores)

        # Extract shared entities and topics
        shared_entities = self._get_shared_entities(doc1, doc2)
        shared_topics = self._get_shared_topics(doc1, doc2)

        # Determine relationship type
        relationship_type = self._determine_relationship_type(doc1, doc2, metric_scores)

        processing_time = (time.perf_counter() - start_time) * 1000
        self.logger.debug(
            f"Calculated similarity between documents in {processing_time:.2f}ms"
        )

        return DocumentSimilarity(
            doc1_id=self._document_id(doc1),
            doc2_id=self._document_id(doc2),
            similarity_score=combined_score,
            metric_scores=metric_scores,
            shared_entities=shared_entities,
            shared_topics=shared_topics,
            relationship_type=relationship_type,
        )

    def _calculate_entity_overlap(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate entity overlap between documents (delegates to CDI helper)."""
        return cdi_calculate_entity_overlap(doc1, doc2)

    def _calculate_topic_overlap(self, doc1: SearchResult, doc2: SearchResult) -> float:
        """Deprecated. Use CDI extractors.calculate_topic_overlap instead.

        Emits a ``DeprecationWarning`` attributed to the caller and then
        delegates to the CDI helper.

        TODO: Remove this method after 2026-01-01 once all external callers migrate.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._calculate_topic_overlap is deprecated; "
            "use CDI extractors.calculate_topic_overlap instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return cdi_calculate_topic_overlap(doc1, doc2)

    def _calculate_metadata_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate metadata similarity (delegates to CDI helper)."""
        return cdi_calc_metadata_similarity(doc1, doc2)

    def _calculate_content_features_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate content features similarity (delegates to CDI helper)."""
        return cdi_calc_content_features_similarity(doc1, doc2)

    def _calculate_hierarchical_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate hierarchical relationship similarity.

        Returns:
            ``1.0`` when one document's ``parent_id`` equals the other's
            ``"<source_type>:<source_title>"`` identifier, ``0.8`` when both
            share the same non-empty ``parent_id``, the breadcrumb helper's
            result when both documents have breadcrumb text, else ``0.0``.
        """
        # Check for direct parent-child relationship (in either direction)
        if doc1.parent_id and doc1.parent_id == self._document_id(doc2):
            return 1.0
        if doc2.parent_id and doc2.parent_id == self._document_id(doc1):
            return 1.0

        # Check for sibling relationship (same parent)
        if doc1.parent_id and doc2.parent_id and doc1.parent_id == doc2.parent_id:
            return 0.8

        # Breadcrumb-based similarity
        if doc1.breadcrumb_text and doc2.breadcrumb_text:
            return hierarchical_distance_from_breadcrumbs(
                doc1.breadcrumb_text, doc2.breadcrumb_text
            )

        return 0.0

    def _calculate_semantic_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate semantic similarity using spaCy.

        Best-effort: any exception raised by the helper is logged as a warning
        and reported as a score of ``0.0`` instead of propagating.
        """
        try:
            return cdi_calc_semantic_spacy(self.spacy_analyzer, doc1.text, doc2.text)
        except Exception as e:
            self.logger.warning(f"Failed to calculate semantic similarity: {e}")
            return 0.0

    def _combine_metric_scores(
        self, metric_scores: dict[SimilarityMetric, float]
    ) -> float:
        """Combine multiple metric scores into final similarity score (delegates to CDI helper)."""
        return cdi_combine_metric_scores(metric_scores)

    def _get_shared_entities(self, doc1: SearchResult, doc2: SearchResult) -> list[str]:
        """Get shared entities between documents (delegates to CDI helper)."""
        return cdi_get_shared_entities(doc1, doc2)

    def _get_shared_topics(self, doc1: SearchResult, doc2: SearchResult) -> list[str]:
        """Get shared topics between documents (delegates to CDI helper)."""
        return cdi_get_shared_topics(doc1, doc2)

    def extract_entity_texts(self, entities: list[dict | str]) -> list[str]:
        """Extract entity texts (public API delegating to CDI utils)."""
        return extract_texts_from_mixed(entities)

    def extract_topic_texts(self, topics: list[dict | str]) -> list[str]:
        """Extract topic texts (public API delegating to CDI utils)."""
        return extract_texts_from_mixed(topics)

    def _extract_entity_texts(self, entities: list[dict | str]) -> list[str]:
        """Deprecated. Use CDI utils instead: `utils.extract_texts_from_mixed`.

        Emits a ``DeprecationWarning`` attributed to the caller and then
        delegates to `extract_texts_from_mixed`.

        TODO: Remove this wrapper after 2026-01-01; ensure all callers are migrated.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._extract_entity_texts is deprecated; "
            "use CDI utils.extract_texts_from_mixed instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return extract_texts_from_mixed(entities)

    def _extract_topic_texts(self, topics: list[dict | str]) -> list[str]:
        """Deprecated. Use CDI utils instead: `utils.extract_texts_from_mixed`.

        Emits a ``DeprecationWarning`` attributed to the caller and then
        delegates to `extract_texts_from_mixed`.

        TODO: Remove this wrapper after 2026-01-01; ensure all callers are migrated.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._extract_topic_texts is deprecated; "
            "use CDI utils.extract_texts_from_mixed instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return extract_texts_from_mixed(topics)

    def _determine_relationship_type(
        self,
        doc1: SearchResult,
        doc2: SearchResult,
        metric_scores: dict[SimilarityMetric, float],
    ) -> RelationshipType:
        """Determine the type of relationship between documents.

        The checks run in priority order and the first match wins:
        hierarchical (hierarchical score above 0.7), cross-reference (either
        document has cross references), project grouping (same non-empty
        ``project_id``), then semantic similarity as the default.
        """
        # Check for hierarchical relationship; a metric that was not computed
        # counts as 0.0 and therefore never passes the threshold.
        if metric_scores.get(SimilarityMetric.HIERARCHICAL_DISTANCE, 0.0) > 0.7:
            return RelationshipType.HIERARCHICAL

        # Check for cross-references
        if doc1.cross_references or doc2.cross_references:
            return RelationshipType.CROSS_REFERENCE

        # Check for project grouping
        if doc1.project_id and doc2.project_id and doc1.project_id == doc2.project_id:
            return RelationshipType.PROJECT_GROUPING

        # Default to semantic similarity
        return RelationshipType.SEMANTIC_SIMILARITY