Coverage for src/qdrant_loader_mcp_server/search/enhanced/cdi/calculators.py: 93%
95 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""
2Document Similarity Calculation for Cross-Document Intelligence.
4This module implements comprehensive document similarity calculation using multiple
5metrics including entity overlap, topic overlap, metadata similarity, content features,
6hierarchical distance, and semantic similarity with spaCy integration.
7"""
9from __future__ import annotations
11import time
12import warnings
14from ....utils.logging import LoggingConfig
15from ...models import SearchResult
16from ...nlp.spacy_analyzer import SpaCyQueryAnalyzer
17from .extractors.similarity_helpers import (
18 calculate_content_features_similarity as cdi_calc_content_features_similarity,
19)
20from .extractors.similarity_helpers import (
21 calculate_entity_overlap as cdi_calculate_entity_overlap,
22)
23from .extractors.similarity_helpers import (
24 calculate_metadata_similarity as cdi_calc_metadata_similarity,
25)
26from .extractors.similarity_helpers import (
27 calculate_semantic_similarity_spacy as cdi_calc_semantic_spacy,
28)
29from .extractors.similarity_helpers import (
30 calculate_topic_overlap as cdi_calculate_topic_overlap,
31)
32from .extractors.similarity_helpers import (
33 combine_metric_scores as cdi_combine_metric_scores,
34)
35from .extractors.similarity_helpers import (
36 get_shared_entities as cdi_get_shared_entities,
37)
38from .extractors.similarity_helpers import get_shared_topics as cdi_get_shared_topics
39from .models import DocumentSimilarity, RelationshipType, SimilarityMetric
40from .utils import extract_texts_from_mixed, hierarchical_distance_from_breadcrumbs
42logger = LoggingConfig.get_logger(__name__)
class DocumentSimilarityCalculator:
    """Calculates similarity between documents using multiple metrics.

    Most scoring is delegated to the CDI helper functions imported at module
    level. This class selects which metrics to run, combines their scores via
    the CDI combiner and labels the relationship between the two documents.
    """

    def __init__(self, spacy_analyzer: SpaCyQueryAnalyzer):
        """Initialize the similarity calculator.

        Args:
            spacy_analyzer: Analyzer passed through to the spaCy-based
                semantic similarity helper.
        """
        self.spacy_analyzer = spacy_analyzer
        self.logger = LoggingConfig.get_logger(__name__)

    def calculate_similarity(
        self,
        doc1: SearchResult,
        doc2: SearchResult,
        metrics: list[SimilarityMetric] | None = None,
    ) -> DocumentSimilarity:
        """Calculate comprehensive similarity between two documents.

        Args:
            doc1: First document.
            doc2: Second document.
            metrics: Metrics to evaluate. When ``None``, entity overlap,
                topic overlap, metadata similarity and content features are
                used.

        Returns:
            A ``DocumentSimilarity`` carrying the combined score, the
            per-metric scores, the shared entities/topics and the detected
            relationship type. Document ids are built as
            ``"<source_type>:<source_title>"``.
        """
        if metrics is None:
            metrics = [
                SimilarityMetric.ENTITY_OVERLAP,
                SimilarityMetric.TOPIC_OVERLAP,
                SimilarityMetric.METADATA_SIMILARITY,
                SimilarityMetric.CONTENT_FEATURES,
            ]

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the wall clock is adjusted.
        start_time = time.perf_counter()
        metric_scores: dict[SimilarityMetric, float] = {}

        # Calculate individual metric scores
        for metric in metrics:
            if metric == SimilarityMetric.ENTITY_OVERLAP:
                # Use CDI extractors directly to avoid deprecated wrappers
                metric_scores[metric] = cdi_calculate_entity_overlap(doc1, doc2)
            elif metric == SimilarityMetric.TOPIC_OVERLAP:
                # Use CDI extractors directly to avoid deprecated wrappers
                metric_scores[metric] = cdi_calculate_topic_overlap(doc1, doc2)
            elif metric == SimilarityMetric.METADATA_SIMILARITY:
                metric_scores[metric] = self._calculate_metadata_similarity(doc1, doc2)
            elif metric == SimilarityMetric.CONTENT_FEATURES:
                metric_scores[metric] = self._calculate_content_features_similarity(
                    doc1, doc2
                )
            elif metric == SimilarityMetric.HIERARCHICAL_DISTANCE:
                metric_scores[metric] = self._calculate_hierarchical_similarity(
                    doc1, doc2
                )
            elif metric == SimilarityMetric.SEMANTIC_SIMILARITY:
                metric_scores[metric] = self._calculate_semantic_similarity(doc1, doc2)
            else:
                # Metrics without a dedicated scorer are still skipped (as
                # before), but leave a trace instead of vanishing silently.
                self.logger.debug(f"Skipping unsupported similarity metric: {metric}")

        # Calculate combined similarity score
        combined_score = self._combine_metric_scores(metric_scores)

        # Extract shared entities and topics
        shared_entities = self._get_shared_entities(doc1, doc2)
        shared_topics = self._get_shared_topics(doc1, doc2)

        # Determine relationship type
        relationship_type = self._determine_relationship_type(doc1, doc2, metric_scores)

        # Elapsed time in milliseconds, reported for diagnostics only.
        processing_time = (time.perf_counter() - start_time) * 1000
        self.logger.debug(
            f"Calculated similarity between documents in {processing_time:.2f}ms"
        )

        return DocumentSimilarity(
            doc1_id=f"{doc1.source_type}:{doc1.source_title}",
            doc2_id=f"{doc2.source_type}:{doc2.source_title}",
            similarity_score=combined_score,
            metric_scores=metric_scores,
            shared_entities=shared_entities,
            shared_topics=shared_topics,
            relationship_type=relationship_type,
        )

    def _calculate_entity_overlap(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate entity overlap between documents (delegates to CDI helper).

        Unlike ``_calculate_topic_overlap`` this wrapper emits no deprecation
        warning; ``calculate_similarity`` calls the CDI helper directly and
        does not go through it.
        """
        return cdi_calculate_entity_overlap(doc1, doc2)

    def _calculate_topic_overlap(self, doc1: SearchResult, doc2: SearchResult) -> float:
        """Deprecated. Use CDI extractors.calculate_topic_overlap instead.

        Emits a ``DeprecationWarning`` attributed to the caller
        (``stacklevel=2``) and then delegates to the CDI helper.

        TODO: Remove this method after 2026-01-01 once all external callers migrate.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._calculate_topic_overlap is deprecated; "
            "use CDI extractors.calculate_topic_overlap instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return cdi_calculate_topic_overlap(doc1, doc2)

    def _calculate_metadata_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate metadata similarity (delegates to CDI helper)."""
        return cdi_calc_metadata_similarity(doc1, doc2)

    def _calculate_content_features_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate content features similarity (delegates to CDI helper)."""
        return cdi_calc_content_features_similarity(doc1, doc2)

    def _calculate_hierarchical_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate hierarchical relationship similarity.

        Checks are applied in order and the first match wins:

        * ``1.0`` when one document's ``parent_id`` equals the other's
          ``"<source_type>:<source_title>"`` id (direct parent/child);
        * ``0.8`` when both documents have the same non-empty ``parent_id``;
        * the result of ``hierarchical_distance_from_breadcrumbs`` when both
          documents carry breadcrumb text;
        * ``0.0`` otherwise.
        """
        doc1_id = f"{doc1.source_type}:{doc1.source_title}"
        doc2_id = f"{doc2.source_type}:{doc2.source_title}"

        # Check for direct parent-child relationship (either direction)
        if doc1.parent_id and doc1.parent_id == doc2_id:
            return 1.0
        if doc2.parent_id and doc2.parent_id == doc1_id:
            return 1.0

        # Check for sibling relationship (same parent)
        if doc1.parent_id and doc2.parent_id and doc1.parent_id == doc2.parent_id:
            return 0.8

        # Breadcrumb-based similarity
        if doc1.breadcrumb_text and doc2.breadcrumb_text:
            return hierarchical_distance_from_breadcrumbs(
                doc1.breadcrumb_text, doc2.breadcrumb_text
            )

        return 0.0

    def _calculate_semantic_similarity(
        self, doc1: SearchResult, doc2: SearchResult
    ) -> float:
        """Calculate semantic similarity using spaCy.

        Best-effort: any exception raised by the helper is logged as a
        warning and ``0.0`` is returned instead of propagating the error.
        """
        try:
            return cdi_calc_semantic_spacy(self.spacy_analyzer, doc1.text, doc2.text)
        except Exception as e:
            self.logger.warning(f"Failed to calculate semantic similarity: {e}")
            return 0.0

    def _combine_metric_scores(
        self, metric_scores: dict[SimilarityMetric, float]
    ) -> float:
        """Combine multiple metric scores into final similarity score (delegates to CDI helper)."""
        return cdi_combine_metric_scores(metric_scores)

    def _get_shared_entities(self, doc1: SearchResult, doc2: SearchResult) -> list[str]:
        """Get shared entities between documents (delegates to CDI helper)."""
        return cdi_get_shared_entities(doc1, doc2)

    def _get_shared_topics(self, doc1: SearchResult, doc2: SearchResult) -> list[str]:
        """Get shared topics between documents (delegates to CDI helper)."""
        return cdi_get_shared_topics(doc1, doc2)

    def extract_entity_texts(self, entities: list[dict | str]) -> list[str]:
        """Extract entity texts (public API delegating to CDI utils)."""
        return extract_texts_from_mixed(entities)

    def extract_topic_texts(self, topics: list[dict | str]) -> list[str]:
        """Extract topic texts (public API delegating to CDI utils)."""
        return extract_texts_from_mixed(topics)

    def _extract_entity_texts(self, entities: list[dict | str]) -> list[str]:
        """Deprecated. Use CDI utils instead: `utils.extract_texts_from_mixed`.

        Replacement: use CDI utils `extract_texts_from_mixed`.

        TODO: Remove this wrapper after 2026-01-01; ensure all callers are migrated.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._extract_entity_texts is deprecated; "
            "use CDI utils.extract_texts_from_mixed instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return extract_texts_from_mixed(entities)

    def _extract_topic_texts(self, topics: list[dict | str]) -> list[str]:
        """Deprecated. Use CDI utils instead: `utils.extract_texts_from_mixed`.

        Replacement: use CDI utils `extract_texts_from_mixed`.

        TODO: Remove this wrapper after 2026-01-01; ensure all callers are migrated.
        """
        warnings.warn(
            "DocumentSimilarityCalculator._extract_topic_texts is deprecated; "
            "use CDI utils.extract_texts_from_mixed instead",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return extract_texts_from_mixed(topics)

    def _determine_relationship_type(
        self,
        doc1: SearchResult,
        doc2: SearchResult,
        metric_scores: dict[SimilarityMetric, float],
    ) -> RelationshipType:
        """Determine the type of relationship between documents.

        Checks are applied in order and the first match wins: a hierarchical
        score above 0.7, then cross references, then a shared project id,
        and finally semantic similarity as the default.
        """
        # Check for hierarchical relationship (only if that metric was scored)
        if (
            SimilarityMetric.HIERARCHICAL_DISTANCE in metric_scores
            and metric_scores[SimilarityMetric.HIERARCHICAL_DISTANCE] > 0.7
        ):
            return RelationshipType.HIERARCHICAL

        # Check for cross-references.
        # NOTE(review): this fires when EITHER document has any cross
        # references; it does not verify that they point at each other.
        if doc1.cross_references or doc2.cross_references:
            return RelationshipType.CROSS_REFERENCE

        # Check for project grouping
        if doc1.project_id and doc2.project_id and doc1.project_id == doc2.project_id:
            return RelationshipType.PROJECT_GROUPING

        # Default to semantic similarity
        return RelationshipType.SEMANTIC_SIMILARITY