Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/relationships.py: 81%
94 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1from __future__ import annotations
3from typing import Any
6def analyze_entity_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
7 # Defensive input validation
8 if cluster_a is None or cluster_b is None:
9 return None
10 entities_a = set(getattr(cluster_a, "shared_entities", []) or [])
11 entities_b = set(getattr(cluster_b, "shared_entities", []) or [])
12 if not entities_a or not entities_b:
13 return None
14 overlap = entities_a & entities_b
15 union = entities_a | entities_b
16 if not overlap:
17 return None
18 strength = len(overlap) / len(union)
19 return {
20 "type": "entity_overlap",
21 "strength": strength,
22 "description": f"Share {len(overlap)} common entities: {', '.join(list(overlap)[:3])}",
23 "shared_elements": list(overlap),
24 }
27def analyze_topic_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
28 # Defensive input validation similar to analyze_entity_overlap
29 if cluster_a is None or cluster_b is None:
30 return None
31 topics_a_raw = getattr(cluster_a, "shared_topics", None)
32 topics_b_raw = getattr(cluster_b, "shared_topics", None)
33 if not topics_a_raw or not topics_b_raw:
34 return None
35 try:
36 topics_a = set(topics_a_raw)
37 topics_b = set(topics_b_raw)
38 except TypeError:
39 return None
40 if not topics_a or not topics_b:
41 return None
42 overlap = topics_a & topics_b
43 union = topics_a | topics_b
44 if not overlap:
45 return None
46 strength = len(overlap) / len(union)
47 return {
48 "type": "topic_overlap",
49 "strength": strength,
50 "description": f"Share {len(overlap)} common topics: {', '.join(list(overlap)[:3])}",
51 "shared_elements": list(overlap),
52 }
55def analyze_source_similarity(
56 docs_a: list[Any], docs_b: list[Any]
57) -> dict[str, Any] | None:
58 sources_a = {getattr(doc, "source_type", None) for doc in docs_a if doc}
59 sources_b = {getattr(doc, "source_type", None) for doc in docs_b if doc}
60 sources_a.discard(None)
61 sources_b.discard(None)
62 if not sources_a or not sources_b:
63 return None
64 overlap = sources_a & sources_b
65 union = sources_a | sources_b
66 if not overlap:
67 return None
68 strength = len(overlap) / len(union)
69 if len(sources_a) == 1 and len(sources_b) == 1 and sources_a == sources_b:
70 strength = min(1.0, strength + 0.3)
71 return {
72 "type": "source_similarity",
73 "strength": strength,
74 "description": f"Both contain {', '.join(overlap)} documents",
75 "shared_elements": list(overlap),
76 }
79def analyze_hierarchy_relationship(
80 docs_a: list[Any], docs_b: list[Any]
81) -> dict[str, Any] | None:
82 breadcrumbs_a = [getattr(doc, "breadcrumb_text", "") for doc in docs_a if doc]
83 breadcrumbs_b = [getattr(doc, "breadcrumb_text", "") for doc in docs_b if doc]
84 if not breadcrumbs_a or not breadcrumbs_b:
85 return None
86 parent_child_count = 0
87 for bc_a in breadcrumbs_a:
88 for bc_b in breadcrumbs_b:
89 if bc_a and bc_b:
90 if bc_a in bc_b or bc_b in bc_a:
91 parent_child_count += 1
92 if parent_child_count == 0:
93 return None
94 total_comparisons = len(breadcrumbs_a) * len(breadcrumbs_b)
95 strength = parent_child_count / total_comparisons if total_comparisons > 0 else 0
96 return {
97 "type": "hierarchical",
98 "strength": strength,
99 "description": f"Hierarchically related documents ({parent_child_count} connections)",
100 "shared_elements": [],
101 }
104def analyze_content_similarity(
105 docs_a: list[Any], docs_b: list[Any]
106) -> dict[str, Any] | None:
107 has_code_a = any(getattr(doc, "has_code_blocks", False) for doc in docs_a if doc)
108 has_code_b = any(getattr(doc, "has_code_blocks", False) for doc in docs_b if doc)
109 word_counts_a = [getattr(doc, "word_count", 0) or 0 for doc in docs_a if doc]
110 word_counts_b = [getattr(doc, "word_count", 0) or 0 for doc in docs_b if doc]
111 avg_size_a = sum(word_counts_a) / len(word_counts_a) if word_counts_a else 0
112 avg_size_b = sum(word_counts_b) / len(word_counts_b) if word_counts_b else 0
114 similarity_factors: list[float] = []
115 if has_code_a and has_code_b:
116 similarity_factors.append(0.4)
117 elif not has_code_a and not has_code_b:
118 similarity_factors.append(0.2)
120 if avg_size_a > 0 and avg_size_b > 0:
121 size_ratio = min(avg_size_a, avg_size_b) / max(avg_size_a, avg_size_b)
122 if size_ratio > 0.5:
123 similarity_factors.append(size_ratio * 0.3)
125 if not similarity_factors:
126 return None
128 strength = sum(similarity_factors)
129 if strength < 0.1:
130 return None
132 description_parts: list[str] = []
133 if has_code_a and has_code_b:
134 description_parts.append("both contain code")
135 if (
136 avg_size_a > 0
137 and avg_size_b > 0
138 and abs(avg_size_a - avg_size_b) / max(avg_size_a, avg_size_b) < 0.5
139 ):
140 description_parts.append("similar document sizes")
142 description = (
143 f"Content similarity: {', '.join(description_parts)}"
144 if description_parts
145 else "Similar content characteristics"
146 )
148 return {
149 "type": "content_similarity",
150 "strength": strength,
151 "description": description,
152 "shared_elements": [],
153 }