Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/relationships.py: 81%

94 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5 

def analyze_entity_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
    """Describe the overlap between the ``shared_entities`` of two clusters.

    The strength is the Jaccard index of the two entity sets: the number of
    entities present in both clusters divided by the number of distinct
    entities across both.

    Args:
        cluster_a: Object exposing an iterable ``shared_entities`` attribute.
        cluster_b: Object exposing an iterable ``shared_entities`` attribute.

    Returns:
        A dict with the keys ``type``, ``strength``, ``description`` and
        ``shared_elements``, or ``None`` when either cluster is ``None``,
        either entity collection is missing, empty or cannot be turned into
        a set, or the clusters have no entity in common.
    """
    if cluster_a is None or cluster_b is None:
        return None

    try:
        entities_a = set(getattr(cluster_a, "shared_entities", None) or [])
        entities_b = set(getattr(cluster_b, "shared_entities", None) or [])
    except TypeError:
        # Non-iterable value or unhashable members: treat as "no usable
        # data", mirroring how analyze_topic_overlap handles the same case.
        return None

    if not entities_a or not entities_b:
        return None

    overlap = entities_a & entities_b
    if not overlap:
        return None

    strength = len(overlap) / len(entities_a | entities_b)

    # Set iteration order is not stable across runs for strings, so order the
    # shared items explicitly to keep the description and list reproducible.
    shared = sorted(overlap, key=str)
    # str() keeps the join from failing when entities are not plain strings.
    preview = ", ".join(str(item) for item in shared[:3])

    return {
        "type": "entity_overlap",
        "strength": strength,
        "description": f"Share {len(overlap)} common entities: {preview}",
        "shared_elements": shared,
    }

25 

26 

def analyze_topic_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
    """Describe the overlap between the ``shared_topics`` of two clusters.

    The strength is the Jaccard index of the two topic sets: the number of
    topics present in both clusters divided by the number of distinct topics
    across both.

    Args:
        cluster_a: Object exposing an iterable ``shared_topics`` attribute.
        cluster_b: Object exposing an iterable ``shared_topics`` attribute.

    Returns:
        A dict with the keys ``type``, ``strength``, ``description`` and
        ``shared_elements``, or ``None`` when either cluster is ``None``,
        either topic collection is missing, empty or cannot be turned into a
        set, or the clusters have no topic in common.
    """
    if cluster_a is None or cluster_b is None:
        return None

    topics_a_raw = getattr(cluster_a, "shared_topics", None)
    topics_b_raw = getattr(cluster_b, "shared_topics", None)
    if not topics_a_raw or not topics_b_raw:
        return None

    try:
        topics_a = set(topics_a_raw)
        topics_b = set(topics_b_raw)
    except TypeError:
        # Non-iterable value or unhashable members: nothing to compare.
        return None

    # An empty set on either side necessarily yields an empty intersection,
    # so this single check also covers the "no topics at all" case.
    overlap = topics_a & topics_b
    if not overlap:
        return None

    strength = len(overlap) / len(topics_a | topics_b)

    # Set iteration order is not stable across runs for strings, so order the
    # shared items explicitly to keep the description and list reproducible.
    shared = sorted(overlap, key=str)
    # str() keeps the join from failing when topics are not plain strings.
    preview = ", ".join(str(item) for item in shared[:3])

    return {
        "type": "topic_overlap",
        "strength": strength,
        "description": f"Share {len(overlap)} common topics: {preview}",
        "shared_elements": shared,
    }

53 

54 

def analyze_source_similarity(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Describe how much two document groups share the same source types.

    Each truthy document contributes its ``source_type`` attribute; documents
    without one (or with ``None``) are ignored. The strength is the Jaccard
    index of the two resulting sets of source types.

    Args:
        docs_a: First group of documents; falsy entries are skipped.
        docs_b: Second group of documents; falsy entries are skipped.

    Returns:
        A dict with the keys ``type``, ``strength``, ``description`` and
        ``shared_elements``, or ``None`` when either group is ``None``,
        either group yields no source type, or no source type is shared.
    """
    # Same defensive stance as the cluster-based analyzers: a missing group
    # means "no relationship" rather than an iteration error.
    if docs_a is None or docs_b is None:
        return None

    sources_a = {getattr(doc, "source_type", None) for doc in docs_a if doc}
    sources_b = {getattr(doc, "source_type", None) for doc in docs_b if doc}
    sources_a.discard(None)
    sources_b.discard(None)
    if not sources_a or not sources_b:
        return None

    overlap = sources_a & sources_b
    if not overlap:
        return None

    strength = len(overlap) / len(sources_a | sources_b)
    if len(sources_a) == 1 and len(sources_b) == 1 and sources_a == sources_b:
        # NOTE(review): two identical single-source groups already score 1.0,
        # so this capped boost currently has no effect on the result.
        strength = min(1.0, strength + 0.3)

    # Set iteration order is not stable across runs for strings, so order the
    # shared items explicitly; str() tolerates non-string source types.
    shared = sorted(overlap, key=str)
    names = ", ".join(str(item) for item in shared)

    return {
        "type": "source_similarity",
        "strength": strength,
        "description": f"Both contain {names} documents",
        "shared_elements": shared,
    }

77 

78 

def analyze_hierarchy_relationship(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Detect breadcrumb containment between two document groups.

    Every truthy document contributes its ``breadcrumb_text`` attribute
    (defaulting to an empty string). A pair of breadcrumbs counts as a
    connection when both are non-empty and one contains the other. The
    strength is the number of connections divided by the number of
    breadcrumb pairs, including pairs where a breadcrumb is empty.

    Args:
        docs_a: First group of documents; falsy entries are skipped.
        docs_b: Second group of documents; falsy entries are skipped.

    Returns:
        A dict with the keys ``type``, ``strength``, ``description`` and an
        empty ``shared_elements`` list, or ``None`` when either group has no
        documents or no connection is found.
    """
    trails_a = [getattr(item, "breadcrumb_text", "") for item in docs_a if item]
    trails_b = [getattr(item, "breadcrumb_text", "") for item in docs_b if item]
    if not (trails_a and trails_b):
        return None

    # Count every cross-group pair in which one breadcrumb contains the other.
    connections = sum(
        1
        for left in trails_a
        for right in trails_b
        if left and right and (left in right or right in left)
    )
    if not connections:
        return None

    pair_count = len(trails_a) * len(trails_b)
    strength = connections / pair_count if pair_count > 0 else 0

    return {
        "type": "hierarchical",
        "strength": strength,
        "description": f"Hierarchically related documents ({connections} connections)",
        "shared_elements": [],
    }

102 

103 

def analyze_content_similarity(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Score two document groups on code presence and average size.

    Two signals are added together to form the strength:

    * code presence: 0.4 when both groups contain a document whose
      ``has_code_blocks`` is truthy, 0.2 when neither does, nothing when
      only one does;
    * size: when both groups have a positive average ``word_count`` and the
      smaller average is more than half of the larger one, that ratio
      multiplied by 0.3.

    Args:
        docs_a: First group of documents; falsy entries are skipped.
        docs_b: Second group of documents; falsy entries are skipped.

    Returns:
        A dict with the keys ``type``, ``strength``, ``description`` and an
        empty ``shared_elements`` list, or ``None`` when no signal applies or
        the combined strength is below 0.1.
    """
    code_in_a = any(getattr(item, "has_code_blocks", False) for item in docs_a if item)
    code_in_b = any(getattr(item, "has_code_blocks", False) for item in docs_b if item)

    # A missing or None word_count counts as zero words.
    sizes_a = [getattr(item, "word_count", 0) or 0 for item in docs_a if item]
    sizes_b = [getattr(item, "word_count", 0) or 0 for item in docs_b if item]
    mean_a = sum(sizes_a) / len(sizes_a) if sizes_a else 0
    mean_b = sum(sizes_b) / len(sizes_b) if sizes_b else 0
    both_sized = mean_a > 0 and mean_b > 0
    both_have_code = code_in_a and code_in_b

    factors: list[float] = []
    if both_have_code:
        factors.append(0.4)
    elif not (code_in_a or code_in_b):
        factors.append(0.2)

    if both_sized:
        ratio = min(mean_a, mean_b) / max(mean_a, mean_b)
        if ratio > 0.5:
            factors.append(ratio * 0.3)

    if not factors:
        return None

    strength = sum(factors)
    if strength < 0.1:
        return None

    notes: list[str] = []
    if both_have_code:
        notes.append("both contain code")
    if both_sized and abs(mean_a - mean_b) / max(mean_a, mean_b) < 0.5:
        notes.append("similar document sizes")

    if notes:
        description = f"Content similarity: {', '.join(notes)}"
    else:
        description = "Similar content characteristics"

    return {
        "type": "content_similarity",
        "strength": strength,
        "description": description,
        "shared_elements": [],
    }