Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/relationships.py: 81%

1from __future__ import annotations

3from typing import Any

def analyze_entity_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
    """Describe the entities that two clusters have in common.

    Each cluster is read through its ``shared_entities`` attribute; a missing
    or falsy attribute is treated as "no entities".

    Args:
        cluster_a: First cluster object, or ``None``.
        cluster_b: Second cluster object, or ``None``.

    Returns:
        ``None`` when either cluster is ``None``, when either side has no
        entities, when ``shared_entities`` cannot be turned into a set, or
        when the two sets do not intersect. Otherwise a dict with:

        * ``type``: always ``"entity_overlap"``.
        * ``strength``: Jaccard index ``|A & B| / |A | B|``.
        * ``description``: summary naming at most three shared entities.
        * ``shared_elements``: every shared entity, ordered by ``str()`` value.
    """
    if cluster_a is None or cluster_b is None:
        return None

    try:
        entities_a = set(getattr(cluster_a, "shared_entities", None) or ())
        entities_b = set(getattr(cluster_b, "shared_entities", None) or ())
    except TypeError:
        # Non-iterable attribute or unhashable members: report "no relation"
        # instead of crashing, matching analyze_topic_overlap's handling.
        return None

    shared = entities_a & entities_b
    if not shared:
        # Also covers the case where either side is empty.
        return None

    # Set iteration order is arbitrary; sort so the preview and the returned
    # list are reproducible. key=str keeps mixed-type members sortable.
    ordered = sorted(shared, key=str)
    # str() each member because str.join rejects non-string items.
    preview = ", ".join(str(entity) for entity in ordered[:3])

    return {
        "type": "entity_overlap",
        "strength": len(shared) / len(entities_a | entities_b),
        "description": f"Share {len(shared)} common entities: {preview}",
        "shared_elements": ordered,
    }

def analyze_topic_overlap(cluster_a, cluster_b) -> dict[str, Any] | None:
    """Describe the topics that two clusters have in common.

    Each cluster is read through its ``shared_topics`` attribute; a missing
    or falsy attribute is treated as "no topics".

    Args:
        cluster_a: First cluster object, or ``None``.
        cluster_b: Second cluster object, or ``None``.

    Returns:
        ``None`` when either cluster is ``None``, when either side has no
        topics, when ``shared_topics`` cannot be turned into a set, or when
        the two sets do not intersect. Otherwise a dict with:

        * ``type``: always ``"topic_overlap"``.
        * ``strength``: Jaccard index ``|A & B| / |A | B|``.
        * ``description``: summary naming at most three shared topics.
        * ``shared_elements``: every shared topic, ordered by ``str()`` value.
    """
    if cluster_a is None or cluster_b is None:
        return None

    raw_a = getattr(cluster_a, "shared_topics", None)
    raw_b = getattr(cluster_b, "shared_topics", None)
    if not raw_a or not raw_b:
        return None

    try:
        topics_a, topics_b = set(raw_a), set(raw_b)
    except TypeError:
        # Non-iterable attribute or unhashable members.
        return None

    shared = topics_a & topics_b
    if not shared:
        # Also covers the case where either set turned out empty.
        return None

    # Set iteration order is arbitrary; sort so the preview and the returned
    # list are reproducible. key=str keeps mixed-type members sortable.
    ordered = sorted(shared, key=str)
    # str() each member because str.join rejects non-string items.
    preview = ", ".join(str(topic) for topic in ordered[:3])

    return {
        "type": "topic_overlap",
        "strength": len(shared) / len(topics_a | topics_b),
        "description": f"Share {len(shared)} common topics: {preview}",
        "shared_elements": ordered,
    }

def analyze_source_similarity(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Compare two document groups by the ``source_type`` values they contain.

    Falsy documents and documents whose ``source_type`` is missing or ``None``
    are ignored. ``None`` passed for either group is treated as an empty group.

    Args:
        docs_a: Documents of the first group.
        docs_b: Documents of the second group.

    Returns:
        ``None`` when either group yields no source types or when the groups
        share none. Otherwise a dict with:

        * ``type``: always ``"source_similarity"``.
        * ``strength``: Jaccard index of the two source-type sets.
        * ``description``: summary listing the shared source types.
        * ``shared_elements``: shared source types, ordered by ``str()`` value.
    """
    sources_a = {getattr(doc, "source_type", None) for doc in (docs_a or ()) if doc}
    sources_b = {getattr(doc, "source_type", None) for doc in (docs_b or ()) if doc}
    sources_a.discard(None)
    sources_b.discard(None)

    shared = sources_a & sources_b
    if not shared:
        # Also covers the case where either side has no usable source type.
        return None

    # No extra bonus is applied for "both groups use the same single source":
    # in that case the intersection equals the union, so the Jaccard index is
    # already 1.0 and a capped bonus could never raise it.
    strength = len(shared) / len(sources_a | sources_b)

    # Sort for reproducible output; str() because str.join rejects non-strings.
    ordered = sorted(shared, key=str)
    listing = ", ".join(str(source) for source in ordered)

    return {
        "type": "source_similarity",
        "strength": strength,
        "description": f"Both contain {listing} documents",
        "shared_elements": ordered,
    }

def analyze_hierarchy_relationship(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Detect breadcrumb containment between two document groups.

    Every truthy document contributes its ``breadcrumb_text`` (``""`` when the
    attribute is missing). A pair of documents counts as connected when both
    breadcrumbs are non-empty strings and one is a substring of the other.
    ``None`` passed for either group is treated as an empty group.

    Args:
        docs_a: Documents of the first group.
        docs_b: Documents of the second group.

    Returns:
        ``None`` when either group has no truthy documents or no pair is
        connected. Otherwise a dict with:

        * ``type``: always ``"hierarchical"``.
        * ``strength``: connected pairs divided by all compared pairs
          (pairs with an empty breadcrumb still count in the denominator).
        * ``description``: summary stating the number of connections.
        * ``shared_elements``: always an empty list.
    """
    breadcrumbs_a = [getattr(doc, "breadcrumb_text", "") for doc in (docs_a or ()) if doc]
    breadcrumbs_b = [getattr(doc, "breadcrumb_text", "") for doc in (docs_b or ()) if doc]
    if not breadcrumbs_a or not breadcrumbs_b:
        return None

    # NOTE(review): plain substring containment is used as the parent/child
    # signal; it does not look at breadcrumb separators - confirm intended.
    connections = sum(
        1
        for crumb_a in breadcrumbs_a
        for crumb_b in breadcrumbs_b
        # Only non-empty strings are compared, so None or other non-string
        # breadcrumb values are skipped instead of raising on the `in` test.
        if isinstance(crumb_a, str)
        and isinstance(crumb_b, str)
        and crumb_a
        and crumb_b
        and (crumb_a in crumb_b or crumb_b in crumb_a)
    )
    if connections == 0:
        return None

    # Both lists are non-empty here, so the divisor is always positive.
    total_pairs = len(breadcrumbs_a) * len(breadcrumbs_b)

    return {
        "type": "hierarchical",
        "strength": connections / total_pairs,
        "description": f"Hierarchically related documents ({connections} connections)",
        "shared_elements": [],
    }

102

103

def analyze_content_similarity(
    docs_a: list[Any], docs_b: list[Any]
) -> dict[str, Any] | None:
    """Score two document groups on coarse content characteristics.

    Two signals are combined, each read from the truthy documents of a group:

    * Code presence (``has_code_blocks``): ``0.4`` when both groups contain
      code, ``0.2`` when neither does, nothing when only one does.
    * Average ``word_count``: when both averages are positive and the
      smaller/larger ratio exceeds ``0.5``, ``ratio * 0.3`` is added.

    ``None`` passed for either group is treated as an empty group.

    NOTE(review): two empty groups still receive the ``0.2`` "neither has
    code" score; this matches the existing behaviour - confirm it is intended.

    Args:
        docs_a: Documents of the first group.
        docs_b: Documents of the second group.

    Returns:
        ``None`` when no signal applies. Otherwise a dict with:

        * ``type``: always ``"content_similarity"``.
        * ``strength``: sum of the applicable signal scores.
        * ``description``: summary of which signals matched.
        * ``shared_elements``: always an empty list.
    """
    group_a = [doc for doc in (docs_a or ()) if doc]
    group_b = [doc for doc in (docs_b or ()) if doc]

    has_code_a = any(getattr(doc, "has_code_blocks", False) for doc in group_a)
    has_code_b = any(getattr(doc, "has_code_blocks", False) for doc in group_b)
    both_have_code = has_code_a and has_code_b

    # A missing or None word_count counts as zero words.
    words_a = [getattr(doc, "word_count", 0) or 0 for doc in group_a]
    words_b = [getattr(doc, "word_count", 0) or 0 for doc in group_b]
    avg_a = sum(words_a) / len(words_a) if words_a else 0
    avg_b = sum(words_b) / len(words_b) if words_b else 0

    # Compute the size comparison once so the score and the description can
    # never disagree about whether the groups are "similar in size".
    size_ratio = 0.0
    if avg_a > 0 and avg_b > 0:
        size_ratio = min(avg_a, avg_b) / max(avg_a, avg_b)
    similar_sizes = size_ratio > 0.5

    factors: list[float] = []
    if both_have_code:
        factors.append(0.4)
    elif not has_code_a and not has_code_b:
        factors.append(0.2)
    if similar_sizes:
        factors.append(size_ratio * 0.3)

    if not factors:
        return None
    # Every factor is at least 0.15, so no further minimum-strength check is
    # needed once at least one factor is present.
    strength = sum(factors)

    details: list[str] = []
    if both_have_code:
        details.append("both contain code")
    if similar_sizes:
        details.append("similar document sizes")

    if details:
        description = f"Content similarity: {', '.join(details)}"
    else:
        description = "Similar content characteristics"

    return {
        "type": "content_similarity",
        "strength": strength,
        "description": description,
        "shared_elements": [],
    }

153 }