Coverage for src/qdrant_loader_mcp_server/search/components/combining/scoring_boosts.py: 91%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5 

def boost_score_with_metadata(
    base_score: float, metadata: dict, query_context: dict, *, spacy_analyzer=None
) -> float:
    """Scale ``base_score`` by metadata-driven boosts, capped at +50%.

    Accumulates additive boost contributions from intent, content type,
    section level, content quality, conversion status, and semantic
    matching (spaCy-backed when an analyzer is supplied, keyword-overlap
    fallback otherwise), then applies the total multiplicatively.
    """
    total_boost = 0.0

    # Intent-aware boosting only runs when the query pipeline supplied
    # both an intent and an adaptive configuration.
    intent = query_context.get("search_intent")
    adaptive_cfg = query_context.get("adaptive_config")
    if intent and adaptive_cfg:
        total_boost += apply_intent_boosting(
            metadata, intent, adaptive_cfg, query_context
        )

    total_boost += apply_content_type_boosting(metadata, query_context)
    total_boost += apply_section_level_boosting(metadata)
    total_boost += apply_content_quality_boosting(metadata)
    total_boost += apply_conversion_boosting(metadata, query_context)

    # Prefer real semantic similarity when an analyzer is available;
    # otherwise fall back to simple keyword overlap.
    if spacy_analyzer:
        total_boost += apply_semantic_boosting(
            metadata, query_context, spacy_analyzer
        )
    else:
        total_boost += apply_fallback_semantic_boosting(metadata, query_context)

    # Cap the combined boost so a single result can gain at most +50%.
    return base_score * (1 + min(total_boost, 0.5))

32 

33 

34def apply_intent_boosting( 

35 metadata: dict, search_intent: Any, adaptive_config: Any, query_context: dict 

36) -> float: 

37 boost_factor = 0.0 

38 ranking_boosts = adaptive_config.ranking_boosts 

39 source_type_preferences = adaptive_config.source_type_preferences 

40 

41 source_type = metadata.get("source_type", "") 

42 if source_type in source_type_preferences: 

43 boost_factor += (source_type_preferences[source_type] - 1.0) * 0.2 

44 

45 for boost_key, boost_value in ranking_boosts.items(): 

46 if boost_key == "section_type" and isinstance(boost_value, dict): 

47 section_type = metadata.get("section_type", "") 

48 if section_type in boost_value: 

49 boost_factor += (boost_value[section_type] - 1.0) * 0.15 

50 elif boost_key == "source_type" and isinstance(boost_value, dict): 

51 if source_type in boost_value: 

52 boost_factor += (boost_value[source_type] - 1.0) * 0.15 

53 elif boost_key in metadata and metadata[boost_key]: 

54 if isinstance(boost_value, int | float): 

55 boost_factor += (boost_value - 1.0) * 0.1 

56 

57 boost_factor += search_intent.confidence * 0.05 

58 return boost_factor 

59 

60 

def apply_content_type_boosting(metadata: dict, query_context: dict) -> float:
    """Boost results whose content features match the query's stated preferences.

    Looks up feature flags under ``metadata["content_type_analysis"]`` and
    pairs them against ``prefers_*`` flags in ``query_context``.
    """
    analysis = metadata.get("content_type_analysis", {})
    has_code = analysis.get("has_code_blocks")

    # (preference flag, content condition, boost weight)
    rules = (
        ("prefers_code", has_code, 0.15),
        ("prefers_tables", analysis.get("has_tables"), 0.12),
        ("prefers_images", analysis.get("has_images"), 0.10),
        # Documentation preference rewards the *absence* of code blocks.
        ("prefers_docs", not has_code, 0.08),
    )
    return sum(
        weight
        for pref, condition, weight in rules
        if query_context.get(pref) and condition
    )

76 

77 

def apply_section_level_boosting(metadata: dict) -> float:
    """Boost shallow (more prominent) sections.

    Levels 1-2 get 0.10, level 3 gets 0.05, deeper or unknown levels
    get nothing.
    """
    level = metadata.get("section_level")
    if level is None:
        return 0.0
    if level <= 2:
        return 0.10
    if level <= 3:
        return 0.05
    return 0.0

87 

88 

def apply_content_quality_boosting(metadata: dict) -> float:
    """Boost substantial content by word count.

    Adds 0.05 for each length threshold (100, 500 words) the document
    exceeds, so long documents can earn up to 0.10.
    """
    analysis = metadata.get("content_type_analysis", {})
    # ``or 0`` guards against a missing or None word_count.
    words = analysis.get("word_count") or 0
    return 0.05 * sum(words > threshold for threshold in (100, 500))

98 

99 

def apply_conversion_boosting(metadata: dict, query_context: dict) -> float:
    """Boost converted office documents and data-oriented Excel sheets.

    Converted docx/xlsx/pdf files get 0.08; Excel sheets get a further
    0.12 when the query keywords mention data/table-related terms.
    """
    total = 0.0

    office_types = ("docx", "xlsx", "pdf")
    if metadata.get("is_converted") and metadata.get("original_file_type") in office_types:
        total += 0.08

    if metadata.get("is_excel_sheet"):
        # Substring match against the space-joined keyword string.
        keyword_blob = " ".join(query_context.get("keywords", []))
        data_terms = ("data", "table", "sheet", "excel", "csv")
        if any(term in keyword_blob for term in data_terms):
            total += 0.12

    return total

114 

115 

def apply_semantic_boosting(
    metadata: dict, query_context: dict, spacy_analyzer: Any
) -> float:
    """Boost results whose entities/topics are semantically close to the query.

    Requires a precomputed ``spacy_analysis`` in ``query_context``; returns
    0.0 when it is absent. Uses
    ``spacy_analyzer.semantic_similarity_matching`` to score each entity
    against the query and each topic against every main query concept,
    then converts the best similarity into a tiered boost.
    """
    if "spacy_analysis" not in query_context:
        return 0.0
    analysis = query_context["spacy_analysis"]

    def _as_text(item) -> str:
        # Items may be plain strings or dicts carrying a "text" field.
        return item if isinstance(item, str) else item.get("text", str(item))

    total = 0.0

    entities = metadata.get("entities", [])
    if entities and analysis.entities:
        best_entity = max(
            0.0,
            max(
                spacy_analyzer.semantic_similarity_matching(analysis, _as_text(ent))
                for ent in entities
            ),
        )
        if best_entity > 0.6:
            total += 0.15
        elif best_entity > 0.4:
            total += 0.10
        elif best_entity > 0.2:
            total += 0.05

    topics = metadata.get("topics", [])
    if topics and analysis.main_concepts:
        # Score every (topic, concept) pairing and keep the best match.
        best_topic = max(
            0.0,
            max(
                spacy_analyzer.semantic_similarity_matching(
                    analysis, f"{_as_text(topic)} {concept}"
                )
                for topic in topics
                for concept in analysis.main_concepts
            ),
        )
        if best_topic > 0.5:
            total += 0.12
        elif best_topic > 0.3:
            total += 0.08

    return total

159 

160 

def apply_fallback_semantic_boosting(metadata: dict, query_context: dict) -> float:
    """Keyword-overlap boost used when no spaCy analyzer is available.

    Adds 0.10 when any query keyword matches a document entity and 0.08
    when any matches a document topic. Entities/topics may be plain
    strings or dicts carrying a "text" / "entity" / "topic" field.

    Fix: query keywords are now lowercased before comparison. The
    entity/topic texts were already lowercased, so mixed-case keywords
    could previously never match anything.
    """
    boost_factor = 0.0
    # Lowercase string keywords so the comparison is case-insensitive on
    # both sides; non-string keywords pass through (they simply won't match).
    query_keywords = {
        kw.lower() if isinstance(kw, str) else kw
        for kw in query_context.get("keywords", [])
    }

    def _normalized_texts(items, alt_key: str) -> set:
        # Tolerant extraction mirroring the accepted item shapes: plain
        # strings, dicts keyed by "text" or the domain key (alt_key), or
        # the dict's string repr as a last resort. Other item types are
        # silently skipped.
        texts = set()
        for item in items:
            if isinstance(item, str):
                texts.add(item.lower())
            elif isinstance(item, dict):
                if "text" in item:
                    texts.add(str(item["text"]).lower())
                elif alt_key in item:
                    texts.add(str(item[alt_key]).lower())
                else:
                    texts.add(str(item).lower())
        return texts

    entities = metadata.get("entities", [])
    if entities and query_keywords & _normalized_texts(entities, "entity"):
        boost_factor += 0.10

    topics = metadata.get("topics", [])
    if topics and query_keywords & _normalized_texts(topics, "topic"):
        boost_factor += 0.08

    return boost_factor