Coverage for src/qdrant_loader_mcp_server/search/components/combining/scoring_boosts.py: 91%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5 

def boost_score_with_metadata(
    base_score: float, metadata: dict, query_context: dict, *, spacy_analyzer=None
) -> float:
    """Scale ``base_score`` by metadata-driven boosts, capped at +50%.

    Accumulates additive boost contributions from intent, content type,
    section level, content quality, conversion status, and semantic
    matching (spaCy-backed when an analyzer is supplied, keyword-overlap
    fallback otherwise), then applies the total multiplicatively.
    """
    total_boost = 0.0

    # Intent-aware boosting only runs when the query pipeline supplied
    # both an intent and an adaptive configuration.
    intent = query_context.get("search_intent")
    adaptive_cfg = query_context.get("adaptive_config")
    if intent and adaptive_cfg:
        total_boost += apply_intent_boosting(
            metadata, intent, adaptive_cfg, query_context
        )

    total_boost += apply_content_type_boosting(metadata, query_context)
    total_boost += apply_section_level_boosting(metadata)
    total_boost += apply_content_quality_boosting(metadata)
    total_boost += apply_conversion_boosting(metadata, query_context)

    # Prefer real semantic similarity when an analyzer is available;
    # otherwise fall back to simple keyword overlap.
    if spacy_analyzer:
        total_boost += apply_semantic_boosting(
            metadata, query_context, spacy_analyzer
        )
    else:
        total_boost += apply_fallback_semantic_boosting(metadata, query_context)

    # Cap the combined boost so a single result can gain at most +50%.
    return base_score * (1 + min(total_boost, 0.5))

32 

33 

34def apply_intent_boosting( 

35 metadata: dict, search_intent: Any, adaptive_config: Any, query_context: dict 

36) -> float: 

37 boost_factor = 0.0 

38 ranking_boosts = adaptive_config.ranking_boosts 

39 source_type_preferences = adaptive_config.source_type_preferences 

40 

41 source_type = metadata.get("source_type", "") 

42 if source_type in source_type_preferences: 

43 boost_factor += (source_type_preferences[source_type] - 1.0) * 0.2 

44 

45 for boost_key, boost_value in ranking_boosts.items(): 

46 if boost_key == "section_type" and isinstance(boost_value, dict): 

47 section_type = metadata.get("section_type", "") 

48 if section_type in boost_value: 

49 boost_factor += (boost_value[section_type] - 1.0) * 0.15 

50 elif boost_key == "source_type" and isinstance(boost_value, dict): 

51 if source_type in boost_value: 

52 boost_factor += (boost_value[source_type] - 1.0) * 0.15 

53 elif boost_key in metadata and metadata[boost_key]: 

54 if isinstance(boost_value, int | float): 

55 boost_factor += (boost_value - 1.0) * 0.1 

56 

57 boost_factor += search_intent.confidence * 0.05 

58 return boost_factor 

59 

60 

def apply_content_type_boosting(metadata: dict, query_context: dict) -> float:
    """Boost results whose content features match the query's stated preferences.

    Looks up feature flags under ``metadata["content_type_analysis"]`` and
    pairs them against ``prefers_*`` flags in ``query_context``.
    """
    analysis = metadata.get("content_type_analysis", {})
    has_code = analysis.get("has_code_blocks")

    # (preference flag, content condition, boost weight)
    rules = (
        ("prefers_code", has_code, 0.15),
        ("prefers_tables", analysis.get("has_tables"), 0.12),
        ("prefers_images", analysis.get("has_images"), 0.10),
        # Documentation preference rewards the *absence* of code blocks.
        ("prefers_docs", not has_code, 0.08),
    )
    return sum(
        weight
        for pref, condition, weight in rules
        if query_context.get(pref) and condition
    )

76 

77 

def apply_section_level_boosting(metadata: dict) -> float:
    """Boost shallow (more prominent) sections.

    Levels 1-2 get 0.10, level 3 gets 0.05, deeper or unknown levels
    get nothing.
    """
    level = metadata.get("section_level")
    if level is None:
        return 0.0
    if level <= 2:
        return 0.10
    if level <= 3:
        return 0.05
    return 0.0

87 

88 

def apply_content_quality_boosting(metadata: dict) -> float:
    """Boost substantial content by word count.

    Adds 0.05 for each length threshold (100, 500 words) the document
    exceeds, so long documents can earn up to 0.10.
    """
    analysis = metadata.get("content_type_analysis", {})
    # ``or 0`` guards against a missing or None word_count.
    words = analysis.get("word_count") or 0
    return 0.05 * sum(words > threshold for threshold in (100, 500))

98 

99 

def apply_conversion_boosting(metadata: dict, query_context: dict) -> float:
    """Boost converted office documents and data-oriented Excel sheets.

    Converted docx/xlsx/pdf files get 0.08; Excel sheets get a further
    0.12 when the query keywords mention data/table-related terms.
    """
    total = 0.0

    office_types = ("docx", "xlsx", "pdf")
    if metadata.get("is_converted") and metadata.get("original_file_type") in office_types:
        total += 0.08

    if metadata.get("is_excel_sheet"):
        # Substring match against the space-joined keyword string.
        keyword_blob = " ".join(query_context.get("keywords", []))
        data_terms = ("data", "table", "sheet", "excel", "csv")
        if any(term in keyword_blob for term in data_terms):
            total += 0.12

    return total

114 

115 

def apply_semantic_boosting(
    metadata: dict, query_context: dict, spacy_analyzer: Any
) -> float:
    """Boost results whose entities/topics are semantically close to the query.

    Requires a precomputed ``spacy_analysis`` in ``query_context``; returns
    0.0 when it is absent. Uses
    ``spacy_analyzer.semantic_similarity_matching`` to score each entity
    against the query and each topic against every main query concept,
    then converts the best similarity into a tiered boost.
    """
    if "spacy_analysis" not in query_context:
        return 0.0
    analysis = query_context["spacy_analysis"]

    def _as_text(item) -> str:
        # Items may be plain strings or dicts carrying a "text" field.
        return item if isinstance(item, str) else item.get("text", str(item))

    total = 0.0

    entities = metadata.get("entities", [])
    if entities and analysis.entities:
        best_entity = max(
            0.0,
            max(
                spacy_analyzer.semantic_similarity_matching(analysis, _as_text(ent))
                for ent in entities
            ),
        )
        if best_entity > 0.6:
            total += 0.15
        elif best_entity > 0.4:
            total += 0.10
        elif best_entity > 0.2:
            total += 0.05

    topics = metadata.get("topics", [])
    if topics and analysis.main_concepts:
        # Score every (topic, concept) pairing and keep the best match.
        best_topic = max(
            0.0,
            max(
                spacy_analyzer.semantic_similarity_matching(
                    analysis, f"{_as_text(topic)} {concept}"
                )
                for topic in topics
                for concept in analysis.main_concepts
            ),
        )
        if best_topic > 0.5:
            total += 0.12
        elif best_topic > 0.3:
            total += 0.08

    return total

159 

160 

def apply_fallback_semantic_boosting(metadata: dict, query_context: dict) -> float:
    """Keyword-overlap boost used when no spaCy analyzer is available.

    Adds 0.10 when any query keyword matches a document entity and 0.08
    when any matches a document topic. Entities/topics may be plain
    strings or dicts carrying a "text" / "entity" / "topic" field.

    Fix: query keywords are now lowercased before comparison. The
    entity/topic texts were already lowercased, so mixed-case keywords
    could previously never match anything.
    """
    boost_factor = 0.0
    # Lowercase string keywords so the comparison is case-insensitive on
    # both sides; non-string keywords pass through (they simply won't match).
    query_keywords = {
        kw.lower() if isinstance(kw, str) else kw
        for kw in query_context.get("keywords", [])
    }

    def _normalized_texts(items, alt_key: str) -> set:
        # Tolerant extraction mirroring the accepted item shapes: plain
        # strings, dicts keyed by "text" or the domain key (alt_key), or
        # the dict's string repr as a last resort. Other item types are
        # silently skipped.
        texts = set()
        for item in items:
            if isinstance(item, str):
                texts.add(item.lower())
            elif isinstance(item, dict):
                if "text" in item:
                    texts.add(str(item["text"]).lower())
                elif alt_key in item:
                    texts.add(str(item[alt_key]).lower())
                else:
                    texts.add(str(item).lower())
        return texts

    entities = metadata.get("entities", [])
    if entities and query_keywords & _normalized_texts(entities, "entity"):
        boost_factor += 0.10

    topics = metadata.get("topics", [])
    if topics and query_keywords & _normalized_texts(topics, "topic"):
        boost_factor += 0.08

    return boost_factor