Coverage for src/qdrant_loader_mcp_server/search/components/combining/scoring_boosts.py: 91%
139 statements
coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

from __future__ import annotations

from typing import Any


def boost_score_with_metadata(
    base_score: float, metadata: dict, query_context: dict, *, spacy_analyzer=None
) -> float:
    """Boost a base relevance score using document metadata and query context.

    Individual boost components are summed, capped at 0.5, and applied as a
    multiplier on top of the base score.
    """
    boosted_score = base_score
    boost_factor = 0.0

    search_intent = query_context.get("search_intent")
    adaptive_config = query_context.get("adaptive_config")

    if search_intent and adaptive_config:
        boost_factor += apply_intent_boosting(
            metadata, search_intent, adaptive_config, query_context
        )

    boost_factor += apply_content_type_boosting(metadata, query_context)
    boost_factor += apply_section_level_boosting(metadata)
    boost_factor += apply_content_quality_boosting(metadata)
    boost_factor += apply_conversion_boosting(metadata, query_context)

    if spacy_analyzer:
        boost_factor += apply_semantic_boosting(metadata, query_context, spacy_analyzer)
    else:
        boost_factor += apply_fallback_semantic_boosting(metadata, query_context)

    boost_factor = min(boost_factor, 0.5)
    return boosted_score * (1 + boost_factor)
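
# Example of the cap in boost_score_with_metadata: if the individual boosts were to sum
# to 0.7, min(boost_factor, 0.5) clamps the factor to 0.5, so a base score of 0.8 would
# become 0.8 * 1.5 = 1.2.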


def apply_intent_boosting(
    metadata: dict, search_intent: Any, adaptive_config: Any, query_context: dict
) -> float:
    boost_factor = 0.0
    ranking_boosts = adaptive_config.ranking_boosts
    source_type_preferences = adaptive_config.source_type_preferences

    source_type = metadata.get("source_type", "")
    if source_type in source_type_preferences:
        boost_factor += (source_type_preferences[source_type] - 1.0) * 0.2

    for boost_key, boost_value in ranking_boosts.items():
        if boost_key == "section_type" and isinstance(boost_value, dict):
            section_type = metadata.get("section_type", "")
            if section_type in boost_value:
                boost_factor += (boost_value[section_type] - 1.0) * 0.15
        elif boost_key == "source_type" and isinstance(boost_value, dict):
            if source_type in boost_value:
                boost_factor += (boost_value[source_type] - 1.0) * 0.15
        elif boost_key in metadata and metadata[boost_key]:
            if isinstance(boost_value, int | float):
                boost_factor += (boost_value - 1.0) * 0.1

    boost_factor += search_intent.confidence * 0.05
    return boost_factor
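
# Illustrative only: one shape of adaptive_config that apply_intent_boosting can consume
# (the real config object is defined elsewhere in the package). Multipliers above 1.0
# become positive boosts, e.g. a source_type preference of 1.5 adds (1.5 - 1.0) * 0.2 = 0.10
# and a section_type boost of 1.4 adds (1.4 - 1.0) * 0.15 = 0.06:
#
#     ranking_boosts = {"section_type": {"h1": 1.4}, "has_code_blocks": 1.3}
#     source_type_preferences = {"confluence": 1.5, "git": 1.2}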


def apply_content_type_boosting(metadata: dict, query_context: dict) -> float:
    boost_factor = 0.0
    content_analysis = metadata.get("content_type_analysis", {})

    if query_context.get("prefers_code") and content_analysis.get("has_code_blocks"):
        boost_factor += 0.15
    if query_context.get("prefers_tables") and content_analysis.get("has_tables"):
        boost_factor += 0.12
    if query_context.get("prefers_images") and content_analysis.get("has_images"):
        boost_factor += 0.10
    if query_context.get("prefers_docs") and not content_analysis.get(
        "has_code_blocks"
    ):
        boost_factor += 0.08
    return boost_factor


def apply_section_level_boosting(metadata: dict) -> float:
    boost_factor = 0.0
    section_level = metadata.get("section_level")
    if section_level is not None:
        if section_level <= 2:
            boost_factor += 0.10
        elif section_level <= 3:
            boost_factor += 0.05
    return boost_factor


def apply_content_quality_boosting(metadata: dict) -> float:
    boost_factor = 0.0
    content_analysis = metadata.get("content_type_analysis", {})
    word_count = content_analysis.get("word_count") or 0
    if word_count > 100:
        boost_factor += 0.05
    if word_count > 500:
        boost_factor += 0.05
    return boost_factor


def apply_conversion_boosting(metadata: dict, query_context: dict) -> float:
    boost_factor = 0.0
    if metadata.get("is_converted") and metadata.get("original_file_type") in [
        "docx",
        "xlsx",
        "pdf",
    ]:
        boost_factor += 0.08
    if metadata.get("is_excel_sheet") and any(
        term in " ".join(query_context.get("keywords", []))
        for term in ["data", "table", "sheet", "excel", "csv"]
    ):
        boost_factor += 0.12
    return boost_factor


def apply_semantic_boosting(
    metadata: dict, query_context: dict, spacy_analyzer: Any
) -> float:
    boost_factor = 0.0
    if "spacy_analysis" not in query_context:
        return boost_factor
    spacy_analysis = query_context["spacy_analysis"]

    entities = metadata.get("entities", [])
    if entities and spacy_analysis.entities:
        max_entity_similarity = 0.0
        for entity in entities:
            entity_text = (
                entity if isinstance(entity, str) else entity.get("text", str(entity))
            )
            similarity = spacy_analyzer.semantic_similarity_matching(
                spacy_analysis, entity_text
            )
            max_entity_similarity = max(max_entity_similarity, similarity)
        if max_entity_similarity > 0.6:
            boost_factor += 0.15
        elif max_entity_similarity > 0.4:
            boost_factor += 0.10
        elif max_entity_similarity > 0.2:
            boost_factor += 0.05

    topics = metadata.get("topics", [])
    if topics and spacy_analysis.main_concepts:
        max_topic_similarity = 0.0
        for topic in topics:
            topic_text = (
                topic if isinstance(topic, str) else topic.get("text", str(topic))
            )
            for concept in spacy_analysis.main_concepts:
                similarity = spacy_analyzer.semantic_similarity_matching(
                    spacy_analysis, f"{topic_text} {concept}"
                )
                max_topic_similarity = max(max_topic_similarity, similarity)
        if max_topic_similarity > 0.5:
            boost_factor += 0.12
        elif max_topic_similarity > 0.3:
            boost_factor += 0.08
    return boost_factor
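
# Note: apply_semantic_boosting relies on the caller-provided spacy_analyzer exposing
# semantic_similarity_matching(spacy_analysis, text), whose return value is compared
# against the 0.2-0.6 thresholds above; query_context["spacy_analysis"] is the
# pre-computed spaCy analysis of the query itself.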


def apply_fallback_semantic_boosting(metadata: dict, query_context: dict) -> float:
    boost_factor = 0.0
    entities = metadata.get("entities", [])
    if entities:
        query_keywords = set(query_context.get("keywords", []))
        entity_texts = set()
        for entity in entities:
            if isinstance(entity, str):
                entity_texts.add(entity.lower())
            elif isinstance(entity, dict):
                if "text" in entity:
                    entity_texts.add(str(entity["text"]).lower())
                elif "entity" in entity:
                    entity_texts.add(str(entity["entity"]).lower())
                else:
                    entity_texts.add(str(entity).lower())
        if query_keywords.intersection(entity_texts):
            boost_factor += 0.10

    topics = metadata.get("topics", [])
    if topics:
        query_keywords = set(query_context.get("keywords", []))
        topic_texts = set()
        for topic in topics:
            if isinstance(topic, str):
                topic_texts.add(topic.lower())
            elif isinstance(topic, dict):
                if "text" in topic:
                    topic_texts.add(str(topic["text"]).lower())
                elif "topic" in topic:
                    topic_texts.add(str(topic["topic"]).lower())
                else:
                    topic_texts.add(str(topic).lower())
        if query_keywords.intersection(topic_texts):
            boost_factor += 0.08
    return boost_factor
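

# Minimal usage sketch (illustrative only, not part of the original module): with no
# search intent, adaptive config, or spaCy analyzer supplied, only the content-type,
# section-level, quality, conversion, and fallback keyword boosts are evaluated.
if __name__ == "__main__":
    example_metadata = {
        "content_type_analysis": {"has_code_blocks": True, "word_count": 250},
        "section_level": 2,
        "entities": ["api"],
    }
    example_context = {"prefers_code": True, "keywords": ["python", "api"]}
    # Boosts: 0.15 (code blocks) + 0.10 (section level <= 2) + 0.05 (word count > 100)
    # + 0.10 (entity/keyword overlap) = 0.40, so 0.75 * 1.40 = 1.05.
    print(boost_score_with_metadata(0.75, example_metadata, example_context))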