Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/builder.py: 68% (199 statements)
coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
from __future__ import annotations

from typing import Any


def _create_llm_provider_from_env(logger: Any | None = None) -> Any | None:
    """Create an embeddings provider from qdrant-loader-core settings if available.

    This mirrors the legacy dynamic import behavior and falls back to None when
    unavailable. No exceptions propagate to callers.
    """
    try:
        import os
        from importlib import import_module

        core_settings_mod = import_module("qdrant_loader_core.llm.settings")
        core_factory_mod = import_module("qdrant_loader_core.llm.factory")
        LLMSettings = core_settings_mod.LLMSettings
        create_provider = core_factory_mod.create_provider

        # 1) Try to load LLM settings from the MCP server config file (global.llm)
        llm_cfg: dict | None = None
        try:
            cfg_loader_mod = import_module("qdrant_loader_mcp_server.config_loader")
            load_config = getattr(cfg_loader_mod, "load_config", None)
            if callable(load_config):
                _cfg, effective, _used_file = load_config(None)
                if isinstance(effective, dict):
                    maybe_llm = (effective.get("global") or {}).get("llm")
                    if isinstance(maybe_llm, dict) and maybe_llm:
                        # Make a shallow copy so we can safely overlay defaults/env
                        llm_cfg = dict(maybe_llm)
        except Exception:
            # Non-fatal: fall through to env-only defaults
            llm_cfg = None

        # 2) If no file config present, construct from environment (legacy behavior)
        if not llm_cfg:
            llm_cfg = {
                "provider": (os.getenv("LLM_PROVIDER") or "openai"),
                "base_url": os.getenv("LLM_BASE_URL"),
                "api_key": os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY"),
                "models": {
                    "embeddings": os.getenv("LLM_EMBEDDING_MODEL")
                    or "text-embedding-3-small",
                },
                "tokenizer": os.getenv("LLM_TOKENIZER") or "none",
                "request": {},
                "rate_limits": {},
                "embeddings": {},
            }
        else:
            # Ensure sane defaults and environment overlays for partial file configs
            def _resolve_placeholder(
                value: object,
                fallback_env: str | None = None,
                default: object | None = None,
            ) -> object | None:
                if (
                    isinstance(value, str)
                    and value.startswith("${")
                    and value.endswith("}")
                ):
                    env_name = value[2:-1]
                    return (
                        os.getenv(env_name)
                        or (os.getenv(fallback_env) if fallback_env else None)
                        or default
                    )
                return (
                    value
                    if value not in (None, "")
                    else (os.getenv(fallback_env) if fallback_env else default)
                )

            # Provider and endpoints
            llm_cfg["provider"] = _resolve_placeholder(
                llm_cfg.get("provider"), fallback_env="LLM_PROVIDER", default="openai"
            )
            llm_cfg["base_url"] = _resolve_placeholder(
                llm_cfg.get("base_url"), fallback_env="LLM_BASE_URL", default=None
            )
            llm_cfg["api_key"] = _resolve_placeholder(
                llm_cfg.get("api_key"),
                fallback_env="LLM_API_KEY",
                default=os.getenv("OPENAI_API_KEY"),
            )

            # Models
            models = dict(llm_cfg.get("models") or {})
            models["embeddings"] = _resolve_placeholder(
                models.get("embeddings"),
                fallback_env="LLM_EMBEDDING_MODEL",
                default="text-embedding-3-small",
            )
            if models.get("chat") in (None, "") or (
                isinstance(models.get("chat"), str)
                and str(models.get("chat")).startswith("${")
                and str(models.get("chat")).endswith("}")
            ):
                env_chat = os.getenv("LLM_CHAT_MODEL")
                if env_chat:
                    models["chat"] = env_chat
            llm_cfg["models"] = models

            # Other optional blocks
            llm_cfg["tokenizer"] = _resolve_placeholder(
                llm_cfg.get("tokenizer"), fallback_env="LLM_TOKENIZER", default="none"
            )
            llm_cfg.setdefault("request", {})
            llm_cfg.setdefault("rate_limits", {})
            llm_cfg.setdefault("embeddings", {})

        llm_settings = LLMSettings.from_global_config({"llm": llm_cfg})
        return create_provider(llm_settings)
    except ImportError:
        # Attempt monorepo-relative import by adding sibling core package to sys.path
        try:
            import sys
            from pathlib import Path

            current_file = Path(__file__).resolve()
            for ancestor in current_file.parents:
                core_src = ancestor / "qdrant-loader-core" / "src"
                if core_src.exists():
                    sys.path.append(str(core_src))
                    break
            # Retry import after amending sys.path
            from importlib import import_module as _import_module  # type: ignore

            core_settings_mod = _import_module("qdrant_loader_core.llm.settings")
            core_factory_mod = _import_module("qdrant_loader_core.llm.factory")
            LLMSettings = core_settings_mod.LLMSettings
            create_provider = core_factory_mod.create_provider

            import os as _os

            llm_cfg = {
                "provider": (_os.getenv("LLM_PROVIDER") or "openai"),
                "base_url": _os.getenv("LLM_BASE_URL"),
                "api_key": _os.getenv("LLM_API_KEY") or _os.getenv("OPENAI_API_KEY"),
                "models": {
                    "embeddings": _os.getenv("LLM_EMBEDDING_MODEL")
                    or "text-embedding-3-small",
                },
                "tokenizer": _os.getenv("LLM_TOKENIZER") or "none",
                "request": {},
                "rate_limits": {},
                "embeddings": {},
            }
            llm_settings = LLMSettings.from_global_config({"llm": llm_cfg})
            return create_provider(llm_settings)
        except Exception:
            if logger is not None:
                try:
                    logger.debug(
                        "LLM provider import failed after path adjustment; falling back to None",
                        exc_info=True,
                    )
                except Exception:
                    pass
            return None
    except Exception as e:
        if logger is not None:
            try:
                # Log full stack for unexpected provider errors
                try:
                    logger.exception(
                        "Error creating LLM provider; falling back to None"
                    )
                except Exception:
                    logger.debug(
                        "Error creating LLM provider; falling back to None: %s",
                        e,
                        exc_info=True,
                    )
            except Exception:
                pass
        return None
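

# Hedged usage sketch (illustration, not part of the original module): the
# factory above never raises, so callers can probe it unconditionally and fall
# back when it returns None. The helper name below is hypothetical.
def _example_create_provider() -> Any | None:
    import logging

    provider = _create_llm_provider_from_env(logger=logging.getLogger(__name__))
    if provider is None:
        # qdrant-loader-core is unavailable or misconfigured; callers would
        # fall back to an explicit OpenAI client instead.
        return None
    return provider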


def create_spacy_analyzer(spacy_model: str = "en_core_web_md") -> Any:
    """Create the SpaCyQueryAnalyzer instance."""
    from ...nlp.spacy_analyzer import SpaCyQueryAnalyzer

    return SpaCyQueryAnalyzer(spacy_model=spacy_model)


def create_query_processor(spacy_analyzer: Any) -> Any:
    """Create the QueryProcessor bound to the given analyzer."""
    from ...components import QueryProcessor

    return QueryProcessor(spacy_analyzer)


def create_vector_search_service(
    *,
    qdrant_client: Any,
    collection_name: str,
    min_score: float,
    search_config: Any | None,
    embeddings_provider: Any | None,
    openai_client: Any,
) -> Any:
    """Create VectorSearchService with optional cache/search tuning from config."""
    from ...components import VectorSearchService

    if search_config:
        return VectorSearchService(
            qdrant_client=qdrant_client,
            collection_name=collection_name,
            min_score=min_score,
            cache_enabled=search_config.cache_enabled,
            cache_ttl=search_config.cache_ttl,
            cache_max_size=search_config.cache_max_size,
            hnsw_ef=search_config.hnsw_ef,
            use_exact_search=search_config.use_exact_search,
            embeddings_provider=embeddings_provider,
            openai_client=openai_client,
        )
    return VectorSearchService(
        qdrant_client=qdrant_client,
        collection_name=collection_name,
        min_score=min_score,
        embeddings_provider=embeddings_provider,
        openai_client=openai_client,
    )
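

# Hedged sketch (hypothetical helper and values): any object exposing the five
# tuning attributes read above can serve as search_config; SimpleNamespace is
# enough for tests and experiments.
def _example_vector_service(qdrant_client: Any) -> Any:
    from types import SimpleNamespace

    tuning = SimpleNamespace(
        cache_enabled=True,
        cache_ttl=300,
        cache_max_size=1024,
        hnsw_ef=128,
        use_exact_search=False,
    )
    return create_vector_search_service(
        qdrant_client=qdrant_client,
        collection_name="documents",
        min_score=0.3,
        search_config=tuning,  # pass None to fall back to service defaults
        embeddings_provider=None,
        openai_client=None,
    )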


def create_keyword_search_service(*, qdrant_client: Any, collection_name: str) -> Any:
    """Create KeywordSearchService."""
    from ...components import KeywordSearchService

    return KeywordSearchService(
        qdrant_client=qdrant_client, collection_name=collection_name
    )


def create_result_combiner(
    *,
    vector_weight: float,
    keyword_weight: float,
    metadata_weight: float,
    min_score: float,
    spacy_analyzer: Any,
) -> Any:
    """Create ResultCombiner with provided weights and analyzer."""
    from ...components import ResultCombiner

    return ResultCombiner(
        vector_weight=vector_weight,
        keyword_weight=keyword_weight,
        metadata_weight=metadata_weight,
        min_score=min_score,
        spacy_analyzer=spacy_analyzer,
    )
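

# Hedged sketch (hypothetical weights): the combiner blends three score
# signals, so the weights are typically chosen to sum to 1.0, here favoring
# dense vector similarity over keyword and metadata matches.
def _example_result_combiner() -> Any:
    return create_result_combiner(
        vector_weight=0.6,
        keyword_weight=0.3,
        metadata_weight=0.1,
        min_score=0.3,
        spacy_analyzer=create_spacy_analyzer(),
    )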


def create_intent_components(spacy_analyzer: Any, knowledge_graph: Any, enable: bool):
    """Create intent classifier and adaptive strategy, or (None, None) if disabled."""
    if not enable:
        return None, None
    from ...enhanced.intent_classifier import AdaptiveSearchStrategy, IntentClassifier

    intent_classifier = IntentClassifier(spacy_analyzer)
    adaptive_strategy = AdaptiveSearchStrategy(knowledge_graph)
    return intent_classifier, adaptive_strategy


def create_topic_chain_generator(spacy_analyzer: Any, knowledge_graph: Any) -> Any:
    """Create TopicSearchChainGenerator."""
    from ...enhanced.topic_search_chain import TopicSearchChainGenerator

    return TopicSearchChainGenerator(spacy_analyzer, knowledge_graph)


def create_faceted_engine() -> Any:
    """Create FacetedSearchEngine."""
    from ...enhanced.faceted_search import FacetedSearchEngine

    return FacetedSearchEngine()


def create_cdi_engine(
    *,
    spacy_analyzer: Any,
    knowledge_graph: Any,
    qdrant_client: Any,
    openai_client: Any,
    collection_name: str,
    conflict_settings: dict | None,
) -> Any:
    """Create CrossDocumentIntelligenceEngine with provided settings."""
    from ...enhanced.cross_document_intelligence import CrossDocumentIntelligenceEngine

    return CrossDocumentIntelligenceEngine(
        spacy_analyzer,
        knowledge_graph,
        qdrant_client,
        openai_client,
        collection_name,
        conflict_settings=conflict_settings,
    )


def build_conflict_settings(search_config: Any | None) -> dict | None:
    """Construct conflict detection settings from ``search_config`` safely."""
    if search_config is None:
        return None
    try:
        return {
            "conflict_limit_default": getattr(
                search_config, "conflict_limit_default", 10
            ),
            "conflict_max_pairs_total": getattr(
                search_config, "conflict_max_pairs_total", 24
            ),
            "conflict_tier_caps": getattr(
                search_config,
                "conflict_tier_caps",
                {"primary": 12, "secondary": 8, "tertiary": 4, "fallback": 0},
            ),
            "conflict_use_llm": getattr(search_config, "conflict_use_llm", True),
            "conflict_max_llm_pairs": getattr(
                search_config, "conflict_max_llm_pairs", 2
            ),
            "conflict_llm_model": getattr(
                search_config, "conflict_llm_model", "gpt-4o-mini"
            ),
            "conflict_llm_timeout_s": getattr(
                search_config, "conflict_llm_timeout_s", 12.0
            ),
            "conflict_overall_timeout_s": getattr(
                search_config, "conflict_overall_timeout_s", 9.0
            ),
            "conflict_text_window_chars": getattr(
                search_config, "conflict_text_window_chars", 2000
            ),
            "conflict_embeddings_timeout_s": getattr(
                search_config, "conflict_embeddings_timeout_s", 2.0
            ),
            "conflict_embeddings_max_concurrency": getattr(
                search_config, "conflict_embeddings_max_concurrency", 5
            ),
        }
    except Exception:
        return None
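

# Hedged sketch (hypothetical config object): because every field above is
# read with getattr(..., default), a partial config still yields a complete
# settings dict with the remaining defaults filled in.
def _example_conflict_settings() -> None:
    from types import SimpleNamespace

    cfg = SimpleNamespace(conflict_use_llm=False)  # only one field overridden
    settings = build_conflict_settings(cfg)
    assert settings is not None
    assert settings["conflict_use_llm"] is False  # explicit override kept
    assert settings["conflict_max_pairs_total"] == 24  # default preserved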


def initialize_engine_components(
    engine_self: Any,
    *,
    qdrant_client: Any,
    openai_client: Any,
    collection_name: str,
    vector_weight: float,
    keyword_weight: float,
    metadata_weight: float,
    min_score: float,
    knowledge_graph: Any,
    enable_intent_adaptation: bool,
    search_config: Any | None,
    processing_config: Any | None,
) -> None:
    """Initialize all engine components and wire optional processing hooks."""
    # Analyzer and query processor
    spacy_analyzer = create_spacy_analyzer(spacy_model="en_core_web_md")
    query_processor = create_query_processor(spacy_analyzer)

    # Embeddings provider and search services
    # Create shared LLM provider if available from core settings
    llm_provider = _create_llm_provider_from_env(logger=engine_self.logger)
    embeddings_provider = llm_provider
    # If an explicit OpenAI client is provided, prefer it over any auto-created provider
    # so tests and engines that mock the client behave deterministically.
    if openai_client is not None:
        embeddings_provider = None
    vector_search_service = create_vector_search_service(
        qdrant_client=qdrant_client,
        collection_name=collection_name,
        min_score=min_score,
        search_config=search_config,
        embeddings_provider=embeddings_provider,
        openai_client=openai_client,
    )
    keyword_search_service = create_keyword_search_service(
        qdrant_client=qdrant_client, collection_name=collection_name
    )
    result_combiner = create_result_combiner(
        vector_weight=vector_weight,
        keyword_weight=keyword_weight,
        metadata_weight=metadata_weight,
        min_score=min_score,
        spacy_analyzer=spacy_analyzer,
    )

    # Assign to engine
    engine_self.spacy_analyzer = spacy_analyzer
    engine_self.query_processor = query_processor
    engine_self.vector_search_service = vector_search_service
    engine_self.keyword_search_service = keyword_search_service
    engine_self.result_combiner = result_combiner

    # Metadata extractor
    from ...components import MetadataExtractor

    engine_self.metadata_extractor = MetadataExtractor()

    # Pipeline and adapters
    from ..adapters import (
        KeywordSearcherAdapter,
        ResultCombinerAdapter,
        VectorSearcherAdapter,
    )
    from ..pipeline import HybridPipeline

    engine_self.hybrid_pipeline = HybridPipeline(
        vector_searcher=VectorSearcherAdapter(vector_search_service),
        keyword_searcher=KeywordSearcherAdapter(keyword_search_service),
        result_combiner=ResultCombinerAdapter(result_combiner),
        reranker=None,
        booster=None,
        normalizer=None,
        deduplicator=None,
    )

    # Orchestration utilities
    from ..orchestration import HybridOrchestrator, QueryPlanner

    engine_self._planner = QueryPlanner()
    engine_self._orchestrator = HybridOrchestrator()

    # Optional processing toggles
    from ..components.reranking import HybridReranker

    engine_self.processing_config = processing_config
    if engine_self.hybrid_pipeline is not None and processing_config is not None:
        if getattr(processing_config, "enable_reranker", False):
            try:
                engine_self.hybrid_pipeline.reranker = HybridReranker()
            except Exception:
                engine_self.hybrid_pipeline.reranker = None
        if getattr(processing_config, "enable_booster", False):
            from ..components.boosting import ResultBooster

            engine_self.hybrid_pipeline.booster = ResultBooster()
        # Backward-compat: support both enable_normalizer and enable_normalization
        if getattr(processing_config, "enable_normalizer", False) or getattr(
            processing_config, "enable_normalization", False
        ):
            from ..components.normalization import ScoreNormalizer

            engine_self.hybrid_pipeline.normalizer = ScoreNormalizer()
        # Backward-compat: support both enable_deduplicator and enable_deduplication
        if getattr(processing_config, "enable_deduplicator", False) or getattr(
            processing_config, "enable_deduplication", False
        ):
            from ..components.deduplication import ResultDeduplicator

            engine_self.hybrid_pipeline.deduplicator = ResultDeduplicator()

    # Enhanced search components
    engine_self.enable_intent_adaptation = enable_intent_adaptation
    engine_self.knowledge_graph = knowledge_graph
    engine_self.intent_classifier, engine_self.adaptive_strategy = (
        create_intent_components(
            spacy_analyzer, knowledge_graph, enable_intent_adaptation
        )
    )
    if engine_self.enable_intent_adaptation:
        try:
            engine_self.logger.info("Intent-aware adaptive search ENABLED")
        except Exception:
            pass
    else:
        try:
            engine_self.logger.info("Intent-aware adaptive search DISABLED")
        except Exception:
            pass

    # Topic chain generator
    engine_self.topic_chain_generator = create_topic_chain_generator(
        spacy_analyzer, knowledge_graph
    )
    engine_self._topic_chains_initialized = False
    try:
        engine_self.logger.info("Topic-driven search chaining ENABLED")
    except Exception:
        pass

    # Faceted search
    engine_self.faceted_search_engine = create_faceted_engine()
    try:
        engine_self.logger.info("Dynamic faceted search interface ENABLED")
    except Exception:
        pass

    # Cross-document intelligence
    conflict_settings = build_conflict_settings(search_config)
    engine_self.cross_document_engine = create_cdi_engine(
        spacy_analyzer=spacy_analyzer,
        knowledge_graph=knowledge_graph,
        qdrant_client=qdrant_client,
        openai_client=openai_client,
        collection_name=collection_name,
        conflict_settings=conflict_settings,
    )
    # Attach provider for chat operations when available
    try:
        engine_self.cross_document_engine.llm_provider = llm_provider
        # Also link detector back to engine to let llm_validation access provider
        try:
            detector = engine_self.cross_document_engine.conflict_detector
            detector.engine = engine_self
        except Exception:
            pass
    except Exception:
        pass
    try:
        engine_self.logger.info("Cross-document intelligence ENABLED")
    except Exception:
        pass
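

# Hedged end-to-end sketch (hypothetical host object): the initializer only
# assigns attributes onto engine_self, so any object carrying a `logger`
# attribute can serve as the engine shell for experimentation.
def _example_initialize(qdrant_client: Any) -> Any:
    import logging
    from types import SimpleNamespace

    engine = SimpleNamespace(logger=logging.getLogger("hybrid"))
    initialize_engine_components(
        engine,
        qdrant_client=qdrant_client,
        openai_client=None,  # None defers embeddings to the auto-created provider
        collection_name="documents",
        vector_weight=0.6,
        keyword_weight=0.3,
        metadata_weight=0.1,
        min_score=0.3,
        knowledge_graph=None,
        enable_intent_adaptation=True,
        search_config=None,
        processing_config=None,
    )
    # engine now exposes hybrid_pipeline, cross_document_engine, etc.
    return engine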