# NOTE: this file was recovered from a coverage.py v7.13.5 HTML report
# (config_loader.py — 83% of 145 statements covered at extraction time).
"""File-based configuration loader for the MCP server.

Precedence:
- CLI --config
- MCP_CONFIG environment variable
- ./config.yaml
- ~/.config/qdrant-loader/config.yaml
- /etc/qdrant-loader/config.yaml

Environment variables overlay values from file. CLI flags override env.
"""
13from __future__ import annotations
15import os
16import re
17from pathlib import Path
18from typing import Any
20import yaml
22from qdrant_loader_mcp_server.config_reranking import MCPReranking
24from .config import Config, OpenAIConfig, QdrantConfig, SearchConfig
25from .utils.logging import LoggingConfig
27logger = LoggingConfig.get_logger(__name__)
30def _first_existing(paths: list[Path]) -> Path | None:
31 for p in paths:
32 if p and p.exists() and p.is_file():
33 return p
34 return None
37_ENV_VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
40def _substitute_env_vars(data: Any) -> Any:
41 """Substitute ${VAR_NAME} patterns with environment variable values.
43 Only standard env var names are matched (letters, digits, underscores,
44 starting with a letter or underscore). Bash-style defaults like
45 ``${VAR:-fallback}`` are NOT supported — use .env files instead.
47 Processes strings, dicts, and lists recursively. Raises ValueError
48 if a referenced variable is not set.
49 """
50 if isinstance(data, str):
52 def replace(m: re.Match) -> str:
53 val = os.getenv(m.group(1))
54 if val is None:
55 raise ValueError(
56 f"Environment variable '{m.group(1)}' referenced in config is not set"
57 )
58 return val
60 return _ENV_VAR_PATTERN.sub(replace, data)
61 if isinstance(data, dict):
62 return {k: _substitute_env_vars(v) for k, v in data.items()}
63 if isinstance(data, list):
64 return [_substitute_env_vars(i) for i in data]
65 return data
68def resolve_config_path(cli_config: Path | None) -> Path | None:
69 if cli_config:
70 return cli_config
71 env_cfg = os.getenv("MCP_CONFIG")
72 if env_cfg:
73 p = Path(env_cfg).expanduser()
74 if p.exists():
75 return p
76 candidates = [
77 Path.cwd() / "config.yaml",
78 Path.home() / ".config" / "qdrant-loader" / "config.yaml",
79 Path("/etc/qdrant-loader/config.yaml"),
80 ]
81 return _first_existing(candidates)
84def _get_section(config_data: dict[str, Any], name: str) -> dict[str, Any]:
85 # Only support "global" root going forward
86 return config_data.get(name, {}) or {}
89def _overlay_env_llm(llm: dict[str, Any]) -> None:
90 # LLM env overrides
91 if os.getenv("LLM_PROVIDER"):
92 llm.setdefault("provider", os.getenv("LLM_PROVIDER"))
93 llm["provider"] = os.getenv("LLM_PROVIDER")
94 if os.getenv("LLM_BASE_URL"):
95 llm["base_url"] = os.getenv("LLM_BASE_URL")
96 if os.getenv("LLM_API_KEY"):
97 llm["api_key"] = os.getenv("LLM_API_KEY")
98 # models
99 models = dict(llm.get("models") or {})
100 if os.getenv("LLM_EMBEDDING_MODEL"):
101 models["embeddings"] = os.getenv("LLM_EMBEDDING_MODEL")
102 if os.getenv("LLM_CHAT_MODEL"):
103 models["chat"] = os.getenv("LLM_CHAT_MODEL")
104 if models:
105 llm["models"] = models
108def _overlay_env_qdrant(qdrant: dict[str, Any]) -> None:
109 # Override Qdrant settings with environment variables (unconditional).
110 # Note: differs from qdrant_loader's _auto_resolve_env_vars() which only
111 # applies env vars when the config value equals the default.
112 # Priority: environment variable > config file value > QdrantConfig default.
113 if os.getenv("QDRANT_URL"):
114 qdrant["url"] = os.getenv("QDRANT_URL")
115 if os.getenv("QDRANT_API_KEY"):
116 qdrant["api_key"] = os.getenv("QDRANT_API_KEY")
117 if os.getenv("QDRANT_COLLECTION_NAME"):
118 qdrant["collection_name"] = os.getenv("QDRANT_COLLECTION_NAME")
121def _overlay_env_search(search: dict[str, Any]) -> None:
122 # Only a subset for Phase 0; SearchConfig has its own env fallbacks as well
123 if os.getenv("SEARCH_CONFLICT_USE_LLM"):
124 raw = os.getenv("SEARCH_CONFLICT_USE_LLM", "true").strip().lower()
125 search["conflict_use_llm"] = raw in {"1", "true", "t", "yes", "y", "on"}
126 if os.getenv("SEARCH_CONFLICT_LLM_MODEL"):
127 search["conflict_llm_model"] = os.getenv("SEARCH_CONFLICT_LLM_MODEL")
def load_file_config(path: Path) -> dict[str, Any]:
    """Parse the YAML file at *path* and expand ``${VAR}`` env references.

    An empty or all-comments file yields {} rather than None.
    """
    text = path.read_text(encoding="utf-8")
    parsed = yaml.safe_load(text) or {}
    return _substitute_env_vars(parsed)
def build_config_from_dict(config_data: dict[str, Any]) -> Config:
    """Build a ``Config`` from a parsed (and env-substituted) config dict.

    Handles: migration of the legacy ``global.embedding`` block into
    ``global.llm``, extraction of ``vector_size``, environment overlays
    (env > file value), and derivation of an interim ``OpenAIConfig``.

    Raises:
        ValueError: if ``global.reranking`` is present but not a mapping.
    """
    global_data = _get_section(config_data, "global")
    # Shallow copies so overlays below never mutate the caller's dicts.
    llm = dict(global_data.get("llm") or {})
    qdrant = dict(global_data.get("qdrant") or {})
    search = dict(config_data.get("search") or {})

    # Deprecation: detect legacy blocks and log a warning once
    legacy_embedding = global_data.get("embedding")
    legacy_markit = (
        (config_data.get("file_conversion") or {}).get("markitdown")
        if isinstance(config_data.get("file_conversion"), dict)
        else None
    )
    try:
        if legacy_embedding or legacy_markit:
            logger.warning(
                "Legacy configuration fields detected; please migrate to global.llm",
                legacy_embedding=bool(legacy_embedding),
                legacy_markitdown=bool(legacy_markit),
            )
    except Exception:
        # Logging must never break config loading.
        pass

    # Migrate legacy global.embedding → global.llm (backward compat).
    # Explicit global.llm values always win over migrated legacy values.
    if legacy_embedding:
        if not llm.get("api_key") and legacy_embedding.get("api_key"):
            llm["api_key"] = legacy_embedding["api_key"]
        models_cfg = dict(llm.get("models") or {})
        if not models_cfg.get("embeddings") and legacy_embedding.get("model"):
            models_cfg["embeddings"] = legacy_embedding["model"]
        llm["models"] = models_cfg
        # Migrate vector_size for use in OpenAIConfig
        if not llm.get("vector_size") and isinstance(
            legacy_embedding.get("vector_size"), int
        ):
            llm["vector_size"] = legacy_embedding["vector_size"]

    # Extract vector_size from new format: global.llm.embeddings.vector_size
    # (different from models.embeddings which is a model name string)
    emb_cfg = llm.get("embeddings") or {}
    if isinstance(emb_cfg, dict) and isinstance(emb_cfg.get("vector_size"), int):
        if not llm.get("vector_size"):
            llm["vector_size"] = emb_cfg["vector_size"]

    # Apply environment overrides (env > file value)
    _overlay_env_llm(llm)
    _overlay_env_qdrant(qdrant)
    _overlay_env_search(search)

    # Derive OpenAIConfig for now (Phase 0); will be replaced by core LLM provider later
    api_key = llm.get("api_key") or os.getenv("OPENAI_API_KEY")
    models = dict(llm.get("models") or {})
    embedding_model = (
        models.get("embeddings")
        or os.getenv("LLM_EMBEDDING_MODEL")
        or "text-embedding-3-small"
    )
    chat_model = models.get("chat") or os.getenv("LLM_CHAT_MODEL") or "gpt-3.5-turbo"

    # Build reranking config from global section if present
    reranking_cfg = None
    if "reranking" in global_data:
        if not isinstance(global_data["reranking"], dict):
            raise ValueError("global.reranking must be a mapping")
        reranking_cfg = MCPReranking(**global_data["reranking"])

    vector_size = llm.get("vector_size")  # int | None
    cfg = Config(
        qdrant=QdrantConfig(**qdrant) if qdrant else QdrantConfig(),
        openai=OpenAIConfig(
            api_key=api_key,
            model=embedding_model,
            chat_model=chat_model,
            vector_size=vector_size,
        ),
        search=SearchConfig(**search) if search else SearchConfig(),
        reranking=reranking_cfg if reranking_cfg is not None else MCPReranking(),
    )
    return cfg
def redact_effective_config(effective: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of *effective* with secret values masked.

    Non-empty string values stored under the keys "api_key" or
    "Authorization" (at any nesting depth, including inside lists) become
    "***REDACTED***"; every other value is copied through unchanged. The
    input is never mutated.
    """
    secret_keys = {"api_key", "Authorization"}

    def _scrub(node: Any) -> Any:
        if isinstance(node, list):
            return [_scrub(item) for item in node]
        if not isinstance(node, dict):
            return node
        result: dict[str, Any] = {}
        for key, value in node.items():
            if key in secret_keys and isinstance(value, str) and value:
                result[key] = "***REDACTED***"
            else:
                result[key] = _scrub(value)
        return result

    return _scrub(effective)
def load_config(cli_config: Path | None) -> tuple[Config, dict[str, Any], bool]:
    """Load effective configuration.

    Resolves a config file via :func:`resolve_config_path`; on success,
    builds the Config from the file (with env overlays). Any failure while
    reading/parsing the file — or no file at all — falls back to legacy
    env-only mode with a deprecation warning.

    Returns (config_obj, effective_dict, used_file: bool). The effective
    dict is intended for display (e.g. --print-config) and contains raw
    section data plus derived OpenAI values; pass it through
    :func:`redact_effective_config` before printing.
    """
    config_path = resolve_config_path(cli_config)
    used_file = False
    if config_path:
        try:
            data = load_file_config(config_path)
            cfg = build_config_from_dict(data)
            used_file = True
            # Effective dict for printing (merge file data with derived)
            effective = {
                "global": {
                    "llm": data.get("global", {}).get("llm"),
                    "qdrant": data.get("global", {}).get("qdrant"),
                },
                "search": data.get("search"),
                "derived": {
                    "openai": {
                        "model": cfg.openai.model,
                        "chat_model": cfg.openai.chat_model,
                        "api_key": cfg.openai.api_key,
                    }
                },
            }
            return cfg, effective, used_file
        except Exception as e:
            # Broad on purpose: any file/parse/validation error degrades to
            # env-only mode rather than crashing server startup.
            logger.warning(
                "Failed to load config file; falling back to env-only", error=str(e)
            )

    # Fallback to legacy env-only mode (deprecated)
    cfg = Config()
    effective = {
        "global": {
            "llm": {
                "provider": os.getenv("LLM_PROVIDER"),
                "base_url": os.getenv("LLM_BASE_URL"),
                "api_key": os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY"),
                "models": {
                    "embeddings": os.getenv("LLM_EMBEDDING_MODEL"),
                    "chat": os.getenv("LLM_CHAT_MODEL"),
                },
            },
            "qdrant": {
                "url": os.getenv("QDRANT_URL"),
                "api_key": os.getenv("QDRANT_API_KEY"),
                "collection_name": os.getenv("QDRANT_COLLECTION_NAME"),
            },
        },
        "search": None,
        "derived": {
            "openai": {
                "model": cfg.openai.model,
                "chat_model": cfg.openai.chat_model,
                "api_key": cfg.openai.api_key,
            }
        },
        "warning": "Using legacy env-only mode; providing a config file is recommended and will be required in a future release.",
    }
    try:
        logger.warning(
            "Running in legacy env-only mode; provide --config or MCP_CONFIG file",
        )
    except Exception:
        # Logging must never break config loading.
        pass
    # used_file is False on this path (file missing or failed to load).
    return cfg, effective, used_file