Coverage for src / qdrant_loader_mcp_server / config_loader.py: 83%

145 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:41 +0000

1"""File-based configuration loader for the MCP server. 

2 

3Precedence: 

4- CLI --config 

5- MCP_CONFIG environment variable 

6- ./config.yaml 

7- ~/.config/qdrant-loader/config.yaml 

8- /etc/qdrant-loader/config.yaml 

9 

10Environment variables overlay values from file. CLI flags override env. 

11""" 

12 

13from __future__ import annotations 

14 

15import os 

16import re 

17from pathlib import Path 

18from typing import Any 

19 

20import yaml 

21 

22from qdrant_loader_mcp_server.config_reranking import MCPReranking 

23 

24from .config import Config, OpenAIConfig, QdrantConfig, SearchConfig 

25from .utils.logging import LoggingConfig 

26 

27logger = LoggingConfig.get_logger(__name__) 

28 

29 

30def _first_existing(paths: list[Path]) -> Path | None: 

31 for p in paths: 

32 if p and p.exists() and p.is_file(): 

33 return p 

34 return None 

35 

36 

37_ENV_VAR_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") 

38 

39 

40def _substitute_env_vars(data: Any) -> Any: 

41 """Substitute ${VAR_NAME} patterns with environment variable values. 

42 

43 Only standard env var names are matched (letters, digits, underscores, 

44 starting with a letter or underscore). Bash-style defaults like 

45 ``${VAR:-fallback}`` are NOT supported — use .env files instead. 

46 

47 Processes strings, dicts, and lists recursively. Raises ValueError 

48 if a referenced variable is not set. 

49 """ 

50 if isinstance(data, str): 

51 

52 def replace(m: re.Match) -> str: 

53 val = os.getenv(m.group(1)) 

54 if val is None: 

55 raise ValueError( 

56 f"Environment variable '{m.group(1)}' referenced in config is not set" 

57 ) 

58 return val 

59 

60 return _ENV_VAR_PATTERN.sub(replace, data) 

61 if isinstance(data, dict): 

62 return {k: _substitute_env_vars(v) for k, v in data.items()} 

63 if isinstance(data, list): 

64 return [_substitute_env_vars(i) for i in data] 

65 return data 

66 

67 

68def resolve_config_path(cli_config: Path | None) -> Path | None: 

69 if cli_config: 

70 return cli_config 

71 env_cfg = os.getenv("MCP_CONFIG") 

72 if env_cfg: 

73 p = Path(env_cfg).expanduser() 

74 if p.exists(): 

75 return p 

76 candidates = [ 

77 Path.cwd() / "config.yaml", 

78 Path.home() / ".config" / "qdrant-loader" / "config.yaml", 

79 Path("/etc/qdrant-loader/config.yaml"), 

80 ] 

81 return _first_existing(candidates) 

82 

83 

84def _get_section(config_data: dict[str, Any], name: str) -> dict[str, Any]: 

85 # Only support "global" root going forward 

86 return config_data.get(name, {}) or {} 

87 

88 

89def _overlay_env_llm(llm: dict[str, Any]) -> None: 

90 # LLM env overrides 

91 if os.getenv("LLM_PROVIDER"): 

92 llm.setdefault("provider", os.getenv("LLM_PROVIDER")) 

93 llm["provider"] = os.getenv("LLM_PROVIDER") 

94 if os.getenv("LLM_BASE_URL"): 

95 llm["base_url"] = os.getenv("LLM_BASE_URL") 

96 if os.getenv("LLM_API_KEY"): 

97 llm["api_key"] = os.getenv("LLM_API_KEY") 

98 # models 

99 models = dict(llm.get("models") or {}) 

100 if os.getenv("LLM_EMBEDDING_MODEL"): 

101 models["embeddings"] = os.getenv("LLM_EMBEDDING_MODEL") 

102 if os.getenv("LLM_CHAT_MODEL"): 

103 models["chat"] = os.getenv("LLM_CHAT_MODEL") 

104 if models: 

105 llm["models"] = models 

106 

107 

108def _overlay_env_qdrant(qdrant: dict[str, Any]) -> None: 

109 # Override Qdrant settings with environment variables (unconditional). 

110 # Note: differs from qdrant_loader's _auto_resolve_env_vars() which only 

111 # applies env vars when the config value equals the default. 

112 # Priority: environment variable > config file value > QdrantConfig default. 

113 if os.getenv("QDRANT_URL"): 

114 qdrant["url"] = os.getenv("QDRANT_URL") 

115 if os.getenv("QDRANT_API_KEY"): 

116 qdrant["api_key"] = os.getenv("QDRANT_API_KEY") 

117 if os.getenv("QDRANT_COLLECTION_NAME"): 

118 qdrant["collection_name"] = os.getenv("QDRANT_COLLECTION_NAME") 

119 

120 

121def _overlay_env_search(search: dict[str, Any]) -> None: 

122 # Only a subset for Phase 0; SearchConfig has its own env fallbacks as well 

123 if os.getenv("SEARCH_CONFLICT_USE_LLM"): 

124 raw = os.getenv("SEARCH_CONFLICT_USE_LLM", "true").strip().lower() 

125 search["conflict_use_llm"] = raw in {"1", "true", "t", "yes", "y", "on"} 

126 if os.getenv("SEARCH_CONFLICT_LLM_MODEL"): 

127 search["conflict_llm_model"] = os.getenv("SEARCH_CONFLICT_LLM_MODEL") 

128 

129 

def load_file_config(path: Path) -> dict[str, Any]:
    """Parse the YAML file at *path* and expand ``${VAR}`` references.

    An empty or all-comment file yields ``{}``. Raises ValueError (via
    _substitute_env_vars) when a referenced env var is unset.
    """
    raw_text = path.read_text(encoding="utf-8")
    parsed = yaml.safe_load(raw_text) or {}
    return _substitute_env_vars(parsed)

134 

135 

def build_config_from_dict(config_data: dict[str, Any]) -> Config:
    """Build a :class:`Config` from a parsed config-file dict.

    Applies, in order: legacy-field migration (global.embedding → global.llm),
    vector_size extraction, environment-variable overlays, and derivation of
    the interim OpenAIConfig (Phase 0).

    Raises:
        ValueError: if ``global.reranking`` is present but not a mapping.
    """
    global_data = _get_section(config_data, "global")
    # Shallow copies so overlays/migrations never mutate the caller's dict.
    llm = dict(global_data.get("llm") or {})
    qdrant = dict(global_data.get("qdrant") or {})
    search = dict(config_data.get("search") or {})

    # Deprecation: detect legacy blocks and log a warning once
    legacy_embedding = global_data.get("embedding")
    legacy_markit = (
        (config_data.get("file_conversion") or {}).get("markitdown")
        if isinstance(config_data.get("file_conversion"), dict)
        else None
    )
    try:
        if legacy_embedding or legacy_markit:
            logger.warning(
                "Legacy configuration fields detected; please migrate to global.llm",
                legacy_embedding=bool(legacy_embedding),
                legacy_markitdown=bool(legacy_markit),
            )
    except Exception:
        # Logging must never break config loading.
        pass

    # Migrate legacy global.embedding → global.llm (backward compat)
    # New-format values always win; legacy only fills gaps.
    if legacy_embedding:
        if not llm.get("api_key") and legacy_embedding.get("api_key"):
            llm["api_key"] = legacy_embedding["api_key"]
        models_cfg = dict(llm.get("models") or {})
        if not models_cfg.get("embeddings") and legacy_embedding.get("model"):
            models_cfg["embeddings"] = legacy_embedding["model"]
            llm["models"] = models_cfg
        # Migrate vector_size for use in OpenAIConfig
        if not llm.get("vector_size") and isinstance(
            legacy_embedding.get("vector_size"), int
        ):
            llm["vector_size"] = legacy_embedding["vector_size"]

    # Extract vector_size from new format: global.llm.embeddings.vector_size
    # (different from models.embeddings which is a model name string)
    emb_cfg = llm.get("embeddings") or {}
    if isinstance(emb_cfg, dict) and isinstance(emb_cfg.get("vector_size"), int):
        if not llm.get("vector_size"):
            llm["vector_size"] = emb_cfg["vector_size"]

    # Apply environment overrides (env > file for these keys).
    _overlay_env_llm(llm)
    _overlay_env_qdrant(qdrant)
    _overlay_env_search(search)

    # Derive OpenAIConfig for now (Phase 0); will be replaced by core LLM provider later
    api_key = llm.get("api_key") or os.getenv("OPENAI_API_KEY")
    models = dict(llm.get("models") or {})
    embedding_model = (
        models.get("embeddings")
        or os.getenv("LLM_EMBEDDING_MODEL")
        or "text-embedding-3-small"
    )
    chat_model = models.get("chat") or os.getenv("LLM_CHAT_MODEL") or "gpt-3.5-turbo"

    # Build reranking config from global section if present
    reranking_cfg = None
    if "reranking" in global_data:
        if not isinstance(global_data["reranking"], dict):
            raise ValueError("global.reranking must be a mapping")
        reranking_cfg = MCPReranking(**global_data["reranking"])

    vector_size = llm.get("vector_size")  # int | None
    cfg = Config(
        # Empty overlay dicts fall back to each section's model defaults.
        qdrant=QdrantConfig(**qdrant) if qdrant else QdrantConfig(),
        openai=OpenAIConfig(
            api_key=api_key,
            model=embedding_model,
            chat_model=chat_model,
            vector_size=vector_size,
        ),
        search=SearchConfig(**search) if search else SearchConfig(),
        reranking=reranking_cfg if reranking_cfg is not None else MCPReranking(),
    )
    return cfg

215 

216 

def redact_effective_config(effective: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of *effective* with sensitive values masked.

    Non-empty string values stored under ``api_key`` or ``Authorization``
    (at any nesting depth, including inside lists) are replaced with
    ``"***REDACTED***"``. The input is not modified.
    """
    sensitive_keys = frozenset({"api_key", "Authorization"})

    def _scrub(node: Any) -> Any:
        if isinstance(node, list):
            return [_scrub(item) for item in node]
        if not isinstance(node, dict):
            return node
        scrubbed: dict[str, Any] = {}
        for key, value in node.items():
            if key in sensitive_keys and isinstance(value, str) and value:
                scrubbed[key] = "***REDACTED***"
            else:
                scrubbed[key] = _scrub(value)
        return scrubbed

    return _scrub(effective)

232 

233 

def load_config(cli_config: Path | None) -> tuple[Config, dict[str, Any], bool]:
    """Load effective configuration.

    Resolves a config file (CLI > MCP_CONFIG > conventional locations) and
    builds the Config from it; any failure during file loading is logged and
    the loader falls back to the deprecated env-only mode.

    Returns (config_obj, effective_dict, used_file: bool)
    where ``effective_dict`` is a printable summary (file sections plus the
    derived OpenAI settings) — note it contains unredacted api keys; pass it
    through redact_effective_config() before display.
    """
    config_path = resolve_config_path(cli_config)
    used_file = False
    if config_path:
        try:
            data = load_file_config(config_path)
            cfg = build_config_from_dict(data)
            used_file = True
            # Effective dict for printing (merge file data with derived)
            effective = {
                "global": {
                    "llm": data.get("global", {}).get("llm"),
                    "qdrant": data.get("global", {}).get("qdrant"),
                },
                "search": data.get("search"),
                "derived": {
                    "openai": {
                        "model": cfg.openai.model,
                        "chat_model": cfg.openai.chat_model,
                        "api_key": cfg.openai.api_key,
                    }
                },
            }
            return cfg, effective, used_file
        except Exception as e:
            # Broad on purpose: any parse/validation error degrades to
            # env-only mode rather than aborting server startup.
            logger.warning(
                "Failed to load config file; falling back to env-only", error=str(e)
            )

    # Fallback to legacy env-only mode (deprecated)
    # Config() reads its own env fallbacks; `effective` mirrors the raw env
    # values so operators can see what was picked up.
    cfg = Config()
    effective = {
        "global": {
            "llm": {
                "provider": os.getenv("LLM_PROVIDER"),
                "base_url": os.getenv("LLM_BASE_URL"),
                "api_key": os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY"),
                "models": {
                    "embeddings": os.getenv("LLM_EMBEDDING_MODEL"),
                    "chat": os.getenv("LLM_CHAT_MODEL"),
                },
            },
            "qdrant": {
                "url": os.getenv("QDRANT_URL"),
                "api_key": os.getenv("QDRANT_API_KEY"),
                "collection_name": os.getenv("QDRANT_COLLECTION_NAME"),
            },
        },
        "search": None,
        "derived": {
            "openai": {
                "model": cfg.openai.model,
                "chat_model": cfg.openai.chat_model,
                "api_key": cfg.openai.api_key,
            }
        },
        "warning": "Using legacy env-only mode; providing a config file is recommended and will be required in a future release.",
    }
    try:
        logger.warning(
            "Running in legacy env-only mode; provide --config or MCP_CONFIG file",
        )
    except Exception:
        # Logging must never break startup.
        pass
    return cfg, effective, used_file