Coverage for src / qdrant_loader / config / __init__.py: 86%

175 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 04:48 +0000

1"""Configuration module. 

2 

3This module provides the main configuration interface for the application. 

4It combines global settings with source-specific configurations. 

5""" 

6 

7import os 

8import re 

9from pathlib import Path 

10from typing import Any, Optional 

11 

12import yaml 

13from dotenv import load_dotenv 

14from pydantic import Field, ValidationError, model_validator 

15from pydantic_settings import BaseSettings, SettingsConfigDict 

16 

17from ..utils.logging import LoggingConfig 

18from .chunking import ChunkingConfig 

19 

20# Import consolidated configs 

21from .global_config import GlobalConfig, SemanticAnalysisConfig 

22 

23# Import multi-project support 

24from .models import ( 

25 ParsedConfig, 

26 ProjectConfig, 

27 ProjectContext, 

28 ProjectDetail, 

29 ProjectInfo, 

30 ProjectsConfig, 

31 ProjectStats, 

32) 

33from .parser import MultiProjectConfigParser 

34from .sources import SourcesConfig 

35from .state import StateManagementConfig 

36from .validator import ConfigValidator 

37from .workspace import WorkspaceConfig 

38 

# Load environment variables from .env file.
# override=False: variables already present in the process environment are
# kept, so the .env file only fills in names that are not set yet.
load_dotenv(override=False)

41 

42 

def _get_logger():
    """Return the logger for this module.

    The logger is looked up through ``LoggingConfig.get_logger`` on every
    call rather than being cached in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

45 

46 

47# Lazy import function for connector configs 

def _get_connector_configs():
    """Import the connector config classes on demand and return them by name.

    The imports are done inside the function body so that they only run when
    the function is called, which avoids circular dependencies while this
    package itself is being imported.

    Returns:
        dict: Mapping of exported class name to the connector config class.
    """
    from ..connectors.confluence.config import ConfluenceSpaceConfig
    from ..connectors.git.config import GitAuthConfig, GitRepoConfig
    from ..connectors.jira.config import JiraProjectConfig
    from ..connectors.publicdocs.config import PublicDocsSourceConfig, SelectorsConfig

    exported = (
        ("ConfluenceSpaceConfig", ConfluenceSpaceConfig),
        ("GitAuthConfig", GitAuthConfig),
        ("GitRepoConfig", GitRepoConfig),
        ("JiraProjectConfig", JiraProjectConfig),
        ("PublicDocsSourceConfig", PublicDocsSourceConfig),
        ("SelectorsConfig", SelectorsConfig),
    )
    return dict(exported)

63 

64 

# Public API of the package.
# The connector config names (ConfluenceSpaceConfig, GitAuthConfig,
# GitRepoConfig, JiraProjectConfig, PublicDocsSourceConfig, SelectorsConfig)
# are not bound at import time; they are resolved on first access by the
# module-level __getattr__ defined in this file.
__all__ = [
    "ChunkingConfig",
    "ConfluenceSpaceConfig",
    "GitAuthConfig",
    "GitRepoConfig",
    "GlobalConfig",
    "JiraProjectConfig",
    "PublicDocsSourceConfig",
    "SelectorsConfig",
    "SemanticAnalysisConfig",
    "Settings",
    "SourcesConfig",
    "StateManagementConfig",
    # Multi-project support
    "ProjectContext",
    "ProjectConfig",
    "ProjectsConfig",
    "ParsedConfig",
    "ProjectStats",
    "ProjectInfo",
    "ProjectDetail",
    "MultiProjectConfigParser",
    "ConfigValidator",
    # Functions
    "get_global_config",
    "get_settings",
    "initialize_config",
    "initialize_config_with_workspace",
]

94 

95 

96# Add lazy loading for connector configs 

def __getattr__(name):
    """Resolve connector config classes lazily on module attribute access.

    Python calls this hook only when ``name`` is not found in the module
    namespace. The connector config classes are imported on demand to avoid
    circular dependencies.

    Args:
        name: Attribute name being looked up on this module.

    Returns:
        The connector config class registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not a lazily exported connector config.
    """
    # Private and dunder names are never connector configs (every exported
    # key is a public class name). Rejecting them up front keeps introspection
    # probes such as hasattr(module, "__wrapped__") from importing the
    # connector packages, and guarantees those probes get AttributeError
    # rather than an import-time failure.
    if not name.startswith("_"):
        connector_configs = _get_connector_configs()
        if name in connector_configs:
            return connector_configs[name]
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

103 

104 

# Module-wide Settings instance. Stays None until initialize_config() or
# initialize_config_with_workspace() assigns it; read through get_settings().
_global_settings: Optional["Settings"] = None

106 

107 

def get_settings() -> "Settings":
    """Return the module-wide settings instance.

    Returns:
        Settings: The instance stored by one of the initialize functions.

    Raises:
        RuntimeError: If no settings instance has been initialized yet.
    """
    current = _global_settings
    if current is not None:
        return current
    raise RuntimeError(
        "Settings not initialized. Call initialize_config() or initialize_config_with_workspace() first."
    )

119 

120 

def get_global_config() -> GlobalConfig:
    """Return the global configuration held by the current settings.

    Returns:
        GlobalConfig: The ``global_config`` of the initialized settings.

    Raises:
        RuntimeError: Propagated from ``get_settings()`` when settings have
            not been initialized.
    """
    settings = get_settings()
    return settings.global_config

128 

129 

def initialize_config(
    yaml_path: Path, env_path: Path | None = None, skip_validation: bool = False
) -> None:
    """Load settings from a YAML file and store them as the module-wide instance.

    Args:
        yaml_path: Path to the YAML configuration file.
        env_path: Optional path to the .env file.
        skip_validation: If True, skip directory validation and creation.

    Raises:
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    global _global_settings
    try:
        # Proceed with initialization
        _get_logger().debug(
            "Initializing configuration",
            yaml_path=str(yaml_path),
            env_path=None if not env_path else str(env_path),
        )
        loaded = Settings.from_yaml(
            yaml_path,
            env_path=env_path,
            skip_validation=skip_validation,
        )
        _global_settings = loaded
        _get_logger().debug("Successfully initialized configuration")
    except Exception as exc:
        failure_details = {"error": str(exc), "yaml_path": str(yaml_path)}
        _get_logger().error("Failed to initialize configuration", **failure_details)
        raise

158 

159 

def initialize_config_with_workspace(
    workspace_config: WorkspaceConfig, skip_validation: bool = False
) -> None:
    """Load settings from the workspace paths and store them module-wide.

    After loading, the state database path is always replaced by the
    workspace database path. A warning is logged when the loaded
    configuration carried a different, non-empty, non-":memory:" path.

    Args:
        workspace_config: Workspace configuration with paths and settings
        skip_validation: If True, skip directory validation and creation

    Raises:
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    global _global_settings
    try:
        env_file = workspace_config.env_path
        _get_logger().debug(
            "Initializing configuration with workspace",
            workspace=str(workspace_config.workspace_path),
            config_path=str(workspace_config.config_path),
            env_path=(str(env_file) if env_file else None),
        )

        # Load configuration using workspace paths
        _global_settings = Settings.from_yaml(
            workspace_config.config_path,
            env_path=workspace_config.env_path,
            skip_validation=skip_validation,
        )

        state_cfg = _global_settings.global_config.state_management
        configured_db_path = state_cfg.database_path
        workspace_db_path = str(workspace_config.database_path)

        # Warn only when the loaded path is set, is not the in-memory
        # marker, and differs from the path the workspace dictates.
        if configured_db_path and configured_db_path not in (
            ":memory:",
            workspace_db_path,
        ):
            _get_logger().warning(
                "Database path in config.yaml is ignored in workspace mode",
                config_database_path=configured_db_path,
                workspace_database_path=workspace_db_path,
            )

        # Override the database path with workspace-specific path
        _global_settings.global_config.state_management.database_path = (
            workspace_db_path
        )

        _get_logger().debug(
            "Set workspace database path", database_path=workspace_db_path
        )
        _get_logger().debug(
            "Successfully initialized configuration with workspace",
            workspace=str(workspace_config.workspace_path),
        )
    except Exception as exc:
        _get_logger().error(
            "Failed to initialize configuration with workspace",
            error=str(exc),
            workspace=str(workspace_config.workspace_path),
        )
        raise

225 

226 

class Settings(BaseSettings):
    """Main configuration class combining global and source-specific settings.

    Instances are normally built with ``Settings.from_yaml``, which loads the
    .env file, reads the YAML file, substitutes ``${VAR}`` / ``$HOME``
    references and hands the result to the multi-project parser.
    """

    # Configuration objects - these are the only fields we need
    global_config: GlobalConfig = Field(
        default_factory=GlobalConfig, description="Global configuration settings"
    )
    projects_config: ProjectsConfig = Field(
        default_factory=ProjectsConfig, description="Multi-project configurations"
    )

    model_config = SettingsConfigDict(
        env_file=None,  # Disable automatic .env loading - we handle this manually
        env_file_encoding="utf-8",
        extra="allow",
    )

    @model_validator(mode="after")  # type: ignore
    def validate_source_configs(self) -> "Settings":
        """Validate that required configuration is present for configured sources.

        Environment-variable fallbacks are applied first, then the Qdrant URL
        and collection name are checked for being non-empty.

        Returns:
            Settings: This instance, as required by an "after" model validator.

        Raises:
            ValueError: If the Qdrant URL or collection name is empty.
        """
        _get_logger().debug("Validating source configurations")

        # Auto-resolve environment variables as fallbacks
        self._auto_resolve_env_vars()

        # Validate that required fields are not empty after variable substitution
        if not self.global_config.qdrant.url:
            raise ValueError(
                "Qdrant URL is required but was not provided or substituted"
            )

        if not self.global_config.qdrant.collection_name:
            raise ValueError(
                "Qdrant collection name is required but was not provided or substituted"
            )

        # Note: Source validation is now handled at the project level
        # Each project's sources are validated when the project is processed

        _get_logger().debug("Source configuration validation successful")
        return self

    def _auto_resolve_env_vars(self) -> None:
        """Auto-resolve well-known environment variables as fallbacks.

        Priority: config file value > environment variable > default.
        Only fills in values that were not explicitly set in config.

        Note: detection uses default-value sentinels, so explicitly setting a
        config value equal to the default (e.g. url: http://localhost:6333)
        will still be overridden by the environment variable.
        """
        # OPENAI_API_KEY → embedding.api_key and llm.api_key
        openai_key = os.getenv("OPENAI_API_KEY")
        if openai_key:
            if not self.global_config.embedding.api_key:
                self.global_config.embedding.api_key = openai_key
            # The llm section is only touched when it is a non-empty dict
            if self.global_config.llm and isinstance(self.global_config.llm, dict):
                if not self.global_config.llm.get("api_key"):
                    self.global_config.llm["api_key"] = openai_key

        # QDRANT_URL → qdrant.url (override only if still default)
        qdrant_url = os.getenv("QDRANT_URL")
        if qdrant_url and self.global_config.qdrant.url == "http://localhost:6333":
            self.global_config.qdrant.url = qdrant_url

        # QDRANT_API_KEY → qdrant.api_key
        qdrant_api_key = os.getenv("QDRANT_API_KEY")
        if qdrant_api_key and not self.global_config.qdrant.api_key:
            self.global_config.qdrant.api_key = qdrant_api_key

        # QDRANT_COLLECTION_NAME → qdrant.collection_name
        collection = os.getenv("QDRANT_COLLECTION_NAME")
        if collection and self.global_config.qdrant.collection_name == "documents":
            self.global_config.qdrant.collection_name = collection

        # STATE_DB_PATH → state_management.database_path
        # Note: In workspace mode this is overridden by workspace_config.database_path
        state_db = os.getenv("STATE_DB_PATH")
        if (
            state_db
            and self.global_config.state_management.database_path == "./state.db"
        ):
            self.global_config.state_management.database_path = state_db

    @property
    def qdrant_url(self) -> str:
        """Get the Qdrant URL from global configuration."""
        return self.global_config.qdrant.url

    @property
    def qdrant_api_key(self) -> str | None:
        """Get the Qdrant API key from global configuration."""
        return self.global_config.qdrant.api_key

    @property
    def qdrant_collection_name(self) -> str:
        """Get the Qdrant collection name from global configuration."""
        return self.global_config.qdrant.collection_name

    @property
    def openai_api_key(self) -> str:
        """Get the OpenAI API key from embedding configuration.

        Raises:
            ValueError: If the embedding configuration has no API key.
        """
        api_key = self.global_config.embedding.api_key
        if not api_key:
            raise ValueError(
                "OpenAI API key is required but was not provided or substituted in embedding configuration"
            )
        return api_key

    @property
    def state_db_path(self) -> str:
        """Get the state database path from global configuration."""
        return self.global_config.state_management.database_path

    @property
    def llm_settings(self):
        """Provider-agnostic LLM settings derived from global configuration.

        Uses `global.llm` when present; otherwise maps legacy fields.
        """
        # Import lazily to avoid hard dependency issues in environments without core installed
        from importlib import import_module

        settings_mod = import_module("qdrant_loader_core.llm.settings")
        LLMSettings = settings_mod.LLMSettings
        return LLMSettings.from_global_config(self.global_config.to_dict())

    @staticmethod
    def _substitute_env_vars(data: Any) -> Any:
        """Recursively substitute environment variables in configuration data.

        Strings have ``$HOME`` expanded to the user's home directory and each
        ``${VAR_NAME}`` replaced by the value of that environment variable.
        References to unset variables are left in place and, except for
        ``STATE_DB_PATH``, reported with a warning. Dicts and lists are
        processed element by element; any other value is returned unchanged.

        Args:
            data: Configuration data to process

        Returns:
            Processed data with environment variables substituted
        """
        if isinstance(data, dict):
            return {k: Settings._substitute_env_vars(v) for k, v in data.items()}
        if isinstance(data, list):
            return [Settings._substitute_env_vars(item) for item in data]
        if not isinstance(data, str):
            return data

        def expand_home(text: str) -> str:
            # Match $HOME only as a complete variable name so that longer
            # names such as $HOMEBREW_PREFIX are not corrupted. A callable
            # replacement inserts the home path literally (no backslash
            # escape processing, which matters for Windows paths).
            return re.sub(
                r"\$HOME(?![A-Za-z0-9_])",
                lambda _match: os.path.expanduser("~"),
                text,
            )

        # First expand $HOME if present
        data = expand_home(data)

        # Then handle ${VAR_NAME} pattern
        result = data
        for match in re.finditer(r"\${([^}]+)}", data):
            var_name = match.group(1)
            env_value = os.getenv(var_name)
            if env_value is None:
                # Only warn about missing variables that are commonly required
                # Skip STATE_DB_PATH as it's often overridden in workspace mode
                if var_name not in ["STATE_DB_PATH"]:
                    _get_logger().warning(
                        "Environment variable not found", variable=var_name
                    )
                continue
            # If the environment variable contains $HOME, expand it
            result = result.replace(f"${{{var_name}}}", expand_home(env_value))

        return result

    @classmethod
    def from_yaml(
        cls,
        config_path: Path,
        env_path: Path | None = None,
        skip_validation: bool = False,
    ) -> "Settings":
        """Load configuration from a YAML file.

        Args:
            config_path: Path to the YAML configuration file.
            env_path: Optional path to the .env file. If provided, it is
                loaded with ``override=True`` so its values replace variables
                already in the environment; otherwise the default .env file
                is loaded without overriding existing variables.
            skip_validation: If True, skip directory validation and creation.

        Returns:
            Settings: Loaded configuration.

        Raises:
            FileNotFoundError: If ``env_path`` is given but does not exist.
            yaml.YAMLError: If the YAML file cannot be parsed.
            ValidationError: If the parsed configuration fails validation.

        Any other exception is logged and re-raised unchanged.
        """
        _get_logger().debug("Loading configuration from YAML", path=str(config_path))
        try:
            # Step 1: Load environment variables first
            if env_path is not None:
                # Custom env file specified - load only this file
                _get_logger().debug(
                    "Loading custom environment file", path=str(env_path)
                )
                if not env_path.exists():
                    raise FileNotFoundError(f"Environment file not found: {env_path}")
                load_dotenv(env_path, override=True)
            else:
                # Load default .env file if it exists
                _get_logger().debug("Loading default environment variables")
                load_dotenv(override=False)

            # Step 2: Load YAML config
            # Decode explicitly as UTF-8: the platform default encoding is
            # locale-dependent and can mis-decode non-ASCII configuration.
            with open(config_path, encoding="utf-8") as f:
                config_data = yaml.safe_load(f)

            # Step 3: Process all environment variables in config using substitution
            _get_logger().debug("Processing environment variables in configuration")
            config_data = cls._substitute_env_vars(config_data)

            # Step 4: Use multi-project parser to parse configuration
            validator = ConfigValidator()
            parser = MultiProjectConfigParser(validator)
            parsed_config = parser.parse(config_data, skip_validation=skip_validation)

            # Step 5: Create settings instance with parsed configuration
            settings = cls(
                global_config=parsed_config.global_config,
                projects_config=parsed_config.projects_config,
            )

            _get_logger().debug("Successfully created Settings instance")
            return settings

        except yaml.YAMLError as e:
            _get_logger().error("Failed to parse YAML configuration", error=str(e))
            raise
        except ValidationError as e:
            _get_logger().error("Configuration validation failed", error=str(e))
            raise
        except Exception as e:
            _get_logger().error("Unexpected error loading configuration", error=str(e))
            raise

    def to_dict(self) -> dict:
        """Convert the configuration to a dictionary.

        Returns:
            dict: Configuration as a dictionary with the keys "global" and
            "projects".
        """
        return {
            "global": self.global_config.to_dict(),
            "projects": self.projects_config.to_dict(),
        }

471 }