Coverage for src / qdrant_loader / config / __init__.py: 86%

178 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:40 +0000

1"""Configuration module. 

2 

3This module provides the main configuration interface for the application. 

4It combines global settings with source-specific configurations. 

5""" 

6 

7import os 

8import re 

9from pathlib import Path 

10from typing import Any, Optional 

11 

12import yaml 

13from dotenv import load_dotenv 

14from pydantic import Field, ValidationError, model_validator 

15from pydantic_settings import BaseSettings, SettingsConfigDict 

16 

17from ..utils.logging import LoggingConfig 

18from ..utils.sensitive import sanitize_exception_message 

19from .chunking import ChunkingConfig 

20 

21# Import consolidated configs 

22from .global_config import GlobalConfig, SemanticAnalysisConfig 

23 

24# Import multi-project support 

25from .models import ( 

26 ParsedConfig, 

27 ProjectConfig, 

28 ProjectContext, 

29 ProjectDetail, 

30 ProjectInfo, 

31 ProjectsConfig, 

32 ProjectStats, 

33) 

34from .parser import MultiProjectConfigParser 

35from .sources import SourcesConfig 

36from .state import StateManagementConfig 

37from .validator import ConfigValidator 

38from .workspace import WorkspaceConfig 

39 

# Load environment variables from a .env file once, when this package is
# imported. override=False is passed so that, per python-dotenv's documented
# behaviour, variables already set in the process environment are not
# replaced by values from the file.
load_dotenv(override=False)

42 

43 

def _get_logger():
    """Return the logger registered for this module.

    The logger is requested from ``LoggingConfig`` on every call instead of
    being stored in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

46 

47 

48# Lazy import function for connector configs 

def _get_connector_configs():
    """Import the connector config classes on demand and return them by name.

    The imports are performed inside the function body, not at module import
    time, to avoid circular dependencies with the connector packages.

    Returns:
        dict: Mapping of public class name to the connector config class.
    """
    from ..connectors.confluence.config import ConfluenceSpaceConfig
    from ..connectors.git.config import GitAuthConfig, GitRepoConfig
    from ..connectors.jira.config import JiraProjectConfig
    from ..connectors.publicdocs.config import PublicDocsSourceConfig, SelectorsConfig

    # Keys are the names under which the classes are exposed by this package.
    return dict(
        ConfluenceSpaceConfig=ConfluenceSpaceConfig,
        GitAuthConfig=GitAuthConfig,
        GitRepoConfig=GitRepoConfig,
        JiraProjectConfig=JiraProjectConfig,
        PublicDocsSourceConfig=PublicDocsSourceConfig,
        SelectorsConfig=SelectorsConfig,
    )

64 

65 

# Public API of the configuration package.
# The connector config names (ConfluenceSpaceConfig, GitAuthConfig,
# GitRepoConfig, JiraProjectConfig, PublicDocsSourceConfig, SelectorsConfig)
# are not imported at the top of this module; they are served lazily by the
# module-level __getattr__.
__all__ = [
    "ChunkingConfig",
    "ConfluenceSpaceConfig",
    "GitAuthConfig",
    "GitRepoConfig",
    "GlobalConfig",
    "JiraProjectConfig",
    "PublicDocsSourceConfig",
    "SelectorsConfig",
    "SemanticAnalysisConfig",
    "Settings",
    "SourcesConfig",
    "StateManagementConfig",
    # Multi-project support
    "ProjectContext",
    "ProjectConfig",
    "ProjectsConfig",
    "ParsedConfig",
    "ProjectStats",
    "ProjectInfo",
    "ProjectDetail",
    "MultiProjectConfigParser",
    "ConfigValidator",
    # Functions
    "get_global_config",
    "get_settings",
    "initialize_config",
    "initialize_config_with_workspace",
]

95 

96 

97# Add lazy loading for connector configs 

def __getattr__(name):
    """Resolve connector config classes lazily on module attribute access.

    Python calls this hook only for attributes that are not found on the
    module through normal lookup.

    Args:
        name: The attribute name being looked up on this module.

    Returns:
        The connector config class registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily provided classes.
    """
    lazy_exports = _get_connector_configs()
    if name not in lazy_exports:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    return lazy_exports[name]

104 

105 

# Process-wide settings instance; stays None until one of the
# initialize_config* functions has completed successfully.
_global_settings: Optional["Settings"] = None


def get_settings() -> "Settings":
    """Return the settings instance installed by the initialization functions.

    Returns:
        Settings: The global settings instance.

    Raises:
        RuntimeError: If no configuration has been initialized yet.
    """
    if _global_settings is not None:
        return _global_settings
    raise RuntimeError(
        "Settings not initialized. Call initialize_config() or initialize_config_with_workspace() first."
    )

120 

121 

def get_global_config() -> GlobalConfig:
    """Return the ``global_config`` section of the initialized settings.

    Returns:
        GlobalConfig: The global configuration instance.

    Raises:
        RuntimeError: Propagated from ``get_settings()`` when no configuration
            has been initialized.
    """
    current_settings = get_settings()
    return current_settings.global_config

129 

130 

def initialize_config(
    yaml_path: Path, env_path: Path | None = None, skip_validation: bool = False
) -> None:
    """Load settings from a YAML file and install them as the global settings.

    Args:
        yaml_path: Path to the YAML configuration file.
        env_path: Optional path to the .env file.
        skip_validation: If True, skip directory validation and creation.

    Raises:
        Exception: Any error raised while loading is logged with a sanitized
            message and then re-raised unchanged.
    """
    global _global_settings
    try:
        # Proceed with initialization
        start_details = {
            "yaml_path": str(yaml_path),
            "env_path": str(env_path) if env_path else None,
        }
        _get_logger().debug("Initializing configuration", **start_details)

        loaded = Settings.from_yaml(
            yaml_path, env_path=env_path, skip_validation=skip_validation
        )
        _global_settings = loaded

        _get_logger().debug("Successfully initialized configuration")
    except Exception as exc:
        # Sanitize before logging so the log record carries the cleaned message.
        failure_details = {
            "error": sanitize_exception_message(exc),
            "yaml_path": str(yaml_path),
        }
        _get_logger().error("Failed to initialize configuration", **failure_details)
        raise

162 

163 

def initialize_config_with_workspace(
    workspace_config: WorkspaceConfig, skip_validation: bool = False
) -> None:
    """Initialize configuration using workspace settings.

    Loads the settings from the workspace's config and env files, replaces the
    state database path with the workspace's own database path, and only then
    installs the result as the global settings instance.

    Args:
        workspace_config: Workspace configuration with paths and settings
        skip_validation: If True, skip directory validation and creation

    Raises:
        Exception: Any error raised during initialization is logged with a
            sanitized message and re-raised unchanged. In that case the
            previously installed global settings (if any) are left untouched.
    """
    global _global_settings
    try:
        _get_logger().debug(
            "Initializing configuration with workspace",
            workspace=str(workspace_config.workspace_path),
            config_path=str(workspace_config.config_path),
            env_path=(
                str(workspace_config.env_path) if workspace_config.env_path else None
            ),
        )

        # Load configuration using workspace paths. Build into a local first so
        # the module-level settings are never visible in a half-configured
        # state (i.e. before the database path override below is applied).
        settings = Settings.from_yaml(
            workspace_config.config_path,
            env_path=workspace_config.env_path,
            skip_validation=skip_validation,
        )

        # Check if database_path was specified in config.yaml and warn user
        state_management = settings.global_config.state_management
        original_db_path = state_management.database_path
        workspace_db_path = str(workspace_config.database_path)

        # Only warn if the original path is different from the workspace path
        # and not empty/default
        if (
            original_db_path
            and original_db_path != ":memory:"
            and original_db_path != workspace_db_path
        ):
            _get_logger().warning(
                "Database path in config.yaml is ignored in workspace mode",
                config_database_path=original_db_path,
                workspace_database_path=workspace_db_path,
            )

        # Override the database path with workspace-specific path
        state_management.database_path = workspace_db_path

        _get_logger().debug(
            "Set workspace database path",
            database_path=workspace_db_path,
        )

        # Publish the fully configured settings.
        _global_settings = settings

        _get_logger().debug(
            "Successfully initialized configuration with workspace",
            workspace=str(workspace_config.workspace_path),
        )

    except Exception as e:
        safe_error = sanitize_exception_message(e)
        _get_logger().error(
            "Failed to initialize configuration with workspace",
            error=safe_error,
            workspace=str(workspace_config.workspace_path),
        )
        raise

230 

231 

class Settings(BaseSettings):
    """Main configuration class combining global and source-specific settings.

    Instances are normally created with :meth:`from_yaml`, which loads the
    environment file, reads the YAML document, substitutes ``${VAR}``
    placeholders and hands the result to the multi-project parser.
    """

    # Configuration objects - these are the only fields we need
    global_config: GlobalConfig = Field(
        default_factory=GlobalConfig, description="Global configuration settings"
    )
    projects_config: ProjectsConfig = Field(
        default_factory=ProjectsConfig, description="Multi-project configurations"
    )

    model_config = SettingsConfigDict(
        env_file=None,  # Disable automatic .env loading - we handle this manually
        env_file_encoding="utf-8",
        extra="allow",
    )

    @model_validator(mode="after")  # type: ignore
    def validate_source_configs(self) -> "Settings":
        """Validate that required configuration is present for configured sources.

        Environment-variable fallbacks are applied first, then the Qdrant URL
        and collection name are checked for being non-empty.

        Returns:
            Settings: This instance, as required by pydantic "after" validators.

        Raises:
            ValueError: If the Qdrant URL or collection name is empty.
        """
        _get_logger().debug("Validating source configurations")

        # Auto-resolve environment variables as fallbacks
        self._auto_resolve_env_vars()

        # Validate that required fields are not empty after variable substitution
        if not self.global_config.qdrant.url:
            raise ValueError(
                "Qdrant URL is required but was not provided or substituted"
            )

        if not self.global_config.qdrant.collection_name:
            raise ValueError(
                "Qdrant collection name is required but was not provided or substituted"
            )

        # Note: Source validation is now handled at the project level
        # Each project's sources are validated when the project is processed

        _get_logger().debug("Source configuration validation successful")
        return self

    def _auto_resolve_env_vars(self) -> None:
        """Auto-resolve well-known environment variables as fallbacks.

        Priority: config file value > environment variable > default.
        Only fills in values that were not explicitly set in config.

        Note: detection uses default-value sentinels, so explicitly setting a
        config value equal to the default (e.g. url: http://localhost:6333)
        will still be overridden by the environment variable.
        """
        # OPENAI_API_KEY → embedding.api_key and llm.api_key
        openai_key = os.getenv("OPENAI_API_KEY")
        if openai_key:
            if not self.global_config.embedding.api_key:
                self.global_config.embedding.api_key = openai_key
            # llm is only filled in when it is a non-empty dict without a key
            if self.global_config.llm and isinstance(self.global_config.llm, dict):
                if not self.global_config.llm.get("api_key"):
                    self.global_config.llm["api_key"] = openai_key

        # QDRANT_URL → qdrant.url (override only if still default)
        qdrant_url = os.getenv("QDRANT_URL")
        if qdrant_url and self.global_config.qdrant.url == "http://localhost:6333":
            self.global_config.qdrant.url = qdrant_url

        # QDRANT_API_KEY → qdrant.api_key
        qdrant_api_key = os.getenv("QDRANT_API_KEY")
        if qdrant_api_key and not self.global_config.qdrant.api_key:
            self.global_config.qdrant.api_key = qdrant_api_key

        # QDRANT_COLLECTION_NAME → qdrant.collection_name
        collection = os.getenv("QDRANT_COLLECTION_NAME")
        if collection and self.global_config.qdrant.collection_name == "documents":
            self.global_config.qdrant.collection_name = collection

        # STATE_DB_PATH → state_management.database_path
        # Note: In workspace mode this is overridden by workspace_config.database_path
        state_db = os.getenv("STATE_DB_PATH")
        if (
            state_db
            and self.global_config.state_management.database_path == "./state.db"
        ):
            self.global_config.state_management.database_path = state_db

    @property
    def qdrant_url(self) -> str:
        """Get the Qdrant URL from global configuration."""
        return self.global_config.qdrant.url

    @property
    def qdrant_api_key(self) -> str | None:
        """Get the Qdrant API key from global configuration."""
        return self.global_config.qdrant.api_key

    @property
    def qdrant_collection_name(self) -> str:
        """Get the Qdrant collection name from global configuration."""
        return self.global_config.qdrant.collection_name

    @property
    def openai_api_key(self) -> str:
        """Get the OpenAI API key from embedding configuration.

        Raises:
            ValueError: If the embedding configuration has no API key.
        """
        api_key = self.global_config.embedding.api_key
        if not api_key:
            raise ValueError(
                "OpenAI API key is required but was not provided or substituted in embedding configuration"
            )
        return api_key

    @property
    def state_db_path(self) -> str:
        """Get the state database path from global configuration."""
        return self.global_config.state_management.database_path

    @property
    def llm_settings(self) -> Any:
        """Provider-agnostic LLM settings derived from global configuration.

        Uses `global.llm` when present; otherwise maps legacy fields.

        Returns:
            The object produced by ``LLMSettings.from_global_config`` for the
            dictionary form of the global configuration.
        """
        # Import lazily to avoid hard dependency issues in environments without core installed
        from importlib import import_module

        settings_mod = import_module("qdrant_loader_core.llm.settings")
        LLMSettings = settings_mod.LLMSettings
        return LLMSettings.from_global_config(self.global_config.to_dict())

    @staticmethod
    def _substitute_env_vars(data: Any) -> Any:
        """Recursively substitute environment variables in configuration data.

        Strings have ``$HOME`` expanded to the user's home directory and each
        ``${VAR_NAME}`` placeholder replaced by the value of that environment
        variable. Placeholders whose variable is not set are left as-is.
        Dicts and lists are processed element by element; any other value is
        returned unchanged.

        Args:
            data: Configuration data to process

        Returns:
            Processed data with environment variables substituted
        """
        if isinstance(data, str):
            # First expand $HOME if present
            if "$HOME" in data:
                data = data.replace("$HOME", os.path.expanduser("~"))

            # Then handle ${VAR_NAME} pattern
            pattern = r"\${([^}]+)}"
            matches = re.finditer(pattern, data)
            result = data
            for match in matches:
                var_name = match.group(1)
                env_value = os.getenv(var_name)
                if env_value is None:
                    # Only warn about missing variables that are commonly required
                    # Skip STATE_DB_PATH as it's often overridden in workspace mode
                    if var_name not in ["STATE_DB_PATH"]:
                        _get_logger().warning(
                            "Environment variable not found", variable=var_name
                        )
                    # Leave the unresolved placeholder in the string.
                    continue
                # If the environment variable contains $HOME, expand it
                if "$HOME" in env_value:
                    env_value = env_value.replace("$HOME", os.path.expanduser("~"))
                result = result.replace(f"${{{var_name}}}", env_value)

            return result
        elif isinstance(data, dict):
            return {k: Settings._substitute_env_vars(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [Settings._substitute_env_vars(item) for item in data]
        return data

    @classmethod
    def from_yaml(
        cls,
        config_path: Path,
        env_path: Path | None = None,
        skip_validation: bool = False,
    ) -> "Settings":
        """Load configuration from a YAML file.

        Args:
            config_path: Path to the YAML configuration file.
            env_path: Optional path to the .env file. If provided, only this file is loaded.
            skip_validation: If True, skip directory validation and creation.

        Returns:
            Settings: Loaded configuration.

        Raises:
            FileNotFoundError: If ``env_path`` is given but does not exist.
            yaml.YAMLError: If the YAML document cannot be parsed.
            ValidationError: If the parsed configuration fails validation.
            Exception: Any other error is logged (sanitized) and re-raised.
        """
        _get_logger().debug("Loading configuration from YAML", path=str(config_path))
        try:
            # Step 1: Load environment variables first
            if env_path is not None:
                # Custom env file specified - load only this file
                _get_logger().debug(
                    "Loading custom environment file", path=str(env_path)
                )
                if not env_path.exists():
                    raise FileNotFoundError(f"Environment file not found: {env_path}")
                load_dotenv(env_path, override=True)
            else:
                # Load default .env file if it exists
                _get_logger().debug("Loading default environment variables")
                load_dotenv(override=False)

            # Step 2: Load YAML config
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            with open(config_path, encoding="utf-8") as f:
                config_data = yaml.safe_load(f)

            # Step 3: Process all environment variables in config using substitution
            _get_logger().debug("Processing environment variables in configuration")
            config_data = cls._substitute_env_vars(config_data)

            # Step 4: Use multi-project parser to parse configuration
            validator = ConfigValidator()
            parser = MultiProjectConfigParser(validator)
            parsed_config = parser.parse(config_data, skip_validation=skip_validation)

            # Step 5: Create settings instance with parsed configuration
            settings = cls(
                global_config=parsed_config.global_config,
                projects_config=parsed_config.projects_config,
            )

            _get_logger().debug("Successfully created Settings instance")
            return settings

        except yaml.YAMLError as e:
            _get_logger().error(
                "Failed to parse YAML configuration",
                error=sanitize_exception_message(e),
            )
            raise
        except ValidationError as e:
            _get_logger().error(
                "Configuration validation failed",
                error=sanitize_exception_message(e),
            )
            raise
        except Exception as e:
            _get_logger().error(
                "Unexpected error loading configuration",
                error=sanitize_exception_message(e),
            )
            raise

    def to_dict(self) -> dict:
        """Convert the configuration to a dictionary.

        Returns:
            dict: Configuration as a dictionary with the keys ``"global"``
            and ``"projects"``.
        """
        return {
            "global": self.global_config.to_dict(),
            "projects": self.projects_config.to_dict(),
        }