Coverage for src/qdrant_loader/config/__init__.py: 84%

162 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-11 07:21 +0000

1"""Configuration module. 

2 

3This module provides the main configuration interface for the application. 

4It combines global settings with source-specific configurations. 

5""" 

6 

7import os 

8import re 

9from pathlib import Path 

10from typing import Any, Optional 

11 

12import yaml 

13from dotenv import load_dotenv 

14from pydantic import Field, ValidationError, model_validator 

15from pydantic_settings import BaseSettings, SettingsConfigDict 

16 

17from ..utils.logging import LoggingConfig 

18from .chunking import ChunkingConfig 

19 

20# Import consolidated configs 

21from .global_config import GlobalConfig, SemanticAnalysisConfig 

22 

23# Import multi-project support 

24from .models import ( 

25 ParsedConfig, 

26 ProjectConfig, 

27 ProjectContext, 

28 ProjectDetail, 

29 ProjectInfo, 

30 ProjectsConfig, 

31 ProjectStats, 

32) 

33from .parser import MultiProjectConfigParser 

34from .sources import SourcesConfig 

35from .state import StateManagementConfig 

36from .validator import ConfigValidator 

37from .workspace import WorkspaceConfig 

38 

39# Load environment variables from .env file 

40load_dotenv(override=False) 

41 

42 

def _get_logger():
    """Return the logger for this module.

    The logger is looked up on every call through ``LoggingConfig`` rather
    than being stored in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

45 

46 

47# Lazy import function for connector configs 

def _get_connector_configs():
    """Import the connector config classes on demand and return them by name.

    The imports live inside the function body (instead of at module level) to
    avoid circular dependencies between this package and the connectors.

    Returns:
        dict: Mapping of class name to connector configuration class.
    """
    from ..connectors.confluence.config import ConfluenceSpaceConfig
    from ..connectors.git.config import GitAuthConfig, GitRepoConfig
    from ..connectors.jira.config import JiraProjectConfig
    from ..connectors.publicdocs.config import PublicDocsSourceConfig, SelectorsConfig

    # Keyword form keeps the same keys, in the same order, as a dict literal.
    connector_configs = dict(
        ConfluenceSpaceConfig=ConfluenceSpaceConfig,
        GitAuthConfig=GitAuthConfig,
        GitRepoConfig=GitRepoConfig,
        JiraProjectConfig=JiraProjectConfig,
        PublicDocsSourceConfig=PublicDocsSourceConfig,
        SelectorsConfig=SelectorsConfig,
    )
    return connector_configs

63 

64 

# Public API of the configuration package. The connector config names listed
# here are not bound at import time; they are resolved lazily through the
# module-level ``__getattr__``.
__all__ = [
    "ChunkingConfig",
    "ConfluenceSpaceConfig",
    "GitAuthConfig",
    "GitRepoConfig",
    "GlobalConfig",
    "JiraProjectConfig",
    "PublicDocsSourceConfig",
    "SelectorsConfig",
    "SemanticAnalysisConfig",
    "Settings",
    "SourcesConfig",
    "StateManagementConfig",
    # Multi-project models, parser and validator
    "ProjectContext",
    "ProjectConfig",
    "ProjectsConfig",
    "ParsedConfig",
    "ProjectStats",
    "ProjectInfo",
    "ProjectDetail",
    "MultiProjectConfigParser",
    "ConfigValidator",
    # Module-level accessor and initialisation functions
    "get_global_config",
    "get_settings",
    "initialize_config",
    "initialize_config_with_workspace",
]

94 

95 

96# Add lazy loading for connector configs 

# Names served lazily by the module-level ``__getattr__`` below.
# Keep in sync with the keys returned by ``_get_connector_configs()``.
_LAZY_CONNECTOR_CONFIG_NAMES = frozenset(
    {
        "ConfluenceSpaceConfig",
        "GitAuthConfig",
        "GitRepoConfig",
        "JiraProjectConfig",
        "PublicDocsSourceConfig",
        "SelectorsConfig",
    }
)


def __getattr__(name):
    """Lazy import connector configs to avoid circular dependencies.

    Python calls this hook for every attribute that is not found on the
    module through normal lookup, including dunder probes and ``hasattr()``
    checks. The name is therefore checked against the fixed set of lazy names
    first, so an unrelated lookup never triggers the connector imports and
    always fails with a plain ``AttributeError``.

    Args:
        name: Attribute name being looked up on this module.

    Returns:
        The connector configuration class registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily provided
            connector configuration classes.
    """
    if name not in _LAZY_CONNECTOR_CONFIG_NAMES:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    # Only now pay for (and risk) importing the connector packages.
    return _get_connector_configs()[name]

103 

104 

# Process-wide settings instance; stays None until one of the
# initialize_config* functions has stored a loaded Settings object here.
_global_settings: Optional["Settings"] = None


def get_settings() -> "Settings":
    """Return the settings instance stored at module level.

    Returns:
        Settings: The global settings instance.

    Raises:
        RuntimeError: If no settings instance has been stored yet.
    """
    current = _global_settings
    if current is not None:
        return current
    raise RuntimeError(
        "Settings not initialized. Call initialize_config() or initialize_config_with_workspace() first."
    )

119 

120 

def get_global_config() -> GlobalConfig:
    """Return the ``global_config`` of the current settings instance.

    Returns:
        GlobalConfig: The global configuration instance.

    Raises:
        RuntimeError: Propagated from ``get_settings()`` when the settings
            have not been initialized.
    """
    settings = get_settings()
    return settings.global_config

128 

129 

def initialize_config(
    yaml_path: Path, env_path: Path | None = None, skip_validation: bool = False
) -> None:
    """Load settings from a YAML file and store them as the global instance.

    Args:
        yaml_path: Path to the YAML configuration file.
        env_path: Optional path to the .env file.
        skip_validation: If True, skip directory validation and creation.

    Raises:
        Exception: Any error raised while loading is logged and then
            re-raised unchanged.
    """
    global _global_settings
    try:
        # Fields attached to the start-up log entry.
        start_fields = {
            "yaml_path": str(yaml_path),
            "env_path": str(env_path) if env_path else None,
        }
        _get_logger().debug("Initializing configuration", **start_fields)

        loaded_settings = Settings.from_yaml(
            yaml_path, env_path=env_path, skip_validation=skip_validation
        )
        _global_settings = loaded_settings

        _get_logger().debug("Successfully initialized configuration")
    except Exception as exc:
        failure_fields = {"error": str(exc), "yaml_path": str(yaml_path)}
        _get_logger().error("Failed to initialize configuration", **failure_fields)
        raise

158 

159 

def initialize_config_with_workspace(
    workspace_config: WorkspaceConfig, skip_validation: bool = False
) -> None:
    """Load settings from a workspace and store them as the global instance.

    The configuration and .env paths are taken from ``workspace_config``.
    After loading, the state database path is always replaced with the
    workspace's database path; a warning is logged when that replaces a
    different, non-empty, non-``:memory:`` path from the configuration.

    Args:
        workspace_config: Workspace configuration with paths and settings
        skip_validation: If True, skip directory validation and creation

    Raises:
        Exception: Any error raised while loading is logged and then
            re-raised unchanged.
    """
    global _global_settings
    try:
        start_fields = {
            "workspace": str(workspace_config.workspace_path),
            "config_path": str(workspace_config.config_path),
            "env_path": (
                str(workspace_config.env_path) if workspace_config.env_path else None
            ),
        }
        _get_logger().debug("Initializing configuration with workspace", **start_fields)

        # Publish the loaded settings right away, before the database path
        # is adjusted below.
        _global_settings = Settings.from_yaml(
            workspace_config.config_path,
            env_path=workspace_config.env_path,
            skip_validation=skip_validation,
        )

        state_management = _global_settings.global_config.state_management
        configured_db_path = state_management.database_path
        workspace_db_path = str(workspace_config.database_path)

        # Warn only when a real, explicitly different path is being replaced.
        has_explicit_path = bool(configured_db_path) and configured_db_path != ":memory:"
        if has_explicit_path and configured_db_path != workspace_db_path:
            _get_logger().warning(
                "Database path in config.yaml is ignored in workspace mode",
                config_database_path=configured_db_path,
                workspace_database_path=workspace_db_path,
            )

        # The workspace path always wins.
        state_management.database_path = workspace_db_path

        _get_logger().debug("Set workspace database path", database_path=workspace_db_path)
        _get_logger().debug(
            "Successfully initialized configuration with workspace",
            workspace=str(workspace_config.workspace_path),
        )
    except Exception as exc:
        failure_fields = {
            "error": str(exc),
            "workspace": str(workspace_config.workspace_path),
        }
        _get_logger().error(
            "Failed to initialize configuration with workspace", **failure_fields
        )
        raise

225 

226 

class Settings(BaseSettings):
    """Main configuration class combining global and source-specific settings.

    Instances are normally created through :meth:`from_yaml`, which loads
    environment variables, reads the YAML file, substitutes ``${VAR}``
    placeholders and passes the result to the multi-project parser.
    """

    # Configuration objects - these are the only fields we need
    global_config: GlobalConfig = Field(
        default_factory=GlobalConfig, description="Global configuration settings"
    )
    projects_config: ProjectsConfig = Field(
        default_factory=ProjectsConfig, description="Multi-project configurations"
    )

    model_config = SettingsConfigDict(
        env_file=None,  # Disable automatic .env loading - we handle this manually
        env_file_encoding="utf-8",
        extra="allow",
    )

    @model_validator(mode="after")  # type: ignore
    def validate_source_configs(self) -> "Settings":
        """Validate that required configuration is present for configured sources.

        Returns:
            Settings: The validated instance itself.

        Raises:
            ValueError: If the Qdrant section, its URL or its collection name
                is missing or empty.
        """
        _get_logger().debug("Validating source configurations")

        # Validate that qdrant configuration is present in global config
        if not self.global_config.qdrant:
            raise ValueError("Qdrant configuration is required in global config")

        # Validate that required fields are not empty after variable substitution
        if not self.global_config.qdrant.url:
            raise ValueError(
                "Qdrant URL is required but was not provided or substituted"
            )

        if not self.global_config.qdrant.collection_name:
            raise ValueError(
                "Qdrant collection name is required but was not provided or substituted"
            )

        # Note: Source validation is now handled at the project level
        # Each project's sources are validated when the project is processed

        _get_logger().debug("Source configuration validation successful")
        return self

    @property
    def qdrant_url(self) -> str:
        """Get the Qdrant URL from global configuration.

        Raises:
            ValueError: If no Qdrant configuration is available.
        """
        if not self.global_config.qdrant:
            raise ValueError("Qdrant configuration is not available")
        return self.global_config.qdrant.url

    @property
    def qdrant_api_key(self) -> str | None:
        """Get the Qdrant API key from global configuration.

        Returns ``None`` when no Qdrant configuration is available.
        """
        if not self.global_config.qdrant:
            return None
        return self.global_config.qdrant.api_key

    @property
    def qdrant_collection_name(self) -> str:
        """Get the Qdrant collection name from global configuration.

        Raises:
            ValueError: If no Qdrant configuration is available.
        """
        if not self.global_config.qdrant:
            raise ValueError("Qdrant configuration is not available")
        return self.global_config.qdrant.collection_name

    @property
    def openai_api_key(self) -> str:
        """Get the OpenAI API key from embedding configuration.

        Raises:
            ValueError: If the embedding configuration has no API key.
        """
        api_key = self.global_config.embedding.api_key
        if not api_key:
            raise ValueError(
                "OpenAI API key is required but was not provided or substituted in embedding configuration"
            )
        return api_key

    @property
    def state_db_path(self) -> str:
        """Get the state database path from global configuration."""
        return self.global_config.state_management.database_path

    @property
    def llm_settings(self):
        """Provider-agnostic LLM settings derived from global configuration.

        Uses `global.llm` when present; otherwise maps legacy fields.
        """
        # Import lazily to avoid hard dependency issues in environments without core installed
        from importlib import import_module

        settings_mod = import_module("qdrant_loader_core.llm.settings")
        LLMSettings = settings_mod.LLMSettings
        return LLMSettings.from_global_config(self.global_config.to_dict())

    @staticmethod
    def _substitute_env_vars(data: Any) -> Any:
        """Recursively substitute environment variables in configuration data.

        In strings, every literal ``$HOME`` is replaced with the expanded
        home directory, then each ``${VAR_NAME}`` placeholder is replaced
        with the value of that environment variable. A placeholder whose
        variable is not set is left in place. Dicts and lists are processed
        element by element; any other value is returned unchanged.

        Args:
            data: Configuration data to process

        Returns:
            Processed data with environment variables substituted
        """
        if isinstance(data, str):
            # First expand $HOME if present
            if "$HOME" in data:
                data = data.replace("$HOME", os.path.expanduser("~"))

            # Then handle ${VAR_NAME} pattern
            pattern = r"\${([^}]+)}"
            matches = re.finditer(pattern, data)
            result = data
            for match in matches:
                var_name = match.group(1)
                env_value = os.getenv(var_name)
                if env_value is None:
                    # Only warn about missing variables that are commonly required
                    # Skip STATE_DB_PATH as it's often overridden in workspace mode
                    if var_name not in ["STATE_DB_PATH"]:
                        _get_logger().warning(
                            "Environment variable not found", variable=var_name
                        )
                    continue
                # If the environment variable contains $HOME, expand it
                if "$HOME" in env_value:
                    env_value = env_value.replace("$HOME", os.path.expanduser("~"))
                result = result.replace(f"${{{var_name}}}", env_value)

            return result
        elif isinstance(data, dict):
            return {k: Settings._substitute_env_vars(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [Settings._substitute_env_vars(item) for item in data]
        return data

    @classmethod
    def from_yaml(
        cls,
        config_path: Path,
        env_path: Path | None = None,
        skip_validation: bool = False,
    ) -> "Settings":
        """Load configuration from a YAML file.

        Args:
            config_path: Path to the YAML configuration file.
            env_path: Optional path to the .env file. If provided, only this file is loaded.
            skip_validation: If True, skip directory validation and creation.

        Returns:
            Settings: Loaded configuration.

        Raises:
            FileNotFoundError: If ``env_path`` is given but does not exist.
            yaml.YAMLError: If the YAML file cannot be parsed.
            ValidationError: If the parsed configuration fails validation.

        Every exception is logged and then re-raised unchanged.
        """
        _get_logger().debug("Loading configuration from YAML", path=str(config_path))
        try:
            # Step 1: Load environment variables first
            if env_path is not None:
                # Custom env file specified - load only this file
                _get_logger().debug(
                    "Loading custom environment file", path=str(env_path)
                )
                if not env_path.exists():
                    raise FileNotFoundError(f"Environment file not found: {env_path}")
                load_dotenv(env_path, override=True)
            else:
                # Load default .env file if it exists
                _get_logger().debug("Loading default environment variables")
                load_dotenv(override=False)

            # Step 2: Load YAML config
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding.
            with open(config_path, encoding="utf-8") as f:
                config_data = yaml.safe_load(f)

            # Step 3: Process all environment variables in config using substitution
            _get_logger().debug("Processing environment variables in configuration")
            config_data = cls._substitute_env_vars(config_data)

            # Step 4: Use multi-project parser to parse configuration
            validator = ConfigValidator()
            parser = MultiProjectConfigParser(validator)
            parsed_config = parser.parse(config_data, skip_validation=skip_validation)

            # Step 5: Create settings instance with parsed configuration
            settings = cls(
                global_config=parsed_config.global_config,
                projects_config=parsed_config.projects_config,
            )

            _get_logger().debug("Successfully created Settings instance")
            return settings

        except yaml.YAMLError as e:
            _get_logger().error("Failed to parse YAML configuration", error=str(e))
            raise
        except ValidationError as e:
            _get_logger().error("Configuration validation failed", error=str(e))
            raise
        except Exception as e:
            _get_logger().error("Unexpected error loading configuration", error=str(e))
            raise

    def to_dict(self) -> dict:
        """Convert the configuration to a dictionary.

        Returns:
            dict: Configuration as a dictionary with the keys ``"global"``
            and ``"projects"``.
        """
        return {
            "global": self.global_config.to_dict(),
            "projects": self.projects_config.to_dict(),
        }