Coverage for src / qdrant_loader / config / __init__.py: 86%
178 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
1"""Configuration module.
3This module provides the main configuration interface for the application.
4It combines global settings with source-specific configurations.
5"""
7import os
8import re
9from pathlib import Path
10from typing import Any, Optional
12import yaml
13from dotenv import load_dotenv
14from pydantic import Field, ValidationError, model_validator
15from pydantic_settings import BaseSettings, SettingsConfigDict
17from ..utils.logging import LoggingConfig
18from ..utils.sensitive import sanitize_exception_message
19from .chunking import ChunkingConfig
21# Import consolidated configs
22from .global_config import GlobalConfig, SemanticAnalysisConfig
24# Import multi-project support
25from .models import (
26 ParsedConfig,
27 ProjectConfig,
28 ProjectContext,
29 ProjectDetail,
30 ProjectInfo,
31 ProjectsConfig,
32 ProjectStats,
33)
34from .parser import MultiProjectConfigParser
35from .sources import SourcesConfig
36from .state import StateManagementConfig
37from .validator import ConfigValidator
38from .workspace import WorkspaceConfig
# Load environment variables from .env file.
# This runs as an import-time side effect of the package. override=False means
# variables that are already present in the process environment are left
# untouched; only names missing from the environment are filled in.
load_dotenv(override=False)
def _get_logger():
    """Return the logger for this module.

    The logger is looked up through ``LoggingConfig`` on every call rather
    than being stored in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger
48# Lazy import function for connector configs
def _get_connector_configs():
    """Import the connector config classes on demand and index them by name.

    The imports are performed inside the function body (not at module top
    level) to avoid circular dependencies with the connector packages.

    Returns:
        dict: Mapping of class name to connector configuration class.
    """
    from ..connectors.confluence.config import ConfluenceSpaceConfig
    from ..connectors.git.config import GitAuthConfig, GitRepoConfig
    from ..connectors.jira.config import JiraProjectConfig
    from ..connectors.publicdocs.config import PublicDocsSourceConfig, SelectorsConfig

    registry = dict(
        ConfluenceSpaceConfig=ConfluenceSpaceConfig,
        GitAuthConfig=GitAuthConfig,
        GitRepoConfig=GitRepoConfig,
        JiraProjectConfig=JiraProjectConfig,
        PublicDocsSourceConfig=PublicDocsSourceConfig,
        SelectorsConfig=SelectorsConfig,
    )
    return registry
# Public API of the configuration package.
# NOTE: the connector config names (ConfluenceSpaceConfig, GitAuthConfig,
# GitRepoConfig, JiraProjectConfig, PublicDocsSourceConfig, SelectorsConfig)
# are not imported at module top level; they are resolved on first access by
# the module-level ``__getattr__`` defined below.
__all__ = [
    "ChunkingConfig",
    "ConfluenceSpaceConfig",
    "GitAuthConfig",
    "GitRepoConfig",
    "GlobalConfig",
    "JiraProjectConfig",
    "PublicDocsSourceConfig",
    "SelectorsConfig",
    "SemanticAnalysisConfig",
    "Settings",
    "SourcesConfig",
    "StateManagementConfig",
    # Multi-project support
    "ProjectContext",
    "ProjectConfig",
    "ProjectsConfig",
    "ParsedConfig",
    "ProjectStats",
    "ProjectInfo",
    "ProjectDetail",
    "MultiProjectConfigParser",
    "ConfigValidator",
    # Functions
    "get_global_config",
    "get_settings",
    "initialize_config",
    "initialize_config_with_workspace",
]
97# Add lazy loading for connector configs
# Names that are served lazily by ``__getattr__``. This set must mirror the
# keys of the mapping returned by ``_get_connector_configs()``.
_LAZY_CONNECTOR_CONFIG_NAMES = frozenset(
    {
        "ConfluenceSpaceConfig",
        "GitAuthConfig",
        "GitRepoConfig",
        "JiraProjectConfig",
        "PublicDocsSourceConfig",
        "SelectorsConfig",
    }
)


def __getattr__(name):
    """Lazy import connector configs to avoid circular dependencies.

    Python calls this hook only when normal module attribute lookup fails.

    Args:
        name: Attribute name being looked up on this module.

    Returns:
        The connector configuration class registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily provided
            connector configuration classes.
    """
    # Reject unrelated names *before* importing anything. Probes such as
    # hasattr(module, "__wrapped__") or the "__all__" lookup of a star import
    # must not pull in every connector package (and must not turn a connector
    # import problem into something other than AttributeError).
    if name not in _LAZY_CONNECTOR_CONFIG_NAMES:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    connector_configs = _get_connector_configs()
    if name in connector_configs:
        return connector_configs[name]
    # Only reachable if the name set above and the mapping drift apart.
    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
# Process-wide settings instance; stays None until one of the
# initialize_config* functions has populated it.
_global_settings: Optional["Settings"] = None


def get_settings() -> "Settings":
    """Return the process-wide settings instance.

    Returns:
        Settings: The instance stored by ``initialize_config()`` or
        ``initialize_config_with_workspace()``.

    Raises:
        RuntimeError: If no settings instance has been stored yet.
    """
    if _global_settings is not None:
        return _global_settings
    raise RuntimeError(
        "Settings not initialized. Call initialize_config() or initialize_config_with_workspace() first."
    )
def get_global_config() -> GlobalConfig:
    """Return the ``global_config`` section of the current settings.

    Returns:
        GlobalConfig: The global configuration held by the settings instance.

    Raises:
        RuntimeError: Propagated from ``get_settings()`` when the settings
            have not been initialized.
    """
    current_settings = get_settings()
    return current_settings.global_config
def initialize_config(
    yaml_path: Path, env_path: Path | None = None, skip_validation: bool = False
) -> None:
    """Load the configuration from YAML and store it as the global settings.

    Args:
        yaml_path: Path to the YAML configuration file.
        env_path: Optional path to the .env file.
        skip_validation: If True, skip directory validation and creation.

    Raises:
        Exception: Any error raised while loading is logged (with a sanitized
            message) and then re-raised unchanged.
    """
    global _global_settings
    try:
        log = _get_logger()
        env_path_text = str(env_path) if env_path else None
        log.debug(
            "Initializing configuration",
            yaml_path=str(yaml_path),
            env_path=env_path_text,
        )
        _global_settings = Settings.from_yaml(
            yaml_path, env_path=env_path, skip_validation=skip_validation
        )
        log.debug("Successfully initialized configuration")
    except Exception as e:
        # Sanitize before logging so sensitive values do not reach the log.
        sanitized = sanitize_exception_message(e)
        _get_logger().error(
            "Failed to initialize configuration",
            error=sanitized,
            yaml_path=str(yaml_path),
        )
        raise
def initialize_config_with_workspace(
    workspace_config: WorkspaceConfig, skip_validation: bool = False
) -> None:
    """Load the configuration from a workspace and store it as the global settings.

    The state database path of the loaded settings is always replaced by the
    workspace's own database path; a warning is logged when that discards a
    different, non-empty, non-``:memory:`` path that came from the config.

    Args:
        workspace_config: Workspace configuration with paths and settings
        skip_validation: If True, skip directory validation and creation

    Raises:
        Exception: Any error raised while loading is logged (with a sanitized
            message) and then re-raised unchanged.
    """
    global _global_settings
    try:
        log = _get_logger()
        env_path_text = (
            str(workspace_config.env_path) if workspace_config.env_path else None
        )
        log.debug(
            "Initializing configuration with workspace",
            workspace=str(workspace_config.workspace_path),
            config_path=str(workspace_config.config_path),
            env_path=env_path_text,
        )

        # Build the settings from the files the workspace points at.
        _global_settings = Settings.from_yaml(
            workspace_config.config_path,
            env_path=workspace_config.env_path,
            skip_validation=skip_validation,
        )

        state_cfg = _global_settings.global_config.state_management
        configured_db_path = state_cfg.database_path
        workspace_db_path = str(workspace_config.database_path)

        # Tell the user when a path they configured is about to be discarded.
        if configured_db_path and configured_db_path not in (
            ":memory:",
            workspace_db_path,
        ):
            log.warning(
                "Database path in config.yaml is ignored in workspace mode",
                config_database_path=configured_db_path,
                workspace_database_path=workspace_db_path,
            )

        # The workspace path always wins.
        state_cfg.database_path = workspace_db_path
        log.debug(
            "Set workspace database path",
            database_path=workspace_db_path,
        )

        log.debug(
            "Successfully initialized configuration with workspace",
            workspace=str(workspace_config.workspace_path),
        )
    except Exception as e:
        # Sanitize before logging so sensitive values do not reach the log.
        sanitized = sanitize_exception_message(e)
        _get_logger().error(
            "Failed to initialize configuration with workspace",
            error=sanitized,
            workspace=str(workspace_config.workspace_path),
        )
        raise
# Matches a literal ``$HOME`` that is not merely the prefix of a longer
# variable name (e.g. ``$HOMEBREW_PREFIX`` must be left untouched).
_HOME_REFERENCE = re.compile(r"\$HOME(?![A-Za-z0-9_])")


def _expand_home_reference(text: str) -> str:
    """Replace every standalone ``$HOME`` in *text* with the user's home directory."""
    home_dir = os.path.expanduser("~")
    # A callable replacement is used so that backslashes in the home path are
    # inserted literally instead of being parsed as regex escape sequences.
    return _HOME_REFERENCE.sub(lambda _match: home_dir, text)


class Settings(BaseSettings):
    """Main configuration class combining global and source-specific settings."""

    # Configuration objects - these are the only fields we need
    global_config: GlobalConfig = Field(
        default_factory=GlobalConfig, description="Global configuration settings"
    )
    projects_config: ProjectsConfig = Field(
        default_factory=ProjectsConfig, description="Multi-project configurations"
    )

    model_config = SettingsConfigDict(
        env_file=None,  # Disable automatic .env loading - we handle this manually
        env_file_encoding="utf-8",
        extra="allow",
    )

    @model_validator(mode="after")  # type: ignore
    def validate_source_configs(self) -> "Settings":
        """Validate that required configuration is present for configured sources.

        Environment-variable fallbacks are applied first, then the Qdrant URL
        and collection name are checked for being non-empty.

        Returns:
            Settings: This instance (as required of an ``after`` validator).

        Raises:
            ValueError: If the Qdrant URL or collection name is empty.
        """
        _get_logger().debug("Validating source configurations")

        # Auto-resolve environment variables as fallbacks
        self._auto_resolve_env_vars()

        # Validate that required fields are not empty after variable substitution
        if not self.global_config.qdrant.url:
            raise ValueError(
                "Qdrant URL is required but was not provided or substituted"
            )

        if not self.global_config.qdrant.collection_name:
            raise ValueError(
                "Qdrant collection name is required but was not provided or substituted"
            )

        # Note: Source validation is now handled at the project level
        # Each project's sources are validated when the project is processed

        _get_logger().debug("Source configuration validation successful")
        return self

    def _auto_resolve_env_vars(self) -> None:
        """Auto-resolve well-known environment variables as fallbacks.

        Priority: config file value > environment variable > default.
        Only fills in values that were not explicitly set in config.

        Note: detection uses default-value sentinels, so explicitly setting a
        config value equal to the default (e.g. url: http://localhost:6333)
        will still be overridden by the environment variable.
        """
        # OPENAI_API_KEY → embedding.api_key and llm.api_key
        openai_key = os.getenv("OPENAI_API_KEY")
        if openai_key:
            if not self.global_config.embedding.api_key:
                self.global_config.embedding.api_key = openai_key
            # The llm section is only touched when it is a non-empty dict.
            if self.global_config.llm and isinstance(self.global_config.llm, dict):
                if not self.global_config.llm.get("api_key"):
                    self.global_config.llm["api_key"] = openai_key

        # QDRANT_URL → qdrant.url (override only if still default)
        qdrant_url = os.getenv("QDRANT_URL")
        if qdrant_url and self.global_config.qdrant.url == "http://localhost:6333":
            self.global_config.qdrant.url = qdrant_url

        # QDRANT_API_KEY → qdrant.api_key
        qdrant_api_key = os.getenv("QDRANT_API_KEY")
        if qdrant_api_key and not self.global_config.qdrant.api_key:
            self.global_config.qdrant.api_key = qdrant_api_key

        # QDRANT_COLLECTION_NAME → qdrant.collection_name
        collection = os.getenv("QDRANT_COLLECTION_NAME")
        if collection and self.global_config.qdrant.collection_name == "documents":
            self.global_config.qdrant.collection_name = collection

        # STATE_DB_PATH → state_management.database_path
        # Note: In workspace mode this is overridden by workspace_config.database_path
        state_db = os.getenv("STATE_DB_PATH")
        if (
            state_db
            and self.global_config.state_management.database_path == "./state.db"
        ):
            self.global_config.state_management.database_path = state_db

    @property
    def qdrant_url(self) -> str:
        """Get the Qdrant URL from global configuration."""
        return self.global_config.qdrant.url

    @property
    def qdrant_api_key(self) -> str | None:
        """Get the Qdrant API key from global configuration."""
        return self.global_config.qdrant.api_key

    @property
    def qdrant_collection_name(self) -> str:
        """Get the Qdrant collection name from global configuration."""
        return self.global_config.qdrant.collection_name

    @property
    def openai_api_key(self) -> str:
        """Get the OpenAI API key from embedding configuration.

        Raises:
            ValueError: If the embedding configuration has no API key.
        """
        api_key = self.global_config.embedding.api_key
        if not api_key:
            raise ValueError(
                "OpenAI API key is required but was not provided or substituted in embedding configuration"
            )
        return api_key

    @property
    def state_db_path(self) -> str:
        """Get the state database path from global configuration."""
        return self.global_config.state_management.database_path

    @property
    def llm_settings(self):
        """Provider-agnostic LLM settings derived from global configuration.

        Uses `global.llm` when present; otherwise maps legacy fields.
        """
        # Import lazily to avoid hard dependency issues in environments without core installed
        from importlib import import_module

        settings_mod = import_module("qdrant_loader_core.llm.settings")
        LLMSettings = settings_mod.LLMSettings
        return LLMSettings.from_global_config(self.global_config.to_dict())

    @staticmethod
    def _substitute_env_vars(data: Any) -> Any:
        """Recursively substitute environment variables in configuration data.

        Strings get standalone ``$HOME`` references expanded and ``${VAR_NAME}``
        placeholders replaced by the variable's value. Placeholders whose
        variable is unset are left in place. Dicts and lists are processed
        element by element; every other value is returned unchanged.

        Args:
            data: Configuration data to process

        Returns:
            Processed data with environment variables substituted
        """
        if isinstance(data, str):
            # First expand $HOME if present. Only a standalone $HOME is
            # expanded, so longer names such as $HOMEBREW_PREFIX survive intact.
            data = _expand_home_reference(data)

            # Then handle ${VAR_NAME} pattern
            pattern = r"\${([^}]+)}"
            result = data
            for match in re.finditer(pattern, data):
                var_name = match.group(1)
                env_value = os.getenv(var_name)
                if env_value is None:
                    # Only warn about missing variables that are commonly required
                    # Skip STATE_DB_PATH as it's often overridden in workspace mode
                    if var_name not in ["STATE_DB_PATH"]:
                        _get_logger().warning(
                            "Environment variable not found", variable=var_name
                        )
                    continue
                # If the environment variable contains $HOME, expand it
                env_value = _expand_home_reference(env_value)
                result = result.replace(f"${{{var_name}}}", env_value)

            return result
        elif isinstance(data, dict):
            return {k: Settings._substitute_env_vars(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [Settings._substitute_env_vars(item) for item in data]
        return data

    @classmethod
    def from_yaml(
        cls,
        config_path: Path,
        env_path: Path | None = None,
        skip_validation: bool = False,
    ) -> "Settings":
        """Load configuration from a YAML file.

        Args:
            config_path: Path to the YAML configuration file.
            env_path: Optional path to the .env file. If provided, only this file is loaded.
            skip_validation: If True, skip directory validation and creation.

        Returns:
            Settings: Loaded configuration.

        Raises:
            FileNotFoundError: If ``env_path`` is given but does not exist.
            yaml.YAMLError: If the YAML file cannot be parsed.
            ValidationError: If the parsed configuration fails validation.
            Exception: Any other error; every error is logged with a
                sanitized message and re-raised unchanged.
        """
        _get_logger().debug("Loading configuration from YAML", path=str(config_path))
        try:
            # Step 1: Load environment variables first
            if env_path is not None:
                # Custom env file specified - load only this file
                _get_logger().debug(
                    "Loading custom environment file", path=str(env_path)
                )
                if not env_path.exists():
                    raise FileNotFoundError(f"Environment file not found: {env_path}")
                load_dotenv(env_path, override=True)
            else:
                # Load default .env file if it exists
                _get_logger().debug("Loading default environment variables")
                load_dotenv(override=False)

            # Step 2: Load YAML config
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding (matches env_file_encoding above).
            with open(config_path, encoding="utf-8") as f:
                config_data = yaml.safe_load(f)

            # Step 3: Process all environment variables in config using substitution
            _get_logger().debug("Processing environment variables in configuration")
            config_data = cls._substitute_env_vars(config_data)

            # Step 4: Use multi-project parser to parse configuration
            validator = ConfigValidator()
            parser = MultiProjectConfigParser(validator)
            parsed_config = parser.parse(config_data, skip_validation=skip_validation)

            # Step 5: Create settings instance with parsed configuration
            settings = cls(
                global_config=parsed_config.global_config,
                projects_config=parsed_config.projects_config,
            )

            _get_logger().debug("Successfully created Settings instance")
            return settings

        except yaml.YAMLError as e:
            _get_logger().error(
                "Failed to parse YAML configuration",
                error=sanitize_exception_message(e),
            )
            raise
        except ValidationError as e:
            _get_logger().error(
                "Configuration validation failed",
                error=sanitize_exception_message(e),
            )
            raise
        except Exception as e:
            _get_logger().error(
                "Unexpected error loading configuration",
                error=sanitize_exception_message(e),
            )
            raise

    def to_dict(self) -> dict:
        """Convert the configuration to a dictionary.

        Returns:
            dict: Configuration as a dictionary with the keys ``"global"``
            and ``"projects"``.
        """
        return {
            "global": self.global_config.to_dict(),
            "projects": self.projects_config.to_dict(),
        }