Coverage for src / qdrant_loader / config / parser.py: 100%

89 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:40 +0000

1"""Multi-project configuration parser. 

2 

3This module provides parsing functionality for multi-project configurations. 

4""" 

5 

6import re 

7from typing import Any 

8 

9from pydantic import ValidationError 

10 

11from ..utils.logging import LoggingConfig 

12from ..utils.sensitive import sanitize_exception_message 

13from .global_config import GlobalConfig 

14from .models import ParsedConfig, ProjectConfig, ProjectsConfig 

15from .sources import SourcesConfig 

16from .validator import ConfigValidator 

17 

18 

def _get_logger():
    """Return the logger that LoggingConfig provides for this module's name.

    The logger is looked up on every call rather than cached at import time.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

21 

22 

class MultiProjectConfigParser:
    """Parser for multi-project configurations.

    Turns the raw configuration mapping into a ``ParsedConfig`` holding the
    global configuration and one ``ProjectConfig`` per entry of the
    ``projects`` section.
    """

    # A project ID starts with an ASCII letter and continues with letters,
    # digits, underscores or hyphens. Compiled once and used with
    # ``fullmatch`` so the whole string must match (no trailing newline).
    _PROJECT_ID_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    def __init__(self, validator: ConfigValidator):
        """Initialize the parser with a validator.

        Args:
            validator: Configuration validator instance
        """
        self.validator = validator

    def parse(
        self, config_data: dict[str, Any], skip_validation: bool = False
    ) -> ParsedConfig:
        """Parse configuration with multi-project support.

        Supports two formats:
        1. Standard: config with 'projects' section
        2. Simplified: config with top-level 'sources' (auto-wrapped into
           a default project)

        Args:
            config_data: Raw configuration data from YAML
            skip_validation: Forwarded to ``GlobalConfig`` when the global
                section is parsed

        Returns:
            ParsedConfig: Parsed configuration with project information

        Raises:
            ValidationError: If configuration is invalid
            ValueError: If a project ID has an invalid format
        """
        _get_logger().debug("Starting configuration parsing")

        # Auto-wrap simplified format: top-level 'sources' -> projects.default.
        # Non-dict input is passed on unchanged so the validator reports it.
        if isinstance(config_data, dict):
            config_data = self._normalize_config(config_data)

        # Validate configuration structure
        self.validator.validate_structure(config_data)

        # Parse global configuration
        global_config = self._parse_global_config(
            config_data.get("global", {}), skip_validation
        )

        # Parse projects
        projects_config = self._parse_projects(config_data, global_config)

        _get_logger().debug(
            "Configuration parsing completed",
            project_count=len(projects_config.projects),
        )

        return ParsedConfig(
            global_config=global_config,
            projects_config=projects_config,
        )

    def _normalize_config(self, config_data: dict[str, Any]) -> dict[str, Any]:
        """Normalize simplified config format to standard format.

        If config has top-level 'sources' but no 'projects', wrap sources
        into a default project automatically. When both keys are present a
        warning is logged and the data is returned unchanged.

        Args:
            config_data: Raw configuration data

        Returns:
            A new dict with a 'projects' section when wrapping happened,
            otherwise the same ``config_data`` object.
        """
        has_projects = "projects" in config_data
        has_sources = "sources" in config_data

        if has_sources and has_projects:
            _get_logger().warning(
                "Config has both 'projects' and top-level 'sources'; "
                "top-level 'sources' will be ignored"
            )

        if has_sources and not has_projects:
            _get_logger().debug(
                "Simplified config detected: wrapping top-level 'sources' "
                "into default project"
            )
            # Build a new top-level dict so the caller's mapping is not mutated.
            result = {k: v for k, v in config_data.items() if k != "sources"}
            result["projects"] = {
                "default": {
                    "display_name": "Default Project",
                    "sources": config_data["sources"],
                }
            }
            return result

        return config_data

    def _parse_global_config(
        self, global_data: dict[str, Any], skip_validation: bool = False
    ) -> GlobalConfig:
        """Parse global configuration section.

        Args:
            global_data: Global configuration data
            skip_validation: Whether to skip validation during parsing

        Returns:
            GlobalConfig: Parsed global configuration

        Raises:
            ValidationError: Re-raised after logging a sanitized message
        """
        try:
            return GlobalConfig(**global_data, skip_validation=skip_validation)
        except ValidationError as e:
            # Log a sanitized form of the error, then propagate the original.
            _get_logger().error(
                "Failed to parse global configuration",
                error=sanitize_exception_message(e),
            )
            raise

    def _parse_projects(
        self, config_data: dict[str, Any], global_config: GlobalConfig
    ) -> ProjectsConfig:
        """Parse project configurations.

        Args:
            config_data: Raw configuration data
            global_config: Parsed global configuration

        Returns:
            ProjectsConfig: Parsed projects configuration
        """
        projects_config = ProjectsConfig()

        # Handle multi-project format
        projects_data = config_data.get("projects", {})
        for project_id, project_data in projects_data.items():
            project_config = self._parse_project_config(
                project_id, project_data, global_config
            )
            projects_config.add_project(project_config)
            _get_logger().debug("Parsed project configuration", project_id=project_id)

        return projects_config

    def _parse_project_config(
        self, project_id: str, project_data: dict[str, Any], global_config: GlobalConfig
    ) -> ProjectConfig:
        """Parse individual project configuration.

        Args:
            project_id: Project identifier
            project_data: Project configuration data
            global_config: Global configuration

        Returns:
            ProjectConfig: Parsed project configuration

        Raises:
            ValueError: If ``project_id`` is rejected by
                ``_is_valid_project_id``
        """
        # Validate project ID
        if not self._is_valid_project_id(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, digits, underscores, and hyphens."
            )

        # Extract basic project information
        display_name = project_data.get("display_name", project_id)
        description = project_data.get("description")

        # Parse project-specific sources with automatic field injection
        sources_data = project_data.get("sources", {})
        enhanced_sources_data = self._inject_source_metadata(sources_data)
        sources_config = SourcesConfig(**enhanced_sources_data)

        # Extract configuration overrides
        overrides = project_data.get("overrides", {})

        # Merge project-specific overrides with global config
        merged_overrides = self._merge_configs(global_config, overrides)

        return ProjectConfig(
            project_id=project_id,
            display_name=display_name,
            description=description,
            sources=sources_config,
            overrides=merged_overrides,
        )

    def _inject_source_metadata(self, sources_data: dict[str, Any]) -> dict[str, Any]:
        """Inject source_type and source fields into source configurations.

        For every ``{source_type: {source_name: config}}`` entry whose
        ``config`` is a dict, a shallow copy is made and its ``source_type``
        and ``source`` keys are set (overwriting any existing values).
        Non-dict values at either level are passed through unchanged.

        Args:
            sources_data: Raw sources configuration data

        Returns:
            Dict[str, Any]: Enhanced sources data with injected metadata
        """
        enhanced_data = {}

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                enhanced_data[source_type] = source_configs
                continue

            enhanced_source_configs = {}
            for source_name, source_config in source_configs.items():
                if isinstance(source_config, dict):
                    # Create a copy to avoid modifying the original
                    enhanced_config = source_config.copy()

                    # Always inject source_type and source fields
                    enhanced_config["source_type"] = source_type
                    enhanced_config["source"] = source_name

                    enhanced_source_configs[source_name] = enhanced_config
                else:
                    enhanced_source_configs[source_name] = source_config

            enhanced_data[source_type] = enhanced_source_configs

        return enhanced_data

    def _is_valid_project_id(self, project_id: str) -> bool:
        """Validate project ID format.

        A valid ID is a string that starts with an ASCII letter and is
        followed only by ASCII letters, digits, underscores, or hyphens.

        Args:
            project_id: Project identifier to validate

        Returns:
            bool: True if valid, False otherwise (including for non-string
            values such as integer YAML keys)
        """
        # Non-string keys (e.g. a bare ``123:`` in YAML) would make the regex
        # call raise TypeError; report them as invalid instead.
        if not isinstance(project_id, str):
            return False
        # fullmatch, unlike match with a trailing ``$``, rejects "name\n".
        return self._PROJECT_ID_PATTERN.fullmatch(project_id) is not None

    def _merge_configs(
        self, global_config: GlobalConfig, project_overrides: dict[str, Any]
    ) -> dict[str, Any]:
        """Merge project-specific overrides with global configuration.

        Args:
            global_config: Global configuration
            project_overrides: Project-specific overrides

        Returns:
            Dict[str, Any]: Merged configuration
        """
        # Convert global config to dict
        global_dict = global_config.to_dict()

        # Deep merge project overrides
        merged = self._deep_merge_dicts(global_dict, project_overrides)

        return merged

    def _deep_merge_dicts(
        self, base: dict[str, Any], override: dict[str, Any]
    ) -> dict[str, Any]:
        """Deep merge two dictionaries.

        Keys present in both whose values are both dicts are merged
        recursively; in every other case the override value wins. The top
        level of ``base`` is shallow-copied, so nested values that are not
        merged are shared with the inputs rather than copied.

        Args:
            base: Base dictionary
            override: Override dictionary

        Returns:
            Dict[str, Any]: Merged dictionary
        """
        result = base.copy()

        for key, value in override.items():
            if (
                key in result
                and isinstance(result[key], dict)
                and isinstance(value, dict)
            ):
                result[key] = self._deep_merge_dicts(result[key], value)
            else:
                result[key] = value

        return result