Coverage for src/qdrant_loader/config/parser.py: 88%

84 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Multi-project configuration parser. 

2 

3This module provides parsing functionality for multi-project configurations. 

4""" 

5 

6import re 

7from typing import Any, Dict, List 

8 

9from pydantic import ValidationError 

10 

11from .global_config import GlobalConfig 

12from .models import ParsedConfig, ProjectConfig, ProjectsConfig 

13from .sources import SourcesConfig 

14from .validator import ConfigValidator 

15from ..utils.logging import LoggingConfig 

16 

17logger = LoggingConfig.get_logger(__name__) 

18 

19 

class MultiProjectConfigParser:
    """Parser for multi-project configurations.

    Turns the raw configuration mapping (as loaded from YAML) into a
    ``ParsedConfig`` made of one global configuration and one
    ``ProjectConfig`` per entry of the ``projects`` section.
    """

    # A project ID starts with an ASCII letter, followed by any number of
    # ASCII letters, digits, underscores or hyphens. Compiled once and used
    # with ``fullmatch`` so that nothing (not even a trailing newline) may
    # follow the last allowed character.
    _PROJECT_ID_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    def __init__(self, validator: "ConfigValidator"):
        """Initialize the parser with a validator.

        Args:
            validator: Configuration validator instance
        """
        self.validator = validator

    def parse(
        self, config_data: Dict[str, Any], skip_validation: bool = False
    ) -> "ParsedConfig":
        """Parse configuration with multi-project support.

        Args:
            config_data: Raw configuration data from YAML
            skip_validation: Whether to skip validation during parsing
                (forwarded to the global configuration model)

        Returns:
            ParsedConfig: Parsed configuration with project information

        Raises:
            ValueError: If the legacy single-project format is detected, or
                if a project ID or project entry is malformed
            ValidationError: If the global configuration is invalid

        Errors raised by ``validator.validate_structure`` propagate unchanged.
        """
        logger.debug("Starting configuration parsing")

        # Check for legacy format and provide clear error message
        if self._is_legacy_config(config_data):
            self._raise_legacy_format_error()

        # Validate configuration structure
        self.validator.validate_structure(config_data)

        # Parse global configuration (a null ``global:`` section counts as empty)
        global_config = self._parse_global_config(
            self._get_mapping(config_data, "global"), skip_validation
        )

        # Parse projects
        projects_config = self._parse_projects(config_data, global_config)

        logger.debug(
            "Configuration parsing completed",
            project_count=len(projects_config.projects),
        )

        return ParsedConfig(
            global_config=global_config,
            projects_config=projects_config,
        )

    @staticmethod
    def _get_mapping(data: Dict[str, Any], key: str) -> Dict[str, Any]:
        """Return ``data[key]``, treating a missing or null entry as empty.

        YAML such as ``projects:`` with no value loads as ``None``; a plain
        ``data.get(key, {})`` would hand that ``None`` to code expecting a
        mapping.

        Args:
            data: Mapping to read from
            key: Section name to look up

        Returns:
            Dict[str, Any]: The stored value, or a new empty dict when the
            key is absent or its value is ``None``
        """
        value = data.get(key)
        return {} if value is None else value

    def _parse_global_config(
        self, global_data: Dict[str, Any], skip_validation: bool = False
    ) -> "GlobalConfig":
        """Parse global configuration section.

        Args:
            global_data: Global configuration data
            skip_validation: Whether to skip validation during parsing

        Returns:
            GlobalConfig: Parsed global configuration

        Raises:
            ValidationError: Logged and re-raised when the model rejects the data
        """
        try:
            return GlobalConfig(**global_data, skip_validation=skip_validation)
        except ValidationError as e:
            logger.error("Failed to parse global configuration", error=str(e))
            raise

    def _is_legacy_config(self, config_data: Dict[str, Any]) -> bool:
        """Determine if configuration uses legacy single-project format.

        The legacy format has a root-level ``sources`` section and no
        ``projects`` section.

        Args:
            config_data: Raw configuration data

        Returns:
            bool: True if legacy format, False if multi-project format
        """
        return "sources" in config_data and "projects" not in config_data

    def _raise_legacy_format_error(self) -> None:
        """Raise a helpful error message for legacy configuration format.

        Raises:
            ValueError: Always; the message contains a migration guide
        """
        error_message = """
Legacy configuration format detected. Please update your config.yaml file to use the new multi-project format.

MIGRATION GUIDE:
================

OLD FORMAT (legacy):
```yaml
global:
  # ... global settings ...

sources:
  git:
    my-repo:
      # ... git config ...
  confluence:
    my-space:
      # ... confluence config ...
```

NEW FORMAT (multi-project):
```yaml
global:
  # ... global settings ...

projects:
  default: # or any project name you prefer
    display_name: "My Project"
    description: "Project description"
    collection_name: "my_collection" # optional, defaults to global collection + project name
    sources:
      git:
        my-repo:
          # ... git config ...
      confluence:
        my-space:
          # ... confluence config ...
    overrides: {} # optional project-specific config overrides
```

BENEFITS OF NEW FORMAT:
- Support for multiple projects in a single configuration
- Better organization and isolation of different data sources
- Project-specific collection names and configuration overrides
- Clearer structure and easier maintenance

To migrate your configuration:
1. Move your 'sources' section under 'projects.default.sources'
2. Add required project fields: display_name, description
3. Optionally specify a custom collection_name
4. Add any project-specific overrides if needed

For more information, see the documentation on multi-project configuration.
"""
        raise ValueError(error_message.strip())

    def _parse_projects(
        self, config_data: Dict[str, Any], global_config: "GlobalConfig"
    ) -> "ProjectsConfig":
        """Parse project configurations.

        Args:
            config_data: Raw configuration data
            global_config: Parsed global configuration

        Returns:
            ProjectsConfig: Parsed projects configuration (empty when the
            ``projects`` section is missing or null)
        """
        projects_config = ProjectsConfig()

        # Handle multi-project format
        projects_data = self._get_mapping(config_data, "projects")
        for project_id, project_data in projects_data.items():
            project_config = self._parse_project_config(
                project_id, project_data, global_config
            )
            projects_config.add_project(project_config)
            logger.debug("Parsed project configuration", project_id=project_id)

        return projects_config

    def _parse_project_config(
        self,
        project_id: str,
        project_data: Dict[str, Any],
        global_config: "GlobalConfig",
    ) -> "ProjectConfig":
        """Parse individual project configuration.

        Args:
            project_id: Project identifier
            project_data: Project configuration data
            global_config: Global configuration

        Returns:
            ProjectConfig: Parsed project configuration

        Raises:
            ValueError: If the project ID is invalid or the project entry is
                not a mapping
        """
        # Validate project ID
        if not self._is_valid_project_id(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, digits, underscores, and hyphens."
            )

        # Fail with a clear message instead of an AttributeError on ``.get``
        # when the project entry is null or a scalar.
        if not isinstance(project_data, dict):
            raise ValueError(
                f"Invalid configuration for project '{project_id}': "
                f"expected a mapping, got {type(project_data).__name__}."
            )

        # Extract basic project information
        display_name = project_data.get("display_name", project_id)
        description = project_data.get("description")
        # NOTE(review): a project-level ``collection_name`` key is not forwarded
        # to ProjectConfig here, although the legacy-format migration text
        # advertises it -- confirm whether ProjectConfig should receive it.

        # Parse project-specific sources with automatic field injection
        sources_data = self._get_mapping(project_data, "sources")
        enhanced_sources_data = self._inject_source_metadata(sources_data)
        sources_config = SourcesConfig(**enhanced_sources_data)

        # Merge project-specific overrides on top of the global config
        overrides = self._get_mapping(project_data, "overrides")
        merged_overrides = self._merge_configs(global_config, overrides)

        return ProjectConfig(
            project_id=project_id,
            display_name=display_name,
            description=description,
            sources=sources_config,
            overrides=merged_overrides,
        )

    def _inject_source_metadata(self, sources_data: Dict[str, Any]) -> Dict[str, Any]:
        """Inject source_type and source fields into source configurations.

        For every ``{source_type: {source_name: config}}`` entry whose
        ``config`` is a dict, a shallow copy of ``config`` is produced with
        ``"source_type"`` and ``"source"`` set (overwriting any existing
        values for those two keys). Non-dict values at either level are
        passed through untouched.

        Args:
            sources_data: Raw sources configuration data

        Returns:
            Dict[str, Any]: Enhanced sources data with injected metadata
        """
        enhanced_data: Dict[str, Any] = {}

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                enhanced_data[source_type] = source_configs
                continue

            enhanced_data[source_type] = {
                # Build a new dict so the caller's config is not modified
                source_name: (
                    {**source_config, "source_type": source_type, "source": source_name}
                    if isinstance(source_config, dict)
                    else source_config
                )
                for source_name, source_config in source_configs.items()
            }

        return enhanced_data

    def _is_valid_project_id(self, project_id: str) -> bool:
        """Validate project ID format.

        A valid ID is a string that starts with an ASCII letter and
        continues with ASCII letters, digits, underscores or hyphens.

        Args:
            project_id: Project identifier to validate

        Returns:
            bool: True if valid, False otherwise (including for non-string
            values such as integer YAML keys)
        """
        if not isinstance(project_id, str):
            return False
        # ``fullmatch`` rather than ``match`` + ``$``: ``$`` also matches just
        # before a trailing newline, which would let "name\n" through.
        return self._PROJECT_ID_PATTERN.fullmatch(project_id) is not None

    def _merge_configs(
        self, global_config: "GlobalConfig", project_overrides: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Merge project-specific overrides with global configuration.

        Args:
            global_config: Global configuration
            project_overrides: Project-specific overrides

        Returns:
            Dict[str, Any]: Global configuration (via ``to_dict()``) with the
            overrides deep-merged on top
        """
        return self._deep_merge_dicts(global_config.to_dict(), project_overrides)

    def _deep_merge_dicts(
        self, base: Dict[str, Any], override: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Deep merge two dictionaries.

        Keys present in both inputs are merged recursively when both values
        are dicts; otherwise the value from ``override`` wins. Neither input
        is modified, but values that are not merged are shared by reference
        with the inputs.

        Args:
            base: Base dictionary
            override: Override dictionary

        Returns:
            Dict[str, Any]: Merged dictionary
        """
        result = base.copy()

        for key, value in override.items():
            existing = result.get(key)
            if isinstance(existing, dict) and isinstance(value, dict):
                result[key] = self._deep_merge_dicts(existing, value)
            else:
                result[key] = value

        return result