Coverage for src / qdrant_loader / config / parser.py: 100%

88 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 04:48 +0000

1"""Multi-project configuration parser. 

2 

3This module provides parsing functionality for multi-project configurations. 

4""" 

5 

6import re 

7from typing import Any 

8 

9from pydantic import ValidationError 

10 

11from ..utils.logging import LoggingConfig 

12from .global_config import GlobalConfig 

13from .models import ParsedConfig, ProjectConfig, ProjectsConfig 

14from .sources import SourcesConfig 

15from .validator import ConfigValidator 

16 

17 

def _get_logger():
    """Return the logger associated with this module.

    The logger is looked up through ``LoggingConfig`` on every call rather
    than being stored in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

20 

21 

class MultiProjectConfigParser:
    """Parser for multi-project configurations.

    Turns the raw configuration mapping into a ``ParsedConfig`` made of one
    ``GlobalConfig`` and a ``ProjectsConfig`` holding one ``ProjectConfig``
    per entry of the ``projects`` section.
    """

    # A project ID starts with an ASCII letter, followed by any number of
    # ASCII letters, digits, underscores or hyphens. Used with ``fullmatch``
    # so that nothing (not even a trailing newline) may follow the ID.
    _PROJECT_ID_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    def __init__(self, validator: ConfigValidator):
        """Initialize the parser with a validator.

        Args:
            validator: Configuration validator instance
        """
        self.validator = validator

    @staticmethod
    def _get_section(data: dict[str, Any], key: str) -> dict[str, Any]:
        """Return ``data[key]``, or an empty dict when missing or ``None``.

        A key written without a value in YAML (e.g. ``overrides:``) is loaded
        as ``None``; treating it like an absent key avoids attribute errors
        further down.

        Args:
            data: Mapping to read from
            key: Section name to look up

        Returns:
            The stored section, or a new empty dict
        """
        section = data.get(key)
        return {} if section is None else section

    def parse(
        self, config_data: dict[str, Any], skip_validation: bool = False
    ) -> ParsedConfig:
        """Parse configuration with multi-project support.

        Supports two formats:
        1. Standard: config with 'projects' section
        2. Simplified: config with top-level 'sources' (auto-wrapped into default project)

        Args:
            config_data: Raw configuration data from YAML
            skip_validation: Whether to skip validation during parsing

        Returns:
            ParsedConfig: Parsed configuration with project information

        Raises:
            ValidationError: If configuration is invalid
            ValueError: If a project ID has an invalid format
        """
        _get_logger().debug("Starting configuration parsing")

        # Auto-wrap simplified format: top-level 'sources' -> projects.default.
        # Non-dict input is passed through untouched so the validator sees it.
        if isinstance(config_data, dict):
            config_data = self._normalize_config(config_data)

        # Validate configuration structure
        self.validator.validate_structure(config_data)

        # Parse global configuration (an empty 'global:' key counts as absent)
        global_config = self._parse_global_config(
            self._get_section(config_data, "global"), skip_validation
        )

        # Parse projects
        projects_config = self._parse_projects(config_data, global_config)

        _get_logger().debug(
            "Configuration parsing completed",
            project_count=len(projects_config.projects),
        )

        return ParsedConfig(
            global_config=global_config,
            projects_config=projects_config,
        )

    def _normalize_config(self, config_data: dict[str, Any]) -> dict[str, Any]:
        """Normalize simplified config format to standard format.

        If config has top-level 'sources' but no 'projects', wrap sources
        into a default project automatically. When both keys are present a
        warning is logged and the input is returned unchanged.

        Args:
            config_data: Raw configuration data

        Returns:
            Normalized configuration data with 'projects' section; this is
            the input object itself unless wrapping took place, in which
            case a new top-level dict is returned and the input is not
            modified.
        """
        has_projects = "projects" in config_data
        has_sources = "sources" in config_data

        if has_sources and has_projects:
            _get_logger().warning(
                "Config has both 'projects' and top-level 'sources'; "
                "top-level 'sources' will be ignored"
            )

        if has_sources and not has_projects:
            _get_logger().debug(
                "Simplified config detected: wrapping top-level 'sources' "
                "into default project"
            )
            # Copy every other top-level key and move 'sources' under the
            # synthetic 'default' project.
            result = {k: v for k, v in config_data.items() if k != "sources"}
            result["projects"] = {
                "default": {
                    "display_name": "Default Project",
                    "sources": config_data["sources"],
                }
            }
            return result

        return config_data

    def _parse_global_config(
        self, global_data: dict[str, Any], skip_validation: bool = False
    ) -> GlobalConfig:
        """Parse global configuration section.

        Args:
            global_data: Global configuration data
            skip_validation: Whether to skip validation during parsing

        Returns:
            GlobalConfig: Parsed global configuration

        Raises:
            ValidationError: Logged and re-raised when ``GlobalConfig``
                rejects the data
        """
        try:
            return GlobalConfig(**global_data, skip_validation=skip_validation)
        except ValidationError as e:
            _get_logger().error("Failed to parse global configuration", error=str(e))
            raise

    def _parse_projects(
        self, config_data: dict[str, Any], global_config: GlobalConfig
    ) -> ProjectsConfig:
        """Parse project configurations.

        Args:
            config_data: Raw configuration data
            global_config: Parsed global configuration

        Returns:
            ProjectsConfig: Parsed projects configuration

        Raises:
            ValueError: If a project ID has an invalid format
        """
        projects_config = ProjectsConfig()

        # Handle multi-project format (a missing or empty section yields no projects)
        projects_data = self._get_section(config_data, "projects")
        for project_id, project_data in projects_data.items():
            project_config = self._parse_project_config(
                project_id, project_data, global_config
            )
            projects_config.add_project(project_config)
            _get_logger().debug("Parsed project configuration", project_id=project_id)

        return projects_config

    def _parse_project_config(
        self, project_id: str, project_data: dict[str, Any], global_config: GlobalConfig
    ) -> ProjectConfig:
        """Parse individual project configuration.

        Args:
            project_id: Project identifier
            project_data: Project configuration data
            global_config: Global configuration

        Returns:
            ProjectConfig: Parsed project configuration

        Raises:
            ValueError: If ``project_id`` has an invalid format
        """
        # Validate project ID
        if not self._is_valid_project_id(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, digits, underscores, and hyphens."
            )

        # Extract basic project information; the ID doubles as display name
        display_name = project_data.get("display_name", project_id)
        description = project_data.get("description")

        # Parse project-specific sources with automatic field injection
        sources_data = self._get_section(project_data, "sources")
        enhanced_sources_data = self._inject_source_metadata(sources_data)
        sources_config = SourcesConfig(**enhanced_sources_data)

        # Extract configuration overrides
        overrides = self._get_section(project_data, "overrides")

        # Merge project-specific overrides with global config
        merged_overrides = self._merge_configs(global_config, overrides)

        return ProjectConfig(
            project_id=project_id,
            display_name=display_name,
            description=description,
            sources=sources_config,
            overrides=merged_overrides,
        )

    def _inject_source_metadata(self, sources_data: dict[str, Any]) -> dict[str, Any]:
        """Inject source_type and source fields into source configurations.

        The input is expected to map source type -> source name -> settings.
        Every settings dict is shallow-copied and gets ``source_type`` and
        ``source`` set (overwriting existing values); anything that is not a
        dict at either level is passed through unchanged.

        Args:
            sources_data: Raw sources configuration data

        Returns:
            Dict[str, Any]: Enhanced sources data with injected metadata
        """
        enhanced_data: dict[str, Any] = {}

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                enhanced_data[source_type] = source_configs
                continue

            enhanced_data[source_type] = {
                # Build a new dict so the caller's settings are not modified.
                source_name: (
                    {**source_config, "source_type": source_type, "source": source_name}
                    if isinstance(source_config, dict)
                    else source_config
                )
                for source_name, source_config in source_configs.items()
            }

        return enhanced_data

    def _is_valid_project_id(self, project_id: str) -> bool:
        """Validate project ID format.

        A valid ID is a string that starts with an ASCII letter and continues
        with ASCII letters, digits, underscores or hyphens only.

        Args:
            project_id: Project identifier to validate

        Returns:
            bool: True if valid, False otherwise (including non-string input)
        """
        # YAML keys are not necessarily strings (e.g. ``123:``); reject them
        # instead of letting the regex engine raise TypeError.
        if not isinstance(project_id, str):
            return False
        # fullmatch: unlike ``match`` with ``$``, a trailing newline is rejected.
        return self._PROJECT_ID_PATTERN.fullmatch(project_id) is not None

    def _merge_configs(
        self, global_config: GlobalConfig, project_overrides: dict[str, Any]
    ) -> dict[str, Any]:
        """Merge project-specific overrides with global configuration.

        Args:
            global_config: Global configuration
            project_overrides: Project-specific overrides

        Returns:
            Dict[str, Any]: Merged configuration
        """
        # Convert global config to dict, then layer the overrides on top
        global_dict = global_config.to_dict()
        return self._deep_merge_dicts(global_dict, project_overrides)

    def _deep_merge_dicts(
        self, base: dict[str, Any], override: dict[str, Any]
    ) -> dict[str, Any]:
        """Deep merge two dictionaries.

        Keys whose values are dicts on both sides are merged recursively;
        for every other key the override value replaces the base value.
        Neither input is modified, but values that are not merged are shared
        by reference with the inputs (only a shallow copy is made).

        Args:
            base: Base dictionary
            override: Override dictionary

        Returns:
            Dict[str, Any]: Merged dictionary
        """
        result = base.copy()

        for key, value in override.items():
            existing = result.get(key)
            if isinstance(existing, dict) and isinstance(value, dict):
                result[key] = self._deep_merge_dicts(existing, value)
            else:
                result[key] = value

        return result

292 result[key] = value 

293 

294 return result