Coverage for src / qdrant_loader / config / validator.py: 99%

120 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 04:48 +0000

1"""Configuration validation for multi-project support. 

2 

3This module provides validation functionality for multi-project 

4configurations, ensuring data integrity and catching common configuration errors. 

5""" 

6 

7import re 

8from typing import Any 

9 

10from ..utils.logging import LoggingConfig 

11 

12 

def _get_logger():
    """Return the logger for this module.

    The logger is fetched through ``LoggingConfig.get_logger`` every time
    this function is called rather than being stored in a module-level
    variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

15 

16 

class ConfigValidator:
    """Validates configuration data for multi-project support.

    Only structural checks are performed here (container types, required
    keys, identifier formats). Every failure is reported as ``ValueError``.
    """

    # Shared grammar for project IDs, source names and collection names:
    # a leading ASCII letter followed by letters, digits, '_' or '-'.
    # It is applied with ``fullmatch`` because a ``$`` anchor also matches
    # just before a trailing newline, which would let "name\n" through.
    _IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Upper bound enforced on collection names by this validator.
    _MAX_COLLECTION_NAME_LENGTH = 255

    # Project IDs that only trigger a warning ('default' is allowed for
    # the simplified config format).
    _RESERVED_PROJECT_IDS = frozenset({"global", "admin", "system"})

    def __init__(self):
        """Initialize the validator."""

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Supports two formats:
        1. Standard: config with 'projects' section
        2. Simplified: config with top-level 'sources' section
           (auto-wrapped into a default project)

        When both 'projects' and top-level 'sources' are present, only the
        'projects' section is validated by this method.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        _get_logger().debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        # Accept either 'projects' or top-level 'sources' (simplified format)
        has_projects = "projects" in config_data
        has_sources = "sources" in config_data

        if not (has_projects or has_sources):
            raise ValueError(
                "Configuration must contain either 'projects' section "
                "or top-level 'sources' section"
            )

        if has_projects:
            self._validate_projects_section(config_data["projects"])
        else:
            # Simplified format: validate top-level sources
            self._validate_sources_section(config_data["sources"])

        # The global section is optional
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        _get_logger().debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project configuration is validated, and
        collection names are required to be unique across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        project_ids = set()
        collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)

            # Keys of a plain dict are already unique, so this is only a
            # defensive guard.
            if project_id in project_ids:
                raise ValueError(f"Duplicate project ID: '{project_id}'")
            project_ids.add(project_id)

            # Guarantees project_config is a dict before it is inspected below
            self._validate_project_config(project_id, project_config)

            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        'display_name' is required; 'description', 'collection_name',
        'sources' and 'overrides' are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required field
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            if not isinstance(project_config["overrides"], dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected shape is ``{source_type: {source_name: {...}}}``. An
        empty sources section is accepted; an empty source type is not.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes
        # In production, users would typically have at least one source configured
        if not sources_data:
            _get_logger().debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected by the parser
                # so we don't need to validate their presence here

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Only basic structure is checked here; field-level validation is
        left to the GlobalConfig pydantic model.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        if "qdrant" not in global_data:
            return

        qdrant_config = global_data["qdrant"]
        if not isinstance(qdrant_config, dict):
            raise ValueError("'global.qdrant' must be a dictionary")

        if "collection_name" in qdrant_config:
            collection_name = qdrant_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    "'global.qdrant.collection_name' must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs are not rejected; they only produce a warning.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # Project IDs must be valid identifiers (alphanumeric + underscores + hyphens)
        if self._IDENTIFIER_PATTERN.fullmatch(project_id) is None:
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        # Reserved check is case-insensitive
        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            _get_logger().warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Not called by the other methods of this class.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # Source names should be valid identifiers
        if self._IDENTIFIER_PATTERN.fullmatch(source_name) is None:
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Not called by the other methods of this class. Source-specific
        validation is left to the individual source config classes.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Same type guard as the project ID / source name validators, so a
        # non-string is reported as ValueError instead of failing in len().
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long (max 255 characters)"
            )

        # Collection names should be valid identifiers
        if self._IDENTIFIER_PATTERN.fullmatch(collection_name) is None:
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )