Coverage for src/qdrant_loader/config/validator.py: 66%

119 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Configuration validation for multi-project support. 

2 

3This module provides validation functionality for both legacy and multi-project 

4configurations, ensuring data integrity and catching common configuration errors. 

5""" 

6 

7import re 

8from typing import Any, Dict, List, Set 

9 

10from ..utils.logging import LoggingConfig 

11 

12logger = LoggingConfig.get_logger(__name__) 

13 

14 

class ConfigValidator:
    """Validates raw configuration data for multi-project support.

    The validator performs structural checks only: section presence, container
    types, and the format of project IDs, source names and collection names.
    Every failed check is reported by raising ``ValueError``.
    """

    # Shared identifier grammar for project IDs, source names and collection
    # names: an ASCII letter followed by letters, digits, underscores or
    # hyphens. It is applied with ``fullmatch`` so the *entire* string must
    # conform; a ``^...$`` pattern with ``re.match`` would also accept a value
    # ending in a newline, because ``$`` matches just before a trailing "\n".
    _IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Upper bound enforced on collection name length.
    _MAX_COLLECTION_NAME_LENGTH = 255

    # Project IDs that are accepted but trigger a warning.
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    def __init__(self):
        """Initialize the validator (it holds no per-instance state)."""
        pass

    def validate_structure(self, config_data: Dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        logger.debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        has_sources = "sources" in config_data
        has_projects = "projects" in config_data

        if not has_sources and not has_projects:
            raise ValueError(
                "Configuration must contain either 'sources' (legacy) or 'projects' section"
            )

        if has_projects:
            # When 'projects' is present, a top-level 'sources' section is
            # not validated here.
            self._validate_projects_section(config_data["projects"])
        else:
            # Reaching this branch means 'sources' is present (legacy layout).
            self._validate_sources_section(config_data["sources"])

        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        logger.debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Checks every project ID and project configuration, and rejects two
        projects that declare the same ``collection_name``.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        # No duplicate-ID check is needed: the keys of a dict are unique.
        seen_collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)
            self._validate_project_config(project_id, project_config)

            # project_config is known to be a dict at this point, and any
            # collection_name in it has been validated as a string.
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in seen_collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                seen_collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required field
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            if not isinstance(project_config["overrides"], dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        An empty sources section is accepted (and logged). Otherwise every
        source type must map to a non-empty dictionary whose values are
        dictionaries. The stricter ``_validate_source_name`` and
        ``_validate_source_config`` helpers are not applied by this method.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes
        # In production, users would typically have at least one source configured
        if not sources_data:
            logger.debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected by the parser
                # so we don't need to validate their presence here

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Only basic structure is checked here; per the original design note,
        field-level validation is left to the GlobalConfig pydantic model.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        if "qdrant" not in global_data:
            return

        qdrant_config = global_data["qdrant"]
        if not isinstance(qdrant_config, dict):
            raise ValueError("'global.qdrant' must be a dictionary")

        if "collection_name" in qdrant_config:
            collection_name = qdrant_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    "'global.qdrant.collection_name' must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs are not rejected; they only produce a warning.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        if not self._IDENTIFIER_PATTERN.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        # The reserved-ID comparison is case-insensitive.
        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            logger.warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        if not self._IDENTIFIER_PATTERN.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Only checks that the configuration is a non-empty dictionary; per the
        original design note, source-specific validation is left to the
        individual source config classes.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Guard the type first, like the other name validators, so a
        # non-string value is reported as ValueError rather than TypeError.
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long "
                f"(max {self._MAX_COLLECTION_NAME_LENGTH} characters)"
            )

        if not self._IDENTIFIER_PATTERN.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )