Coverage for src/qdrant_loader/config/validator.py: 100%

115 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-11 07:21 +0000

1"""Configuration validation for multi-project support. 

2 

3This module provides validation functionality for multi-project 

4configurations, ensuring data integrity and catching common configuration errors. 

5""" 

6 

7import re 

8from typing import Any 

9 

10from ..utils.logging import LoggingConfig 

11 

12 

def _get_logger():
    """Return the logger for this module.

    The logger is fetched from ``LoggingConfig`` on every call; nothing is
    cached at module level.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

15 

16 

class ConfigValidator:
    """Validates configuration data for multi-project support.

    All checks raise ``ValueError`` on the first problem found. The entry
    point is :meth:`validate_structure`; the underscore-prefixed methods
    validate one section or one identifier each.
    """

    # Grammar shared by project IDs, source names and collection names:
    # a letter followed by letters, digits, underscores or hyphens.
    # It is applied with ``fullmatch`` because ``re.match`` with a ``$``
    # anchor also accepts a single trailing newline (e.g. "name\n").
    _IDENTIFIER_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that only trigger a warning (compared case-insensitively).
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    # Upper bound enforced on collection name length.
    _MAX_COLLECTION_NAME_LENGTH = 255

    def __init__(self):
        """Initialize the validator."""
        pass

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        _get_logger().debug("Validating configuration structure")

        # The top level must be a mapping before any key lookups are made.
        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        # 'projects' is the only mandatory section.
        if "projects" not in config_data:
            raise ValueError("Configuration must contain 'projects' section")

        self._validate_projects_section(config_data["projects"])

        # 'global' is optional and only checked when present.
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        _get_logger().debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project configuration is validated, and
        collection names are required to be unique across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        project_ids = set()
        collection_names = set()

        for project_id, project_config in projects_data.items():
            # Validate project ID
            self._validate_project_id(project_id)

            if project_id in project_ids:
                raise ValueError(f"Duplicate project ID: '{project_id}'")
            project_ids.add(project_id)

            # Validate individual project configuration; this also
            # guarantees project_config is a dict for the lookup below.
            self._validate_project_config(project_id, project_config)

            # Check for duplicate collection names
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Validate required fields
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Validate optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        # Validate sources section if present
        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        # Validate overrides section if present (only its type is checked)
        if "overrides" in project_config:
            overrides = project_config["overrides"]
            if not isinstance(overrides, dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        Expects a mapping of source type -> mapping of source name ->
        source configuration dictionary.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes
        # In production, users would typically have at least one source configured
        if not sources_data:
            _get_logger().debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        # Validate each source type
        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            # Validate each source configuration
            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected by the parser
                # so we don't need to validate their presence here

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        # The actual validation of global config fields will be handled
        # by the GlobalConfig pydantic model, so we just do basic structure checks here

        # Validate qdrant section if present
        if "qdrant" in global_data:
            qdrant_config = global_data["qdrant"]
            if not isinstance(qdrant_config, dict):
                raise ValueError("'global.qdrant' must be a dictionary")

            if "collection_name" in qdrant_config:
                collection_name = qdrant_config["collection_name"]
                if not isinstance(collection_name, str) or not collection_name.strip():
                    raise ValueError(
                        "'global.qdrant.collection_name' must be a non-empty string"
                    )
                self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs are not rejected; they only produce a warning log.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # Project IDs must be valid identifiers (alphanumeric + underscores + hyphens).
        # fullmatch rejects a trailing newline that `^...$` with re.match lets through.
        if not self._IDENTIFIER_RE.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        # Check for reserved project IDs
        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            _get_logger().warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # Source names should be valid identifiers; fullmatch also rejects
        # a trailing newline.
        if not self._IDENTIFIER_RE.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        # Basic validation - specific source validation will be handled
        # by the individual source config classes
        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Guard the type here as the sibling validators do, so a direct call
        # with a non-string raises ValueError instead of TypeError from len().
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        # QDrant collection names have specific requirements
        # They should be valid identifiers and not too long
        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long "
                f"(max {self._MAX_COLLECTION_NAME_LENGTH} characters)"
            )

        # Collection names should be valid identifiers; fullmatch also
        # rejects a trailing newline.
        if not self._IDENTIFIER_RE.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )