Coverage for src/qdrant_loader/config/validator.py: 100%

114 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Configuration validation for multi-project support. 

2 

3This module provides validation functionality for multi-project 

4configurations, ensuring data integrity and catching common configuration errors. 

5""" 

6 

7import re 

8from typing import Any 

9 

10from ..utils.logging import LoggingConfig 

11 

12logger = LoggingConfig.get_logger(__name__) 

13 

14 

class ConfigValidator:
    """Validates raw configuration data for multi-project support.

    The validator performs structural checks only (types, required keys,
    identifier formats, duplicate collection names). Every failure is
    reported by raising ``ValueError`` with a message naming the offending
    section or project.
    """

    # Shared identifier grammar for project IDs, source names and collection
    # names: a letter followed by letters, digits, underscores or hyphens.
    # It is applied with ``fullmatch`` rather than ``match`` + ``$`` because
    # ``$`` also matches just before a trailing newline, which would let a
    # value such as "docs\n" slip through.
    _IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that are accepted but trigger a warning (compared
    # case-insensitively).
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    def __init__(self):
        """Initialize the validator (it keeps no per-instance state)."""
        pass

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Checks that the configuration is a dictionary with a ``projects``
        section, validates that section, and validates the ``global``
        section when one is present.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        logger.debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        if "projects" not in config_data:
            raise ValueError("Configuration must contain 'projects' section")

        self._validate_projects_section(config_data["projects"])

        # The global section is optional.
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        logger.debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project configuration is validated in turn, and
        collection names are required to be unique across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        project_ids = set()
        collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)

            # Defensive check: a plain dict cannot yield the same key twice.
            if project_id in project_ids:
                raise ValueError(f"Duplicate project ID: '{project_id}'")
            project_ids.add(project_id)

            self._validate_project_config(project_id, project_config)

            # project_config is known to be a dict here, and any
            # collection_name has already been validated as a string.
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required fields
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            overrides = project_config["overrides"]
            if not isinstance(overrides, dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected shape is ``{source_type: {source_name: {...}}}``. An
        empty sources section is accepted; an empty source type is not.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes.
        # In production, users would typically have at least one source configured.
        if not sources_data:
            logger.debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected
                # by the parser so we don't need to validate their presence here

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Only basic structure is checked here; per the original design note,
        field-level validation is left to the GlobalConfig pydantic model.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        # Validate qdrant section if present
        if "qdrant" in global_data:
            qdrant_config = global_data["qdrant"]
            if not isinstance(qdrant_config, dict):
                raise ValueError("'global.qdrant' must be a dictionary")

            if "collection_name" in qdrant_config:
                collection_name = qdrant_config["collection_name"]
                if not isinstance(collection_name, str) or not collection_name.strip():
                    raise ValueError(
                        "'global.qdrant.collection_name' must be a non-empty string"
                    )
                self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs (``default``, ``global``, ``admin``, ``system``) are
        accepted but logged as a warning.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # fullmatch: the whole ID must conform, including any trailing newline.
        if not self._IDENTIFIER_PATTERN.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            logger.warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # fullmatch: the whole name must conform, including any trailing newline.
        if not self._IDENTIFIER_PATTERN.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Only checks that the configuration is a non-empty dictionary; per the
        original design note, source-specific validation is left to the
        individual source config classes.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Callers are expected to have checked that ``collection_name`` is a
        non-empty string before calling this method.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Length is checked first so over-long names get the specific message.
        if len(collection_name) > 255:
            raise ValueError(
                f"Collection name '{collection_name}' is too long (max 255 characters)"
            )

        # fullmatch: the whole name must conform, including any trailing newline.
        if not self._IDENTIFIER_PATTERN.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

312 "Collection names must start with a letter and contain only " 

313 "letters, numbers, underscores, and hyphens." 

314 )