Coverage for src/qdrant_loader/config/validator.py: 100%

1"""Configuration validation for multi-project support.

3This module provides validation functionality for multi-project

4configurations, ensuring data integrity and catching common configuration errors.

5"""

7import re

8from typing import Any

10from ..utils.logging import LoggingConfig

def _get_logger():
    """Return the logger for this module.

    The logger is looked up through ``LoggingConfig`` on every call rather
    than being bound once at import time.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger

class ConfigValidator:
    """Validates configuration data for multi-project support.

    The validator performs structural checks only (container types, required
    keys, identifier formats, duplicate detection). Every failed check raises
    ``ValueError`` with a message naming the offending element.
    """

    # Shared grammar for project IDs, source names and collection names:
    # a leading ASCII letter followed by letters, digits, "_" or "-".
    # It is always applied with ``fullmatch``; a ``^...$`` pattern used with
    # ``re.match`` would also accept a value ending in a newline, because
    # ``$`` matches just before a trailing "\n".
    _IDENTIFIER_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that are accepted but trigger a warning (compared
    # case-insensitively).
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    # Upper bound on collection name length enforced by this validator.
    _MAX_COLLECTION_NAME_LENGTH = 255

    def __init__(self):
        """Initialize the validator."""

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        _get_logger().debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        # The 'projects' section is mandatory.
        if "projects" not in config_data:
            raise ValueError("Configuration must contain 'projects' section")

        self._validate_projects_section(config_data["projects"])

        # The 'global' section is optional and only checked when present.
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        _get_logger().debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project configuration is validated, and
        collection names are checked for uniqueness across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        project_ids = set()
        collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)

            if project_id in project_ids:
                raise ValueError(f"Duplicate project ID: '{project_id}'")
            project_ids.add(project_id)

            self._validate_project_config(project_id, project_config)

            # project_config is known to be a dict at this point, and any
            # collection_name in it has already been format-checked.
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required fields
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            overrides = project_config["overrides"]
            if not isinstance(overrides, dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected shape is ``{source_type: {source_name: {...}}}``. Only
        the container types and non-emptiness of each source type are
        checked here.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes.
        # In production, users would typically have at least one source configured.
        if not sources_data:
            _get_logger().debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected
                # by the parser, so their presence is not validated here.

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        # The actual validation of global config fields will be handled by the
        # GlobalConfig pydantic model, so only basic structure is checked here.
        if "qdrant" in global_data:
            qdrant_config = global_data["qdrant"]
            if not isinstance(qdrant_config, dict):
                raise ValueError("'global.qdrant' must be a dictionary")

            if "collection_name" in qdrant_config:
                collection_name = qdrant_config["collection_name"]
                if not isinstance(collection_name, str) or not collection_name.strip():
                    raise ValueError(
                        "'global.qdrant.collection_name' must be a non-empty string"
                    )
                self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs are not rejected; they only produce a warning log entry.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            _get_logger().warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        # Basic validation - specific source validation will be handled
        # by the individual source config classes.
        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is not a string, is longer than
                255 characters, or does not match the identifier format
        """
        # Mirror the type guard of the other name validators so a direct call
        # with a non-string fails with ValueError instead of a TypeError
        # from len().
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long "
                f"(max {self._MAX_COLLECTION_NAME_LENGTH} characters)"
            )

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

Coverage for src/qdrant_loader/config/validator.py: 100%

115 statements