Coverage for src / qdrant_loader / config / validator.py: 99%
120 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
1"""Configuration validation for multi-project support.
3This module provides validation functionality for multi-project
4configurations, ensuring data integrity and catching common configuration errors.
5"""
7import re
8from typing import Any
10from ..utils.logging import LoggingConfig
def _get_logger():
    """Return the logger for this module, obtained through ``LoggingConfig``.

    The logger is looked up on each call rather than once at import time.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger
class ConfigValidator:
    """Validates configuration data for multi-project support.

    Only structural checks are performed here (container types, required
    keys, identifier formats, duplicate collection names). Every failure is
    reported by raising ``ValueError`` with a message naming the offending
    section or value.
    """

    # Shared shape for project IDs, source names and collection names: a
    # leading ASCII letter followed by letters, digits, underscores or hyphens.
    # It is applied with ``fullmatch`` so the *entire* value must conform;
    # ``re.match`` with a ``$`` anchor would also accept a value that ends in
    # a newline (e.g. ``"name\n"``).
    _IDENTIFIER_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that only trigger a warning ('default' is intentionally
    # absent because the simplified config format uses it).
    _RESERVED_PROJECT_IDS = frozenset({"global", "admin", "system"})

    # Upper bound enforced on collection name length.
    _MAX_COLLECTION_NAME_LENGTH = 255

    def __init__(self):
        """Initialize the validator."""

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Supports two formats:
        1. Standard: config with 'projects' section
        2. Simplified: config with top-level 'sources' section
           (auto-wrapped into a default project)

        When both 'projects' and top-level 'sources' are present, only the
        'projects' section is validated; the top-level 'sources' are not
        inspected.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        _get_logger().debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        # Accept either 'projects' or top-level 'sources' (simplified format)
        has_projects = "projects" in config_data
        has_sources = "sources" in config_data

        if not has_projects and not has_sources:
            raise ValueError(
                "Configuration must contain either 'projects' section "
                "or top-level 'sources' section"
            )

        if has_projects:
            self._validate_projects_section(config_data["projects"])
        else:
            # Simplified format: validate top-level sources
            self._validate_sources_section(config_data["sources"])

        # Validate global section if present
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        _get_logger().debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each key is checked as a project ID, each value as a project
        configuration, and explicit ``collection_name`` values must be unique
        across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        # No duplicate-ID check is needed: project IDs are the keys of a
        # single dict and are therefore already unique.
        seen_collections = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)
            self._validate_project_config(project_id, project_config)

            # Check for duplicate collection names. The value has already been
            # verified to be a non-empty string by _validate_project_config.
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in seen_collections:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                seen_collections.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Validate required fields
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Validate optional fields
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        # Validate sources section if present
        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        # Validate overrides section if present
        if "overrides" in project_config:
            if not isinstance(project_config["overrides"], dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected shape is ``{source_type: {source_name: {...}}}``. An
        empty top-level mapping is accepted (and logged at debug level), but
        an empty mapping for an individual source type is rejected.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes
        # In production, users would typically have at least one source configured
        if not sources_data:
            _get_logger().debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected
                # by the parser so we don't need to validate their presence here

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Only basic structure is checked here; per the original design note,
        field-level validation is left to the GlobalConfig pydantic model.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        # Validate qdrant section if present
        if "qdrant" not in global_data:
            return

        qdrant_config = global_data["qdrant"]
        if not isinstance(qdrant_config, dict):
            raise ValueError("'global.qdrant' must be a dictionary")

        if "collection_name" in qdrant_config:
            collection_name = qdrant_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    "'global.qdrant.collection_name' must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs ('global', 'admin', 'system', compared case-insensitively)
        are not rejected; they only produce a warning log entry.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # Project IDs must be valid identifiers (alphanumeric + underscores + hyphens)
        if not self._IDENTIFIER_RE.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        # Check for reserved project IDs ('default' is allowed for simplified config)
        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            _get_logger().warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Not called from the other validation methods of this class.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # Source names should be valid identifiers
        if not self._IDENTIFIER_RE.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Not called from the other validation methods of this class.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        # Basic validation - specific source validation will be handled
        # by the individual source config classes
        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        The name must be a string of at most 255 characters that starts with
        a letter and contains only letters, numbers, underscores and hyphens.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Guard the type first so a non-string is reported as ValueError, in
        # line with the project ID and source name validators, instead of
        # surfacing as a TypeError from len() or the regex engine.
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long "
                f"(max {self._MAX_COLLECTION_NAME_LENGTH} characters)"
            )

        # Collection names should be valid identifiers
        if not self._IDENTIFIER_RE.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )