Coverage for src/qdrant_loader/config/validator.py: 66%
119 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""Configuration validation for multi-project support.
3This module provides validation functionality for both legacy and multi-project
4configurations, ensuring data integrity and catching common configuration errors.
5"""
7import re
8from typing import Any, Dict, List, Set
10from ..utils.logging import LoggingConfig
# Module-level logger named after this module; obtained through the project's
# LoggingConfig helper (imported above) rather than the stdlib logging module.
logger = LoggingConfig.get_logger(__name__)
class ConfigValidator:
    """Validates configuration data for multi-project support.

    The validator performs structural checks only (section types, required
    keys, identifier formats). Every failure is reported by raising
    ``ValueError`` with a message that names the offending section or key.
    Diagnostic output goes to the module-level ``logger``.
    """

    # Identifier grammar shared by project IDs, source names and collection
    # names: one ASCII letter followed by letters, digits, '_' or '-'.
    # It is applied with fullmatch() on purpose: a "^...$" pattern used with
    # re.match() also accepts a value that ends in a newline, because "$"
    # matches just before a trailing "\n".
    _IDENTIFIER_PATTERN = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that are still accepted but produce a warning.
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    # Upper bound on collection name length enforced by this validator.
    _MAX_COLLECTION_NAME_LENGTH = 255

    def __init__(self):
        """Initialize the validator (it keeps no per-instance state)."""

    def validate_structure(self, config_data: Dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        logger.debug("Validating configuration structure")

        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        has_sources = "sources" in config_data
        has_projects = "projects" in config_data

        # At least one of the two layouts must be present.
        if not has_sources and not has_projects:
            raise ValueError(
                "Configuration must contain either 'sources' (legacy) or 'projects' section"
            )

        if has_projects:
            self._validate_projects_section(config_data["projects"])
        elif has_sources:
            # Top-level sources are only validated in the legacy layout,
            # i.e. when there is no 'projects' section at all.
            self._validate_sources_section(config_data["sources"])

        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        logger.debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project configuration is validated, and
        collection names are required to be unique across projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        # No duplicate-ID check is needed: project IDs are the keys of a
        # dict, so they are unique by construction.
        seen_collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)

            # Guarantees project_config is a dict before it is inspected below.
            self._validate_project_config(project_id, project_config)

            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in seen_collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                seen_collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required field.
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields.
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            if not isinstance(project_config["overrides"], dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected shape is ``{source_type: {source_name: {...}}}``. An
        empty section is accepted; an empty source type is not.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # An empty sources section is allowed (e.g. for testing); it simply
        # means nothing will be ingested.
        if not sources_data:
            logger.debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            # Only the container type of each source is checked here; the
            # source_type/source fields are injected later by the parser, so
            # their presence is not validated.
            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Only basic structure is checked; field-level validation is left to
        the GlobalConfig pydantic model.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        if "qdrant" not in global_data:
            return

        qdrant_config = global_data["qdrant"]
        if not isinstance(qdrant_config, dict):
            raise ValueError("'global.qdrant' must be a dictionary")

        if "collection_name" in qdrant_config:
            collection_name = qdrant_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    "'global.qdrant.collection_name' must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs (compared case-insensitively) are accepted but logged
        as a warning.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        if not self._IDENTIFIER_PATTERN.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            logger.warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        if not self._IDENTIFIER_PATTERN.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Only the container is checked (a non-empty dictionary); validation
        of source-specific fields is left to the individual source config
        classes.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        The rules enforced are this validator's own: at most 255 characters,
        starting with a letter, and containing only letters, numbers,
        underscores and hyphens.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Guard first so that a non-string value is reported as ValueError
        # (as documented) instead of leaking a TypeError from len()/regex.
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > self._MAX_COLLECTION_NAME_LENGTH:
            raise ValueError(
                f"Collection name '{collection_name}' is too long (max 255 characters)"
            )

        if not self._IDENTIFIER_PATTERN.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )