Coverage for src/qdrant_loader/config/validator.py: 100%
115 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
1"""Configuration validation for multi-project support.
3This module provides validation functionality for multi-project
4configurations, ensuring data integrity and catching common configuration errors.
5"""
7import re
8from typing import Any
10from ..utils.logging import LoggingConfig
def _get_logger():
    """Return the logger registered under this module's name.

    The logger is looked up through ``LoggingConfig`` on every call rather
    than being stored in a module-level variable.
    """
    module_logger = LoggingConfig.get_logger(__name__)
    return module_logger
class ConfigValidator:
    """Validates configuration data for multi-project support.

    Every check raises ``ValueError`` with a descriptive message on the first
    problem found. Validation covers container types, required keys,
    identifier formats and duplicate collection names across projects.
    """

    # Shared identifier grammar for project IDs, source names and collection
    # names: one ASCII letter followed by letters, digits, "_" or "-".
    # It is applied with ``fullmatch`` because ``re.match`` with a ``$``
    # anchor also accepts a value that ends in a newline.
    _IDENTIFIER_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]*")

    # Project IDs that are accepted but reported with a warning.
    _RESERVED_PROJECT_IDS = frozenset({"default", "global", "admin", "system"})

    def __init__(self):
        """Initialize the validator (it holds no per-instance state)."""

    def validate_structure(self, config_data: dict[str, Any]) -> None:
        """Validate the overall configuration structure.

        Args:
            config_data: Raw configuration data

        Raises:
            ValueError: If configuration structure is invalid
        """
        _get_logger().debug("Validating configuration structure")

        # The top-level container must be a mapping before any key lookup.
        if not isinstance(config_data, dict):
            raise ValueError("Configuration must be a dictionary")

        # The 'projects' section is mandatory.
        if "projects" not in config_data:
            raise ValueError("Configuration must contain 'projects' section")

        self._validate_projects_section(config_data["projects"])

        # The 'global' section is optional; validate it only when present.
        if "global" in config_data:
            self._validate_global_section(config_data["global"])

        _get_logger().debug("Configuration structure validation completed")

    def _validate_projects_section(self, projects_data: Any) -> None:
        """Validate the projects section.

        Each project ID and project body is validated, and collection names
        are required to be unique across all projects.

        Args:
            projects_data: Projects configuration data

        Raises:
            ValueError: If projects section is invalid
        """
        if not isinstance(projects_data, dict):
            raise ValueError("'projects' section must be a dictionary")

        if not projects_data:
            raise ValueError("'projects' section cannot be empty")

        project_ids = set()
        collection_names = set()

        for project_id, project_config in projects_data.items():
            self._validate_project_id(project_id)

            # Defensive check: a plain dict never yields the same key twice,
            # so this can only trigger for a dict subclass with custom
            # iteration. Kept so existing behavior is unchanged.
            if project_id in project_ids:
                raise ValueError(f"Duplicate project ID: '{project_id}'")
            project_ids.add(project_id)

            # Guarantees project_config is a dict before the lookup below.
            self._validate_project_config(project_id, project_config)

            # Two projects must not write into the same collection.
            if "collection_name" in project_config:
                collection_name = project_config["collection_name"]
                if collection_name in collection_names:
                    raise ValueError(
                        f"Duplicate collection name '{collection_name}' "
                        f"found in project '{project_id}'"
                    )
                collection_names.add(collection_name)

    def _validate_project_config(self, project_id: str, project_config: Any) -> None:
        """Validate individual project configuration.

        ``display_name`` is required; ``description``, ``collection_name``,
        ``sources`` and ``overrides`` are checked only when present.

        Args:
            project_id: Project identifier
            project_config: Project configuration data

        Raises:
            ValueError: If project configuration is invalid
        """
        if not isinstance(project_config, dict):
            raise ValueError(
                f"Project '{project_id}' configuration must be a dictionary"
            )

        # Required field.
        if "display_name" not in project_config:
            raise ValueError(f"Project '{project_id}' must have a 'display_name'")

        display_name = project_config["display_name"]
        if not isinstance(display_name, str) or not display_name.strip():
            raise ValueError(
                f"Project '{project_id}' display_name must be a non-empty string"
            )

        # Optional fields.
        if "description" in project_config:
            description = project_config["description"]
            if description is not None and not isinstance(description, str):
                raise ValueError(
                    f"Project '{project_id}' description must be a string or null"
                )

        if "collection_name" in project_config:
            collection_name = project_config["collection_name"]
            if not isinstance(collection_name, str) or not collection_name.strip():
                raise ValueError(
                    f"Project '{project_id}' collection_name must be a non-empty string"
                )
            self._validate_collection_name(collection_name)

        if "sources" in project_config:
            self._validate_sources_section(project_config["sources"])

        if "overrides" in project_config:
            overrides = project_config["overrides"]
            if not isinstance(overrides, dict):
                raise ValueError(
                    f"Project '{project_id}' overrides must be a dictionary"
                )

    def _validate_sources_section(self, sources_data: Any) -> None:
        """Validate sources configuration.

        The expected layout is ``{source_type: {source_name: {...}}}``. Only
        the container types and the non-emptiness of each source type are
        checked here.

        Args:
            sources_data: Sources configuration data

        Raises:
            ValueError: If sources configuration is invalid
        """
        if not isinstance(sources_data, dict):
            raise ValueError("'sources' section must be a dictionary")

        # Allow empty sources section for testing purposes.
        # In production, users would typically have at least one source configured.
        if not sources_data:
            _get_logger().debug(
                "Sources section is empty - this is allowed but no data will be ingested"
            )
            return

        for source_type, source_configs in sources_data.items():
            if not isinstance(source_configs, dict):
                raise ValueError(f"Source type '{source_type}' must be a dictionary")

            if not source_configs:
                raise ValueError(f"Source type '{source_type}' cannot be empty")

            for source_name, source_config in source_configs.items():
                if not isinstance(source_config, dict):
                    raise ValueError(
                        f"Source '{source_name}' in '{source_type}' must be a dictionary"
                    )

                # Note: source_type and source fields are automatically injected
                # by the parser so we don't need to validate their presence here.

    def _validate_global_section(self, global_data: Any) -> None:
        """Validate global configuration section.

        Args:
            global_data: Global configuration data

        Raises:
            ValueError: If global configuration is invalid
        """
        if not isinstance(global_data, dict):
            raise ValueError("'global' section must be a dictionary")

        # The actual validation of global config fields will be handled
        # by the GlobalConfig pydantic model, so we just do basic structure
        # checks here.
        if "qdrant" in global_data:
            qdrant_config = global_data["qdrant"]
            if not isinstance(qdrant_config, dict):
                raise ValueError("'global.qdrant' must be a dictionary")

            if "collection_name" in qdrant_config:
                collection_name = qdrant_config["collection_name"]
                if not isinstance(collection_name, str) or not collection_name.strip():
                    raise ValueError(
                        "'global.qdrant.collection_name' must be a non-empty string"
                    )
                self._validate_collection_name(collection_name)

    def _validate_project_id(self, project_id: str) -> None:
        """Validate project ID format.

        Reserved IDs ("default", "global", "admin", "system", compared
        case-insensitively) are accepted but logged as a warning.

        Args:
            project_id: Project identifier to validate

        Raises:
            ValueError: If project ID is invalid
        """
        if not isinstance(project_id, str):
            raise ValueError("Project ID must be a string")

        if not project_id.strip():
            raise ValueError("Project ID cannot be empty")

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(project_id):
            raise ValueError(
                f"Invalid project ID '{project_id}'. "
                "Project IDs must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

        if project_id.lower() in self._RESERVED_PROJECT_IDS:
            _get_logger().warning(
                f"Project ID '{project_id}' is reserved and may cause conflicts"
            )

    def _validate_source_name(self, source_name: str) -> None:
        """Validate source name format.

        Args:
            source_name: Source name to validate

        Raises:
            ValueError: If source name is invalid
        """
        if not isinstance(source_name, str):
            raise ValueError("Source name must be a string")

        if not source_name.strip():
            raise ValueError("Source name cannot be empty")

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(source_name):
            raise ValueError(
                f"Invalid source name '{source_name}'. "
                "Source names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )

    def _validate_source_config(
        self, source_type: str, source_name: str, source_config: Any
    ) -> None:
        """Validate individual source configuration.

        Args:
            source_type: Type of the source
            source_name: Name of the source
            source_config: Source configuration data

        Raises:
            ValueError: If source configuration is invalid
        """
        if not isinstance(source_config, dict):
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration must be a dictionary"
            )

        # Basic validation - specific source validation will be handled
        # by the individual source config classes.
        if not source_config:
            raise ValueError(
                f"Source '{source_name}' of type '{source_type}' "
                "configuration cannot be empty"
            )

    def _validate_collection_name(self, collection_name: str) -> None:
        """Validate QDrant collection name format.

        Args:
            collection_name: Collection name to validate

        Raises:
            ValueError: If collection name is invalid
        """
        # Mirror the other identifier validators: a non-string is reported as
        # a ValueError instead of failing inside len() with a TypeError.
        if not isinstance(collection_name, str):
            raise ValueError("Collection name must be a string")

        if len(collection_name) > 255:
            raise ValueError(
                f"Collection name '{collection_name}' is too long (max 255 characters)"
            )

        # fullmatch (not match + "$") so a trailing newline is rejected.
        if not self._IDENTIFIER_RE.fullmatch(collection_name):
            raise ValueError(
                f"Invalid collection name '{collection_name}'. "
                "Collection names must start with a letter and contain only "
                "letters, numbers, underscores, and hyphens."
            )