Coverage for src / qdrant_loader_core / config / sparse.py: 98%
49 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-11 09:34 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-11 09:34 +0000
"""Shared sparse/hybrid runtime configuration.

The contract for sparse/hybrid retrieval lives here: a Pydantic ``BaseModel``
with field validators that explicitly reject anything outside ``bool`` /
``"true"`` / ``"false"`` for bool fields and non-empty strings for string
fields. YAML is the sole source of truth — callers pass the parsed global
config to :meth:`SparseRuntimeConfig.from_global_config`; layering and
missing-value handling live in this module.
"""
11from __future__ import annotations
13from collections.abc import Mapping
14from typing import Any
16from pydantic import BaseModel, ConfigDict, Field, field_validator
class SparseRuntimeConfig(BaseModel):
    """Sparse/hybrid retrieval configuration.

    Immutable (``frozen=True``) and closed (``extra="forbid"``) — unknown
    fields at construction raise ``ValidationError``, and accepted fields can
    only be replaced via :meth:`model_copy`. ``use_qdrant_hybrid`` only
    affects retrieval (the MCP server); the loader package ignores it.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Whether new collections carry a sparse vector next to the dense one.
    enabled: bool = Field(
        default=True,
        description=(
            "Declare whether collections are created with dense+sparse vectors. "
            "True means new collections always include a sparse vector; failures "
            "propagate. False means dense-only — set this explicitly when "
            "targeting Qdrant servers that do not support sparse vectors."
        ),
    )
    # Identifier of the sparse encoder.
    model: str = Field(
        default="bm25",
        min_length=1,
        description="Sparse encoder model identifier (e.g. 'bm25').",
    )
    # Named-vector keys used inside the Qdrant collection.
    dense_vector_name: str = Field(
        default="dense",
        min_length=1,
        description="Named-vector key for the dense embedding in Qdrant.",
    )
    sparse_vector_name: str = Field(
        default="sparse",
        min_length=1,
        description="Named-vector key for the sparse embedding in Qdrant.",
    )
    # Retrieval-side switch.
    use_qdrant_hybrid: bool = Field(
        default=True,
        description="Use Qdrant server-side fusion for retrieval (MCP server only).",
    )

    @field_validator("enabled", "use_qdrant_hybrid", mode="before")
    @classmethod
    def _strict_bool(cls, v: Any) -> bool:
        """Coerce a raw config value to ``bool`` under a deliberately narrow rule.

        Real ``bool`` values pass through unchanged. Strings are stripped and
        lower-cased, and only ``"true"`` / ``"false"`` are accepted. Everything
        else (``1``, ``"yes"``, ``"on"``, ``None``, ...) is refused so that a
        config file cannot enable a flag by accident.

        Raises:
            ValueError: if ``v`` is neither a bool nor one of the two words.
        """
        if isinstance(v, bool):
            return v
        # Non-strings map to None, which never matches either accepted word.
        token = v.strip().lower() if isinstance(v, str) else None
        if token in ("true", "false"):
            return token == "true"
        raise ValueError(f"expected bool or 'true'/'false', got {v!r}")

    @field_validator("model", "dense_vector_name", "sparse_vector_name", mode="before")
    @classmethod
    def _strict_str(cls, v: Any) -> str:
        """Return ``v`` stripped of surrounding whitespace, if it is a usable string.

        Raises:
            ValueError: if ``v`` is not a ``str`` or is empty after stripping.
        """
        # Non-strings collapse to "" so they share the empty-string failure path.
        stripped = v.strip() if isinstance(v, str) else ""
        if not stripped:
            raise ValueError(f"expected non-empty string, got {v!r}")
        return stripped

    @classmethod
    def from_global_config(
        cls, global_config: Mapping[str, Any] | None = None
    ) -> SparseRuntimeConfig:
        """Create the config from the parsed global YAML mapping.

        Values are layered in this order, later layers overriding earlier ones:

        1. ``global_config['llm']['sparse']``
        2. ``global_config['llm']['retrieval']['sparse']``
        3. ``global_config['llm']['retrieval']['use_qdrant_hybrid']``

        Keys that are not fields of this model are discarded before
        construction.

        Args:
            global_config: Parsed global configuration, or ``None``.

        Returns:
            A default instance when ``global_config`` or its ``llm`` entry is
            not a mapping; otherwise an instance built from the merged values.

        Raises:
            pydantic.ValidationError: if a recognised key holds an invalid value.
        """
        llm = global_config.get("llm") if isinstance(global_config, Mapping) else None
        if not isinstance(llm, Mapping):
            return cls()

        merged: dict[str, Any] = {}

        # Layer 1: the top-level sparse block.
        top_level_sparse = llm.get("sparse")
        if isinstance(top_level_sparse, Mapping):
            merged.update(top_level_sparse)

        # Layers 2 and 3: the retrieval block overrides the top-level block.
        retrieval = llm.get("retrieval")
        if isinstance(retrieval, Mapping):
            retrieval_sparse = retrieval.get("sparse")
            if isinstance(retrieval_sparse, Mapping):
                merged.update(retrieval_sparse)
            if "use_qdrant_hybrid" in retrieval:
                merged["use_qdrant_hybrid"] = retrieval["use_qdrant_hybrid"]

        # Only the model's own fields form the contract; silently ignore any
        # other keys the YAML carries instead of tripping extra="forbid".
        known_fields = cls.model_fields
        return cls(**{key: value for key, value in merged.items() if key in known_fields})