Coverage for src / qdrant_loader_core / config / sparse.py: 98%

49 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-06-11 09:34 +0000

1"""Shared sparse/hybrid runtime configuration. 

2 

3The contract for sparse/hybrid retrieval lives here: a Pydantic ``BaseModel`` 

4with field validators that explicitly reject anything outside ``bool`` / 

5``"true"`` / ``"false"`` for bool fields and non-empty strings for string 

6fields. YAML is the sole source of truth — callers pass the parsed global 

7config to :meth:`SparseRuntimeConfig.from_global_config`; layering and 

8missing-value handling live in this module. 

9""" 

10 

11from __future__ import annotations 

12 

13from collections.abc import Mapping 

14from typing import Any 

15 

16from pydantic import BaseModel, ConfigDict, Field, field_validator 

17 

18 

class SparseRuntimeConfig(BaseModel):
    """Sparse/hybrid retrieval configuration.

    Immutable (``frozen=True``) and closed (``extra="forbid"``) — unknown
    fields at construction raise ``ValidationError``, and accepted fields can
    only be replaced via :meth:`model_copy`. ``use_qdrant_hybrid`` only
    affects retrieval (the MCP server); the loader package ignores it.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Whether new collections carry a sparse vector next to the dense one.
    enabled: bool = Field(
        default=True,
        description=(
            "Declare whether collections are created with dense+sparse vectors. "
            "True means new collections always include a sparse vector; failures "
            "propagate. False means dense-only — set this explicitly when "
            "targeting Qdrant servers that do not support sparse vectors."
        ),
    )
    # Identifier of the sparse encoder.
    model: str = Field(
        default="bm25",
        min_length=1,
        description="Sparse encoder model identifier (e.g. 'bm25').",
    )
    # Named-vector keys used for the dense and sparse embeddings.
    dense_vector_name: str = Field(
        default="dense",
        min_length=1,
        description="Named-vector key for the dense embedding in Qdrant.",
    )
    sparse_vector_name: str = Field(
        default="sparse",
        min_length=1,
        description="Named-vector key for the sparse embedding in Qdrant.",
    )
    # Retrieval-side switch.
    use_qdrant_hybrid: bool = Field(
        default=True,
        description="Use Qdrant server-side fusion for retrieval (MCP server only).",
    )

    @field_validator("enabled", "use_qdrant_hybrid", mode="before")
    @classmethod
    def _strict_bool(cls, v: Any) -> bool:
        """Validate a boolean field without Pydantic's lenient coercion.

        Real ``bool`` values pass through unchanged. Strings are accepted
        only when, after trimming surrounding whitespace and lower-casing,
        they spell ``"true"`` or ``"false"``. Everything else (``1``,
        ``"yes"``, ``"on"``, ``None``, ...) is rejected so a config file can
        never enable a feature by accident.

        Raises:
            ValueError: if ``v`` is neither a bool nor a true/false string.
        """
        if isinstance(v, bool):
            return v
        # Non-strings map to None, which can never match the accepted literals.
        literal = v.strip().lower() if isinstance(v, str) else None
        if literal in ("true", "false"):
            return literal == "true"
        raise ValueError(f"expected bool or 'true'/'false', got {v!r}")

    @field_validator("model", "dense_vector_name", "sparse_vector_name", mode="before")
    @classmethod
    def _strict_str(cls, v: Any) -> str:
        """Validate a string field: must be a ``str`` with visible content.

        Returns the value with surrounding whitespace removed.

        Raises:
            ValueError: if ``v`` is not a string, or is empty / whitespace-only.
        """
        # Non-strings collapse to "" so they share the rejection path below.
        stripped = v.strip() if isinstance(v, str) else ""
        if not stripped:
            raise ValueError(f"expected non-empty string, got {v!r}")
        return stripped

    @classmethod
    def from_global_config(
        cls, global_config: Mapping[str, Any] | None = None
    ) -> SparseRuntimeConfig:
        """Create the config from the parsed global YAML mapping.

        Values are layered in this order, later layers overriding earlier
        ones:

        1. ``global_config['llm']['sparse']``
        2. ``global_config['llm']['retrieval']['sparse']``
        3. ``global_config['llm']['retrieval']['use_qdrant_hybrid']``

        Any section that is absent or is not a mapping is skipped; when
        ``global_config`` or its ``llm`` entry is not a mapping, an
        all-defaults instance is returned. Keys that are not model fields
        are discarded before construction. Values that fail validation
        raise ``pydantic.ValidationError``.
        """
        llm_section = (
            global_config.get("llm") if isinstance(global_config, Mapping) else None
        )
        if not isinstance(llm_section, Mapping):
            return cls()

        retrieval_section = llm_section.get("retrieval")
        if not isinstance(retrieval_section, Mapping):
            # Treat a missing/mis-shaped retrieval block as an empty one.
            retrieval_section = {}

        # Apply the layers in precedence order: the retrieval block comes
        # last so it wins wherever it overlaps the top-level sparse block.
        merged: dict[str, Any] = {}
        for layer in (llm_section.get("sparse"), retrieval_section.get("sparse")):
            if isinstance(layer, Mapping):
                merged.update(layer)
        if "use_qdrant_hybrid" in retrieval_section:
            merged["use_qdrant_hybrid"] = retrieval_section["use_qdrant_hybrid"]

        # Keep only keys the model declares, so the contract surface is the
        # model's fields rather than whatever the YAML happens to contain.
        known_fields = cls.model_fields
        return cls(
            **{name: value for name, value in merged.items() if name in known_fields}
        )