Coverage for src/qdrant_loader_mcp_server/search/components/field_query_parser.py

1"""Field query parser for handling field-specific search syntax."""

3import re

4from dataclasses import dataclass

6from qdrant_client.http import models

8from ...utils.logging import LoggingConfig

10logger = LoggingConfig.get_logger(__name__)

@dataclass
class FieldQuery:
    """A single ``field:value`` term extracted from a search query."""

    # Field name as written before the colon (e.g. "document_id").
    field_name: str

    # Value to match; for a quoted term this is the text between the quotes.
    field_value: str

    # The exact matched text, e.g. 'title:"API Documentation"'.
    original_query: str

    # Any remaining text after field extraction.
    remaining_query: str = ""

@dataclass
class ParsedQuery:
    """A fully parsed query: extracted field filters plus free-text search."""

    # Field terms recognised in the query, in the order they were found.
    field_queries: list[FieldQuery]

    # What is left of the query once the field terms are removed.
    text_query: str

    # The query string exactly as it was supplied.
    original_query: str

class FieldQueryParser:
    """Parses field-specific query syntax and converts it to Qdrant filters.

    A query may mix ``field:value`` / ``field:"quoted value"`` terms with
    free text. Terms whose field name appears in ``SUPPORTED_FIELDS`` are
    extracted as filters; everything else is kept as the text query.
    """

    # Supported field mappings (field_name -> qdrant_payload_key)
    SUPPORTED_FIELDS = {
        "document_id": "document_id",
        "source_type": "source_type",
        "source": "source",
        "project_id": "project_id",
        "title": "title",
        "url": "url",
        "file_path": "file_path",
        "file_name": "file_name",
        "file_type": "file_type",
        "collection_name": "collection_name",
        # Nested metadata fields
        "chunk_index": "metadata.chunk_index",
        "total_chunks": "metadata.total_chunks",
        "chunking_strategy": "metadata.chunking_strategy",
        "original_file_type": "metadata.original_file_type",
        "conversion_method": "metadata.conversion_method",
    }

    # Field query pattern: field_name:value or field_name:"quoted value"
    # group(1) = field name, group(2) = quoted value, group(3) = bare value
    FIELD_PATTERN = re.compile(r'(\w+):(?:"([^"]+)"|([^\s]+))')

    def __init__(self):
        """Initialize the field query parser."""
        self.logger = LoggingConfig.get_logger(__name__)
        # Fields that should be treated as numeric for exact matching
        self._numeric_fields = {"chunk_index", "total_chunks"}

    def _convert_value_for_key(self, payload_key: str, raw_value: str) -> int | str:
        """Convert a raw string value to the correct type for the payload key.

        Handles both top-level and nested keys (e.g. ``metadata.chunk_index``):
        only the last dotted segment decides whether the field is numeric.

        Args:
            payload_key: Qdrant payload key the value will be matched against
            raw_value: Value exactly as it appeared in the query

        Returns:
            ``int(raw_value)`` for known numeric fields, otherwise ``raw_value``
            unchanged. A numeric field with a non-integer value logs a warning
            and falls back to the original string.
        """
        key_name = payload_key.split(".")[-1]
        if key_name not in self._numeric_fields:
            return raw_value

        # Keep the try body to the one call that can actually fail.
        try:
            return int(raw_value)
        except (ValueError, TypeError):
            self.logger.warning(
                f"Expected numeric value for '{payload_key}', got '{raw_value}'. Using original value."
            )
            return raw_value

    def parse_query(self, query: str) -> ParsedQuery:
        """Parse a query string into field queries and text search.

        Args:
            query: The input query string

        Returns:
            ParsedQuery object with separated field queries and text search

        Examples:
            "document_id:abc123" -> field_queries=[FieldQuery(field_name="document_id", field_value="abc123")]
            "document_id:abc123 python tutorial" -> field + text search
            "source_type:confluence title:\"API Documentation\"" -> multiple field queries
        """
        field_queries = []
        kept_parts = []  # slices of the query that are NOT supported field terms
        cursor = 0

        # finditer yields matches left-to-right without overlap, so a single
        # forward pass can both collect the fields and stitch the leftover text.
        for match in self.FIELD_PATTERN.finditer(query):
            field_name = match.group(1)
            if field_name not in self.SUPPORTED_FIELDS:
                # Unsupported "name:value" text stays part of the text query.
                self.logger.warning(f"Unsupported field: {field_name}")
                continue

            field_value = match.group(2) or match.group(3)  # quoted or unquoted value
            field_queries.append(
                FieldQuery(
                    field_name=field_name,
                    field_value=field_value,
                    original_query=match.group(0),
                )
            )
            start, end = match.span()
            kept_parts.append(query[cursor:start])
            cursor = end
            self.logger.debug(f"Parsed field query: {field_name}={field_value}")

        kept_parts.append(query[cursor:])

        # Clean up remaining text (collapse whitespace runs, trim the ends)
        text_query = re.sub(r"\s+", " ", "".join(kept_parts)).strip()

        parsed = ParsedQuery(
            field_queries=field_queries, text_query=text_query, original_query=query
        )

        self.logger.debug(
            f"Parsed query: {len(field_queries)} field queries, text: '{text_query}'"
        )
        return parsed

    @staticmethod
    def _build_condition(payload_key, match):
        """Build the Qdrant condition that applies *match* to *payload_key*.

        A dotted key (e.g. ``metadata.chunk_index``) is split on its first dot
        and expressed as a nested condition on the parent key; any other key
        becomes a direct field condition.
        """
        if "." not in payload_key:
            return models.FieldCondition(key=payload_key, match=match)

        parent_key, child_key = payload_key.split(".", 1)
        return models.NestedCondition(
            nested=models.Nested(
                key=parent_key,
                filter=models.Filter(
                    must=[models.FieldCondition(key=child_key, match=match)]
                ),
            )
        )

    def create_qdrant_filter(
        self,
        field_queries: list[FieldQuery] | None,
        project_ids: list[str] | None = None,
    ) -> models.Filter | None:
        """Create a Qdrant filter from field queries.

        Args:
            field_queries: List of field queries to convert to filters
            project_ids: Optional project ID filters to include

        Returns:
            Qdrant Filter object or None if no filters needed

        Raises:
            KeyError: If a field query names a field that is not in
                ``SUPPORTED_FIELDS``.
        """
        must_conditions = []
        should_conditions = []

        # Every field query becomes a mandatory exact-match condition.
        for field_query in field_queries or []:
            payload_key = self.SUPPORTED_FIELDS[field_query.field_name]
            match_value = self._convert_value_for_key(
                payload_key, field_query.field_value
            )
            must_conditions.append(
                self._build_condition(
                    payload_key, models.MatchValue(value=match_value)
                )
            )
            self.logger.debug(
                f"Added filter condition: {payload_key} = {match_value}"
            )

        # An explicit project_id field query takes precedence over project_ids.
        has_project_id_field_query = any(
            fq.field_name == "project_id" for fq in field_queries or []
        )

        if project_ids and not has_project_id_field_query:
            # Support top-level project_id, root 'source', and nested
            # metadata.project_id. They go into `should` (OR semantics) so
            # either storage layout matches.
            should_conditions.extend(
                [
                    self._build_condition(
                        "project_id", models.MatchAny(any=project_ids)
                    ),
                    self._build_condition(
                        "source", models.MatchAny(any=project_ids)
                    ),
                    self._build_condition(
                        "metadata.project_id", models.MatchAny(any=project_ids)
                    ),
                ]
            )
            self.logger.debug(
                f"Added project filter (top-level or nested): {project_ids}"
            )
        elif project_ids and has_project_id_field_query:
            self.logger.debug(
                "Skipping project filter because a project_id field query is present"
            )

        # Return filter only if we have conditions
        if must_conditions or should_conditions:
            return models.Filter(must=must_conditions, should=should_conditions)

        return None

    def should_use_filter_only(self, parsed_query: ParsedQuery) -> bool:
        """Determine if we should use filter-only search (no text search).

        Args:
            parsed_query: The parsed query object

        Returns:
            True if this should be a filter-only search (exact field matching)
        """
        # Without field queries there is nothing to filter on.
        if not parsed_query.field_queries:
            return False

        # Field queries with no meaningful text: the filter is the whole search.
        if not parsed_query.text_query.strip():
            return True

        # Special case: document_id queries should be exact matches even when
        # extra text is present.
        return any(
            fq.field_name == "document_id" for fq in parsed_query.field_queries
        )

    def get_supported_fields(self) -> list[str]:
        """Get list of supported field names for queries.

        Returns:
            List of supported field names
        """
        return list(self.SUPPORTED_FIELDS)

Coverage for src/qdrant_loader_mcp_server/search/components/field_query_parser.py: 62%

88 statements