Coverage for src/qdrant_loader_mcp_server/mcp/handlers/intelligence/utils.py: 56%
52 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1from __future__ import annotations
3import hashlib
4import json
5from collections.abc import Iterable, Mapping
6from datetime import date, datetime
7from enum import Enum
8from pathlib import Path
9from typing import Any
def get_field(obj: Any, key: str, default: Any = None) -> Any:
    """Look up ``key`` on ``obj``, falling back to ``default``.

    Dictionaries are queried by key via ``dict.get``; every other object is
    queried by attribute name via ``getattr``. A dict never falls through to
    attribute lookup, even when the key is absent.
    """
    if not isinstance(obj, dict):
        return getattr(obj, key, default)
    return obj.get(key, default)
def _to_stable_primitive(value: Any) -> Any:
    """Convert ``value`` into a deterministic, JSON-serializable structure.

    The result contains only ``None``, ``str``, ``int``, ``float``, ``bool``,
    ``dict`` (string keys, sorted) and ``list`` values, so that
    ``json.dumps`` on it cannot fail on an unsupported type and yields the
    same text for equal inputs.
    """
    # None and basic primitives pass through untouched.
    if value is None or isinstance(value, str | int | float | bool):
        return value
    # datetime/date
    if isinstance(value, datetime | date):
        return value.isoformat()
    # bytes/bytearray: valid UTF-8 decodes unchanged, undecodable bytes
    # become U+FFFD instead of raising.
    if isinstance(value, bytes | bytearray):
        return value.decode("utf-8", errors="replace")
    # Path
    if isinstance(value, Path):
        return str(value)
    # Enum: normalize the member's value as well, since it may itself be a
    # type json cannot serialize (set, datetime, arbitrary object, ...).
    if isinstance(value, Enum):
        return _to_stable_primitive(value.value)
    # Mapping: stringify keys, recurse on values, sort by key for determinism.
    if isinstance(value, Mapping):
        converted_items = (
            (str(k), _to_stable_primitive(v)) for k, v in value.items()
        )
        return dict(sorted(converted_items, key=lambda kv: kv[0]))
    # Any other iterable (list/tuple/set/generator ...); str and bytes were
    # already handled above.
    if isinstance(value, Iterable):
        converted_list = [_to_stable_primitive(v) for v in value]
        # NOTE: every iterable is sorted by its JSON representation, not only
        # unordered ones, so the element order of lists/tuples does not
        # influence the resulting ID.
        try:
            return sorted(
                converted_list,
                key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False),
            )
        except Exception:
            # Best effort: keep the original order if a sort key cannot be built.
            return converted_list
    # Fallback to string representation for anything else.
    return str(value)


def get_or_create_document_id(doc: Any) -> str:
    """Return the document's explicit ID, or derive a deterministic one.

    ``doc`` may be a dict or an object with attributes; fields are read via
    ``get_field``. If ``document_id`` is present and non-blank after
    ``str()`` and ``strip()``, that stripped string is returned as-is.

    Otherwise the ID has the form ``"{source_type}:{source_title}:{hash}"``:

    * ``source_type`` / ``source_title`` default to ``"unknown"`` when missing
      or falsy, and have ``":"`` replaced by ``"-"`` (presumably so the
      colon-separated ID stays unambiguous).
    * ``hash`` is the first 10 hex characters of the SHA-256 of a canonical
      JSON dump of the identifying fields that are not ``None``.
    """
    explicit_id = get_field(doc, "document_id", None)
    if explicit_id is not None:
        explicit_id_str = str(explicit_id).strip()
        if explicit_id_str:
            return explicit_id_str

    raw_source_type = get_field(doc, "source_type", "unknown")
    raw_source_title = get_field(doc, "source_title", "unknown")

    source_type = str(raw_source_type or "unknown").replace(":", "-")
    source_title = str(raw_source_title or "unknown").replace(":", "-")

    candidate_fields = {
        "title": get_field(doc, "title", None),
        "source_type": source_type,
        "source_title": source_title,
        "source_url": get_field(doc, "source_url", None),
        "file_path": get_field(doc, "file_path", None),
        "repo_name": get_field(doc, "repo_name", None),
        "parent_id": get_field(doc, "parent_id", None),
        "original_filename": get_field(doc, "original_filename", None),
        "id": get_field(doc, "id", None),
    }

    # Absent fields are dropped rather than hashed as null.
    stable_fields = {
        k: _to_stable_primitive(v) for k, v in candidate_fields.items() if v is not None
    }

    payload = json.dumps(
        stable_fields,
        sort_keys=True,
        ensure_ascii=False,
    )
    # "surrogatepass" yields the same bytes as strict UTF-8 for well-formed
    # text, but does not raise on lone surrogates (e.g. from JSON "\udXXX"
    # escapes or surrogateescape-decoded file paths).
    digest_input = payload.encode("utf-8", errors="surrogatepass")
    short_hash = hashlib.sha256(digest_input).hexdigest()[:10]
    return f"{source_type}:{source_title}:{short_hash}"