Coverage for src/qdrant_loader_mcp_server/mcp/handlers/intelligence/utils.py: 56%

52 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3import hashlib 

4import json 

5from collections.abc import Iterable, Mapping 

6from datetime import date, datetime 

7from enum import Enum 

8from pathlib import Path 

9from typing import Any 

10 

11 

def get_field(obj: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from a mapping or an attribute-style object.

    Args:
        obj: Either a mapping (``dict`` or any other ``Mapping``) or an
            arbitrary object exposing the field as an attribute.
        key: Name of the mapping key / attribute to look up.
        default: Value returned when the key or attribute is absent.

    Returns:
        The stored value, or ``default`` when ``obj`` does not provide ``key``.
    """
    # Any Mapping is treated as key-addressable, not just ``dict``, so
    # read-only or custom mapping types are not mistaken for plain objects.
    if isinstance(obj, Mapping):
        return obj.get(key, default)
    return getattr(obj, key, default)

16 

17 

def _to_stable_primitive(value: Any) -> Any:
    """Recursively convert ``value`` into a deterministic, JSON-friendly form.

    Conversion rules, applied in order:

    * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned as-is.
    * ``datetime`` / ``date`` become their ISO-8601 string.
    * ``bytes`` / ``bytearray`` are decoded as UTF-8; undecodable bytes are
      replaced with U+FFFD.
    * ``Path`` becomes ``str(path)``.
    * ``Enum`` members are replaced by their (recursively converted) value.
    * Mappings become a ``dict`` with stringified keys, ordered by key.
    * Any other iterable becomes a list of converted items, sorted by each
      item's JSON representation.
    * Everything else falls back to ``str(value)``.
    """
    if value is None or isinstance(value, str | int | float | bool):
        return value
    if isinstance(value, datetime | date):
        return value.isoformat()
    if isinstance(value, bytes | bytearray):
        # ``errors="replace"`` yields the exact text for valid UTF-8 and only
        # substitutes U+FFFD for invalid sequences, so no strict attempt with
        # a fallback is needed.
        return value.decode("utf-8", errors="replace")
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, Enum):
        # Enum values may themselves be non-JSON types (datetime, Path, set,
        # ...); normalise them too so the final json.dumps cannot fail.
        return _to_stable_primitive(value.value)
    if isinstance(value, Mapping):
        # Stringify keys and order by key so the result does not depend on
        # the mapping's insertion order.
        converted_items = ((str(k), _to_stable_primitive(v)) for k, v in value.items())
        return dict(sorted(converted_items, key=lambda kv: kv[0]))
    if isinstance(value, Iterable):
        # Strings and bytes were handled above; this covers list/tuple/set etc.
        converted_list = [_to_stable_primitive(item) for item in value]
        try:
            # Sorting by JSON text gives a total order even across mixed types.
            return sorted(
                converted_list,
                key=lambda item: json.dumps(item, sort_keys=True, ensure_ascii=False),
            )
        except Exception:
            # Deliberate best-effort: keep iteration order if an item cannot
            # be rendered as JSON for sorting.
            return converted_list
    return str(value)


def get_or_create_document_id(doc: Any) -> str:
    """Return the document's explicit ID, or derive a deterministic one.

    If ``doc`` carries a ``document_id`` whose stripped string form is
    non-empty, that string is returned unchanged. Otherwise an ID of the form
    ``"<source_type>:<source_title>:<hash>"`` is built, where ``<hash>`` is
    the first 10 hex characters of the SHA-256 of a canonical JSON dump of the
    document's identifying fields.

    Args:
        doc: A mapping or attribute-style object; fields are read through
            ``get_field``.

    Returns:
        The explicit or derived document identifier.
    """
    explicit_id = get_field(doc, "document_id", None)
    if explicit_id is not None:
        explicit_id_str = str(explicit_id).strip()
        if explicit_id_str:
            return explicit_id_str

    raw_source_type = get_field(doc, "source_type", "unknown")
    raw_source_title = get_field(doc, "source_title", "unknown")

    # ":" is the separator of the generated ID, so it is replaced inside the
    # individual components.
    source_type = str(raw_source_type or "unknown").replace(":", "-")
    source_title = str(raw_source_title or "unknown").replace(":", "-")

    candidate_fields = {
        "title": get_field(doc, "title", None),
        "source_type": source_type,
        "source_title": source_title,
        "source_url": get_field(doc, "source_url", None),
        "file_path": get_field(doc, "file_path", None),
        "repo_name": get_field(doc, "repo_name", None),
        "parent_id": get_field(doc, "parent_id", None),
        "original_filename": get_field(doc, "original_filename", None),
        "id": get_field(doc, "id", None),
    }

    # Fields that are absent (None) do not contribute to the hash.
    stable_fields = {
        name: _to_stable_primitive(value)
        for name, value in candidate_fields.items()
        if value is not None
    }

    payload = json.dumps(stable_fields, sort_keys=True, ensure_ascii=False)
    # "surrogatepass" keeps hashing from raising on lone surrogates (e.g. file
    # names decoded with surrogateescape); well-formed text encodes the same.
    digest = hashlib.sha256(payload.encode("utf-8", errors="surrogatepass"))
    short_hash = digest.hexdigest()[:10]
    return f"{source_type}:{source_title}:{short_hash}"