Coverage for src/qdrant_loader/connectors/shared/attachments/metadata.py: 69%

67 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1from __future__ import annotations 

2 

3from typing import Any 

4 

5from qdrant_loader.core.attachment_downloader import AttachmentMetadata 

6 

7 

def normalize_attachment_metadata(meta: dict, *, parent_id: str) -> AttachmentMetadata:
    """Build an `AttachmentMetadata` from a loosely structured provider dict.

    Most fields are read under a primary key and then an alias
    (``id``/``attachment_id``, ``filename``/``name``,
    ``mime_type``/``contentType``, ``download_url``/``content``,
    ``created``/``created_at``, ``updated``/``updated_at``,
    ``author``/``creator``). A falsy value under the primary key falls
    through to the alias. ``size`` is read from ``size`` only.

    Args:
        meta: Raw attachment fields keyed as described above.
        parent_id: Value stored as ``parent_document_id``.

    Returns:
        The populated `AttachmentMetadata`.

    Raises:
        ValueError: If the resulting id or download URL is empty or equal to
            the placeholder string ``"None"``. ``int()`` may also raise for a
            ``size`` value that cannot be converted.
    """

    def lookup(primary: str, alias: str) -> Any:
        # First truthy value wins; otherwise whatever the alias holds
        # (possibly None or another falsy value).
        return meta.get(primary) or meta.get(alias)

    # A missing value becomes the placeholder "None" (never str(None) by
    # accident); the required-field check below rejects that placeholder.
    raw_identifier = lookup("id", "attachment_id")
    identifier = "None" if raw_identifier is None else str(raw_identifier)

    raw_link = lookup("download_url", "content")
    link = "None" if raw_link is None else str(raw_link)

    fields = {
        "id": identifier,
        "filename": lookup("filename", "name") or "unknown",
        "size": int(meta.get("size") or 0),
        "mime_type": lookup("mime_type", "contentType")
        or "application/octet-stream",
        "download_url": link,
        "parent_document_id": parent_id,
        "created_at": lookup("created", "created_at"),
        "updated_at": lookup("updated", "updated_at"),
        "author": lookup("author", "creator"),
    }
    metadata = AttachmentMetadata(**fields)

    # Required fields are checked on the constructed object.
    required = (
        ("id", "Attachment ID is required"),
        ("download_url", "Attachment download_url is required"),
    )
    for attribute, message in required:
        current = getattr(metadata, attribute)
        if not current or current == "None":
            raise ValueError(message)

    return metadata

41 

42 

def jira_attachment_to_metadata(att: Any, *, parent_id: str) -> AttachmentMetadata:
    """Map a JiraAttachment-like object onto `AttachmentMetadata`.

    Reads ``id``, ``filename``, ``size``, ``mime_type`` and ``content_url``
    directly from ``att`` (an ``AttributeError`` propagates if one is absent).
    ``created`` and ``author`` are optional attributes.

    Args:
        att: Object exposing the attributes listed above.
        parent_id: Value stored as ``parent_document_id``.

    Returns:
        An `AttachmentMetadata` whose ``updated_at`` is always ``None``.
    """
    created_on = getattr(att, "created", None)
    uploader = getattr(att, "author", None)

    fields = {
        "id": str(att.id),
        "filename": str(att.filename),
        "size": int(att.size),
        "mime_type": str(att.mime_type),
        "download_url": str(att.content_url),
        "parent_document_id": parent_id,
        # `created` is expected to offer isoformat() when present.
        "created_at": None if created_on is None else created_on.isoformat(),
        "updated_at": None,
        # getattr with a default already yields None when `uploader` is None.
        "author": getattr(uploader, "display_name", None),
    }
    return AttachmentMetadata(**fields)

58 

59 

def _dict_or_empty(value: Any) -> dict:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def confluence_attachment_to_metadata(
    attachment_data: dict,
    *,
    base_url: str,
    parent_id: str,
) -> AttachmentMetadata | None:
    """Convert raw Confluence attachment JSON to AttachmentMetadata.

    Nested sections (``metadata``, ``extensions``, ``_links``, ``version``,
    ``history``) are tolerated when absent, ``None`` or not a dict.
    ``metadata.mediaType`` may be either a dict (``name``/``size``) or a plain
    MIME string; a string is used directly as the MIME type instead of
    causing the attachment to be dropped.

    Args:
        attachment_data: One attachment entry from a Confluence response.
        base_url: Prefix for relative download links; a trailing ``/`` is
            ignored so joined URLs never contain ``//`` at the seam.
        parent_id: Value stored as ``parent_document_id``.

    Returns:
        The populated `AttachmentMetadata`, or ``None`` if no download link is
        present or an unexpected error occurs while converting.
    """
    default_mime = "application/octet-stream"
    try:
        attachment_id = attachment_data.get("id")
        filename = attachment_data.get("title", "unknown")

        metadata = _dict_or_empty(attachment_data.get("metadata"))
        extensions = _dict_or_empty(attachment_data.get("extensions"))
        media_type = metadata.get("mediaType")
        properties = _dict_or_empty(metadata.get("properties"))

        # File size: metadata first, then extensions.fileSize as fallback.
        file_size = 0
        if "mediaType" in metadata:
            # A string mediaType carries no size; treat it as "no size here".
            file_size = _dict_or_empty(media_type).get("size", 0)
        elif "properties" in metadata:
            file_size = properties.get("size", 0)
        if not file_size:
            file_size = extensions.get("fileSize", 0)

        # MIME type: metadata first, then extensions.mediaType as fallback.
        mime_type = default_mime
        if "mediaType" in metadata:
            if isinstance(media_type, str):
                mime_type = media_type or default_mime
            else:
                mime_type = _dict_or_empty(media_type).get("name", default_mime)
        elif "properties" in metadata:
            mime_type = properties.get("mediaType", default_mime)
        if mime_type == default_mime:
            mime_type = extensions.get("mediaType", default_mime)

        # Download URL: absolute links pass through, relative ones are joined
        # onto base_url.
        download_link = _dict_or_empty(attachment_data.get("_links")).get("download")
        if not download_link:
            return None
        link_text = str(download_link)
        root = str(base_url).rstrip("/")
        if link_text.startswith("http"):
            download_url = download_link
        elif link_text.startswith("/"):
            download_url = f"{root}{download_link}"
        else:
            download_url = f"{root}/rest/api/{download_link}"

        # Author and timestamps: prefer `version`, fall back to `history`.
        version = _dict_or_empty(attachment_data.get("version"))
        history = _dict_or_empty(attachment_data.get("history"))

        author = None
        if "by" in version:
            author = _dict_or_empty(version.get("by")).get("displayName")
        elif "createdBy" in history:
            author = _dict_or_empty(history.get("createdBy")).get("displayName")

        created_at = None
        if "createdDate" in history:
            created_at = history.get("createdDate")
        elif "created" in attachment_data:
            created_at = attachment_data.get("created")

        updated_at = None
        if "when" in version:
            updated_at = version.get("when")
        elif "lastModified" in history:
            updated_at = history.get("lastModified")

        return AttachmentMetadata(
            id=attachment_id,
            filename=filename,
            size=file_size,
            mime_type=mime_type,
            download_url=download_url,
            parent_document_id=parent_id,
            created_at=created_at,
            updated_at=updated_at,
            author=author,
        )
    except Exception:
        # Deliberately best-effort: a malformed attachment entry yields None
        # to the caller rather than raising.
        return None