Coverage for src/qdrant_loader/connectors/shared/attachments/metadata.py: 69%
67 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1from __future__ import annotations
3from typing import Any
5from qdrant_loader.core.attachment_downloader import AttachmentMetadata
def normalize_attachment_metadata(meta: dict, *, parent_id: str) -> AttachmentMetadata:
    """Normalize provider-agnostic metadata to `AttachmentMetadata`.

    Minimal helper for shared usage; connectors can implement richer adapters.

    Args:
        meta: Raw attachment fields. For each target field the first truthy
            value among the listed keys is used:
            ``id`` / ``attachment_id``, ``download_url`` / ``content``,
            ``filename`` / ``name``, ``size``, ``mime_type`` / ``contentType``,
            ``created`` / ``created_at``, ``updated`` / ``updated_at``,
            ``author`` / ``creator``.
        parent_id: Stored unchanged as ``parent_document_id``.

    Returns:
        The populated `AttachmentMetadata`. Missing optional fields fall back
        to ``"unknown"`` (filename), ``0`` (size) and
        ``"application/octet-stream"`` (MIME type); timestamps and author
        fall back to ``None``.

    Raises:
        ValueError: If no usable attachment ID or download URL is present.
            ``int()`` may also raise ``ValueError``/``TypeError`` for a
            non-numeric ``size``.
    """
    # Required fields are validated *before* the object is built, so no
    # placeholder values are ever handed to the constructor and a missing
    # ID/URL is reported ahead of any other conversion problem.
    raw_id = meta.get("id") or meta.get("attachment_id")
    attachment_id = "" if raw_id is None else str(raw_id)
    # The literal "None" is rejected too: it is what a missing value looks
    # like once it has been passed through str().
    if not attachment_id or attachment_id == "None":
        raise ValueError("Attachment ID is required")

    download_raw = meta.get("download_url") or meta.get("content")
    download_url = "" if download_raw is None else str(download_raw)
    if not download_url or download_url == "None":
        raise ValueError("Attachment download_url is required")

    return AttachmentMetadata(
        id=attachment_id,
        filename=meta.get("filename") or meta.get("name") or "unknown",
        size=int(meta.get("size") or 0),
        mime_type=(
            meta.get("mime_type")
            or meta.get("contentType")
            or "application/octet-stream"
        ),
        download_url=download_url,
        parent_document_id=parent_id,
        created_at=meta.get("created") or meta.get("created_at"),
        updated_at=meta.get("updated") or meta.get("updated_at"),
        author=meta.get("author") or meta.get("creator"),
    )
def jira_attachment_to_metadata(att: Any, *, parent_id: str) -> AttachmentMetadata:
    """Build an `AttachmentMetadata` from a JiraAttachment-like object.

    ``id``, ``filename``, ``size``, ``mime_type`` and ``content_url`` are read
    directly from *att* and coerced with ``str``/``int``. ``created`` and
    ``author`` are optional: a present ``created`` is rendered with its
    ``isoformat()`` method, and a present ``author`` contributes its
    ``display_name`` attribute (``None`` if it has none). ``updated_at`` is
    always ``None``.
    """
    # Optional attributes are looked up first, tolerating their absence.
    created_on = getattr(att, "created", None)
    author_obj = getattr(att, "author", None)

    # Required attributes, converted in a fixed order.
    attachment_id = str(att.id)
    file_name = str(att.filename)
    byte_count = int(att.size)
    content_type = str(att.mime_type)
    url = str(att.content_url)

    created_iso = None if created_on is None else created_on.isoformat()
    if author_obj is None:
        author_name = None
    else:
        author_name = getattr(author_obj, "display_name", None)

    return AttachmentMetadata(
        id=attachment_id,
        filename=file_name,
        size=byte_count,
        mime_type=content_type,
        download_url=url,
        parent_document_id=parent_id,
        created_at=created_iso,
        updated_at=None,
        author=author_name,
    )
def _as_mapping(value: Any) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _confluence_download_url(download_link: Any, base_url: str) -> str:
    """Turn a Confluence ``_links.download`` value into an absolute URL."""
    link = str(download_link)
    if link.startswith("http"):
        return link
    # Strip trailing slashes so a base URL such as "https://host/wiki/" does
    # not produce "//" when joined with the link.
    root = base_url.rstrip("/")
    if link.startswith("/"):
        return f"{root}{link}"
    return f"{root}/rest/api/{link}"


def confluence_attachment_to_metadata(
    attachment_data: dict,
    *,
    base_url: str,
    parent_id: str,
) -> AttachmentMetadata | None:
    """Convert raw Confluence attachment JSON to AttachmentMetadata.

    Args:
        attachment_data: One attachment entry. Keys read: ``id``, ``title``,
            ``metadata``, ``extensions``, ``_links``, ``version``,
            ``history`` and ``created``.
        base_url: Prefix for relative download links; trailing slashes are
            ignored.
        parent_id: Stored unchanged as ``parent_document_id``.

    Returns:
        The populated `AttachmentMetadata`, or None if a valid download URL
        cannot be determined or the conversion fails for any other reason.
    """
    default_mime = "application/octet-stream"
    try:
        download_link = _as_mapping(attachment_data.get("_links")).get("download")
        if not download_link:
            return None
        download_url = _confluence_download_url(download_link, base_url)

        # Nested sections may be absent, null or of an unexpected type;
        # treat all of those as "no data" instead of failing the conversion.
        metadata = _as_mapping(attachment_data.get("metadata"))
        extensions = _as_mapping(attachment_data.get("extensions"))
        version = _as_mapping(attachment_data.get("version"))
        history = _as_mapping(attachment_data.get("history"))

        # Size and MIME type: "mediaType" takes precedence over "properties"
        # whenever the key is present, even if it lacks the wanted field.
        file_size = 0
        mime_type = default_mime
        if "mediaType" in metadata:
            media_type = metadata["mediaType"]
            details = _as_mapping(media_type)
            file_size = details.get("size", 0)
            mime_type = details.get("name", default_mime)
            # NOTE(review): the Confluence REST API appears to return
            # metadata.mediaType as a plain string - confirm. Use it
            # directly as the MIME type when that is the case.
            if isinstance(media_type, str) and media_type:
                mime_type = media_type
        elif "properties" in metadata:
            properties = _as_mapping(metadata["properties"])
            file_size = properties.get("size", 0)
            mime_type = properties.get("mediaType", default_mime)

        # Fall back to "extensions" when metadata gave nothing useful.
        if not file_size:
            file_size = extensions.get("fileSize", 0)
        if mime_type == default_mime:
            mime_type = extensions.get("mediaType", mime_type)

        # Author: the version author wins over the original creator.
        author = None
        if "by" in version:
            author = _as_mapping(version["by"]).get("displayName")
        elif "createdBy" in history:
            author = _as_mapping(history["createdBy"]).get("displayName")

        if "createdDate" in history:
            created_at = history["createdDate"]
        else:
            created_at = attachment_data.get("created")

        if "when" in version:
            updated_at = version["when"]
        else:
            updated_at = history.get("lastModified")

        return AttachmentMetadata(
            id=attachment_data.get("id"),
            filename=attachment_data.get("title", "unknown"),
            size=file_size,
            mime_type=mime_type,
            download_url=download_url,
            parent_document_id=parent_id,
            created_at=created_at,
            updated_at=updated_at,
            author=author,
        )
    except Exception:
        # Deliberate best-effort: callers receive None for an attachment that
        # cannot be converted rather than an exception.
        return None