Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/metadata.py: 74%
62 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1from __future__ import annotations
3import logging
4from dataclasses import asdict, is_dataclass
5from typing import Any
7logger = logging.getLogger(__name__)
# Keys that callers can rely on being present in the flattened result, in the
# order in which missing ones are appended.
_EXPECTED_METADATA_KEYS = (
    # Project info
    "project_id",
    "project_name",
    "project_description",
    "collection_name",
    # Hierarchy info
    "parent_id",
    "parent_title",
    "breadcrumb_text",
    "depth",
    "children_count",
    "hierarchy_context",
    # Attachment info
    "is_attachment",
    "parent_document_id",
    "parent_document_title",
    "attachment_id",
    "original_filename",
    "file_size",
    "mime_type",
    "attachment_author",
    "attachment_context",
    # Section info
    "section_title",
    "section_type",
    "section_level",
    "section_anchor",
    "section_breadcrumb",
    "section_depth",
    # Content analysis
    "has_code_blocks",
    "has_tables",
    "has_images",
    "has_links",
    "word_count",
    "char_count",
    "estimated_read_time",
    "paragraph_count",
    # Semantic analysis
    "entities",
    "topics",
    "key_phrases",
    "pos_tags",
    # Navigation context
    "previous_section",
    "next_section",
    "sibling_sections",
    "subsections",
    "document_hierarchy",
    # Chunking context
    "chunk_index",
    "total_chunks",
    "chunking_strategy",
    # Conversion info
    "original_file_type",
    "conversion_method",
    "is_excel_sheet",
    "is_converted",
    # Cross-reference info
    "cross_references",
    "topic_analysis",
    "content_type_context",
)

# Expected keys whose default is False when no component supplies them.
_BOOLEAN_METADATA_KEYS = frozenset(
    {
        "is_attachment",
        "has_code_blocks",
        "has_tables",
        "has_images",
        "has_links",
        "is_excel_sheet",
        "is_converted",
    }
)

# Expected keys whose default is a fresh empty list when no component supplies them.
_LIST_METADATA_KEYS = frozenset(
    {
        "entities",
        "topics",
        "key_phrases",
        "pos_tags",
        "sibling_sections",
        "subsections",
        "document_hierarchy",
        "cross_references",
    }
)


def _component_to_dict(component: Any) -> dict:
    """Convert one extracted component to a dict; unsupported values give ``{}``."""
    if component is None:
        return {}
    # is_dataclass() is also true for dataclass *types*, but asdict() only
    # accepts instances and raises TypeError otherwise. Restrict this branch to
    # instances; a class object falls through to the generic attribute branch.
    if is_dataclass(component) and not isinstance(component, type):
        return asdict(component)
    # Regular objects: expose public, non-callable attributes only.
    if hasattr(component, "__dict__"):
        return {
            name: value
            for name, value in vars(component).items()
            if not name.startswith("_") and not callable(value)
        }
    if isinstance(component, dict):
        return component
    # Fallback: unsupported component types contribute nothing.
    return {}


def extract_metadata_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Extract and flatten metadata using the provided metadata_extractor.

    Calls ``metadata_extractor.extract_all_metadata(metadata)`` and treats the
    result as a mapping of component name to component. Each component
    (dataclass instance, object with ``__dict__``, or dict) is converted to a
    dict and merged into a single flat dict. When several components provide
    the same key, the first one encountered wins. A non-dict result from the
    extractor is treated as "no components".

    Every expected key that is still missing afterwards is filled with a
    default: ``False`` for boolean flags, a fresh ``[]`` for list-valued keys,
    and ``None`` for everything else.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_all_metadata`` attribute.
        metadata: Raw metadata, passed to the extractor unchanged.

    Returns:
        The flattened dict including all expected keys. An empty dict is
        returned (after logging) when ``extract_all_metadata`` is missing, is
        not callable, or raises.
    """
    # Validate extractor interface defensively (mirrors extract_project_info approach)
    if not hasattr(metadata_extractor, "extract_all_metadata"):
        logger.warning(
            "Metadata extractor missing 'extract_all_metadata'; returning empty metadata"
        )
        return {}

    extract_callable = metadata_extractor.extract_all_metadata
    if not callable(extract_callable):
        logger.warning(
            "Metadata extractor 'extract_all_metadata' is not callable; returning empty metadata"
        )
        return {}

    try:
        components = extract_callable(metadata)
    except Exception:
        # Log full traceback and return safe default
        logger.exception("Error calling extract_all_metadata; returning empty metadata")
        return {}

    if not isinstance(components, dict):
        components = {}

    flattened: dict[str, Any] = {}
    for component in components.values():
        # Merge without overwriting keys set by an earlier component.
        for key, value in _component_to_dict(component).items():
            flattened.setdefault(key, value)

    for key in _EXPECTED_METADATA_KEYS:
        if key in flattened:
            continue
        if key in _BOOLEAN_METADATA_KEYS:
            flattened[key] = False
        elif key in _LIST_METADATA_KEYS:
            # A new list per key so callers mutating one default do not affect another.
            flattened[key] = []
        else:
            flattened[key] = None

    return flattened
def extract_project_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Safely extract project info using a provided extractor.

    Ensures the extractor has a callable ``extract_project_info`` attribute and
    guards against exceptions thrown by it. The extractor result may be a dict
    or an object whose ``__dict__`` holds the values; anything else (or a falsy
    result) is treated as empty.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_project_info`` attribute.
        metadata: Raw metadata, passed to the extractor unchanged.

    Returns:
        A dict with exactly the keys ``project_id``, ``project_name``,
        ``project_description`` and ``collection_name``; values the extractor
        did not supply are ``None``.
    """
    # Default safe shape
    safe_empty: dict[str, Any] = {
        "project_id": None,
        "project_name": None,
        "project_description": None,
        "collection_name": None,
    }

    # Validate extractor interface: a missing attribute and a non-callable one
    # both fall back to the safe shape.
    extract_callable = getattr(metadata_extractor, "extract_project_info", None)
    if not callable(extract_callable):
        return safe_empty

    try:
        project_info = extract_callable(metadata)
    except Exception:
        # Fail closed to the safe shape, but keep the traceback visible in the
        # logs (same handling as extract_metadata_info).
        logger.exception(
            "Error calling extract_project_info; returning empty project info"
        )
        return safe_empty

    if isinstance(project_info, dict):
        data = project_info
    elif project_info:
        data = getattr(project_info, "__dict__", None) or {}
    else:
        data = {}

    # Only the four public keys below are read, so no private-key filtering is
    # needed; looking them up directly also tolerates non-string keys in the
    # extractor's result.
    return {key: data.get(key) for key in safe_empty}