Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/metadata.py: 74%

1from __future__ import annotations

3import logging

4from dataclasses import asdict, is_dataclass

5from typing import Any

7logger = logging.getLogger(__name__)

# Keys every flattened metadata mapping is guaranteed to contain, in output order.
_EXPECTED_METADATA_KEYS: tuple = (
    # Project info
    "project_id",
    "project_name",
    "project_description",
    "collection_name",
    # Hierarchy info
    "parent_id",
    "parent_title",
    "breadcrumb_text",
    "depth",
    "children_count",
    "hierarchy_context",
    # Attachment info
    "is_attachment",
    "parent_document_id",
    "parent_document_title",
    "attachment_id",
    "original_filename",
    "file_size",
    "mime_type",
    "attachment_author",
    "attachment_context",
    # Section info
    "section_title",
    "section_type",
    "section_level",
    "section_anchor",
    "section_breadcrumb",
    "section_depth",
    # Content analysis
    "has_code_blocks",
    "has_tables",
    "has_images",
    "has_links",
    "word_count",
    "char_count",
    "estimated_read_time",
    "paragraph_count",
    # Semantic analysis
    "entities",
    "topics",
    "key_phrases",
    "pos_tags",
    # Navigation context
    "previous_section",
    "next_section",
    "sibling_sections",
    "subsections",
    "document_hierarchy",
    # Chunking context
    "chunk_index",
    "total_chunks",
    "chunking_strategy",
    # Conversion info
    "original_file_type",
    "conversion_method",
    "is_excel_sheet",
    "is_converted",
    # Cross-reference info
    "cross_references",
    "topic_analysis",
    "content_type_context",
)

# Expected keys whose missing-value default is ``False``.
_FALSE_DEFAULT_KEYS: frozenset = frozenset(
    {
        "is_attachment",
        "has_code_blocks",
        "has_tables",
        "has_images",
        "has_links",
        "is_excel_sheet",
        "is_converted",
    }
)

# Expected keys whose missing-value default is a fresh empty list.
_LIST_DEFAULT_KEYS: frozenset = frozenset(
    {
        "entities",
        "topics",
        "key_phrases",
        "pos_tags",
        "sibling_sections",
        "subsections",
        "document_hierarchy",
        "cross_references",
    }
)


def _component_to_dict(component: Any) -> dict[str, Any]:
    """Return the key/value view of one metadata component ({} if unsupported)."""
    if component is None:
        return {}
    # Dictionaries are checked first: dict subclasses such as OrderedDict or
    # Counter also expose ``__dict__``, and inspecting that instead of their
    # items would silently drop their contents.
    if isinstance(component, dict):
        return component
    if is_dataclass(component):
        # is_dataclass() is also True for the dataclass *class* itself, which
        # asdict() rejects with TypeError; only instances can be converted.
        if isinstance(component, type):
            return {}
        return asdict(component)
    # Regular objects: expose public, non-callable instance attributes.
    if hasattr(component, "__dict__"):
        return {
            name: value
            for name, value in vars(component).items()
            if not name.startswith("_") and not callable(value)
        }
    # Unsupported component type (e.g. int, str, list): contributes nothing.
    return {}


def extract_metadata_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Extract and flatten metadata using the provided metadata_extractor.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_all_metadata(metadata)`` returning a dict of components.
            Each component may be a dict, a dataclass instance, or a plain
            object with instance attributes; ``None`` and other types are
            skipped.
        metadata: Raw metadata passed through to the extractor unchanged.

    Returns:
        A flat dict merging all component fields (the first component to
        provide a key wins) and guaranteed to contain every expected key.
        Missing keys default to ``False`` for boolean flags, a new empty list
        for collection fields, and ``None`` otherwise. An empty dict is
        returned when the extractor interface is missing, not callable, or
        raises.
    """
    # Validate extractor interface defensively (mirrors extract_project_info approach)
    if not hasattr(metadata_extractor, "extract_all_metadata"):
        logger.warning(
            "Metadata extractor missing 'extract_all_metadata'; returning empty metadata"
        )
        return {}

    extract_callable = metadata_extractor.extract_all_metadata
    if not callable(extract_callable):
        logger.warning(
            "Metadata extractor 'extract_all_metadata' is not callable; returning empty metadata"
        )
        return {}

    try:
        components = extract_callable(metadata)
    except Exception:
        # Log full traceback and return safe default
        logger.exception("Error calling extract_all_metadata; returning empty metadata")
        return {}

    # Anything other than a dict of components is treated as "no components".
    if not isinstance(components, dict):
        components = {}

    flattened: dict[str, Any] = {}
    for component in components.values():
        # Merge without overwriting keys set by an earlier component.
        for key, value in _component_to_dict(component).items():
            flattened.setdefault(key, value)

    # Guarantee the full output shape with type-appropriate defaults.
    for key in _EXPECTED_METADATA_KEYS:
        if key in flattened:
            continue
        if key in _FALSE_DEFAULT_KEYS:
            flattened[key] = False
        elif key in _LIST_DEFAULT_KEYS:
            # A new list per key and per call so callers never share state.
            flattened[key] = []
        else:
            flattened[key] = None

    return flattened

153

154

def extract_project_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Safely extract project info using a provided extractor.

    Ensures the extractor has a callable ``extract_project_info`` attribute and
    guards against exceptions thrown by the extractor.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_project_info(metadata)``.
        metadata: Raw metadata passed through to the extractor unchanged.

    Returns:
        A new dict with exactly the keys ``project_id``, ``project_name``,
        ``project_description`` and ``collection_name``. Values come from the
        extractor result (a dict, or an object's instance attributes) and are
        ``None`` when absent, when the result is falsy, or when the extractor
        is unusable or raises.
    """
    project_keys = (
        "project_id",
        "project_name",
        "project_description",
        "collection_name",
    )
    # Default safe shape
    safe_empty: dict[str, Any] = dict.fromkeys(project_keys)

    # Validate extractor interface
    if not hasattr(metadata_extractor, "extract_project_info"):
        return safe_empty
    extract_callable = metadata_extractor.extract_project_info
    if not callable(extract_callable):
        return safe_empty

    try:
        project_info = extract_callable(metadata)
    except Exception:
        # Fail closed to the safe shape, but keep the traceback visible in the
        # logs (same handling as extract_metadata_info) rather than hiding it.
        logger.exception(
            "Error calling extract_project_info; returning empty project info"
        )
        return safe_empty

    if not project_info:
        return safe_empty

    if isinstance(project_info, dict):
        data = project_info
    else:
        # Objects without an instance __dict__ contribute no values.
        data = getattr(project_info, "__dict__", {}) or {}

    # Only the four public keys are read, so private or non-string keys in the
    # source mapping are simply ignored and can never cause a failure here.
    return {key: data.get(key) for key in project_keys}

197 }