Coverage for src/qdrant_loader_mcp_server/search/hybrid/components/metadata.py: 74%

62 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1from __future__ import annotations 

2 

3import logging 

4from dataclasses import asdict, is_dataclass 

5from typing import Any 

6 

7logger = logging.getLogger(__name__) 

8 

9 

# Keys every flattened result must contain, listed in the order in which
# missing ones are appended to the result.
_EXPECTED_METADATA_KEYS = (
    # Project info
    "project_id",
    "project_name",
    "project_description",
    "collection_name",
    # Hierarchy info
    "parent_id",
    "parent_title",
    "breadcrumb_text",
    "depth",
    "children_count",
    "hierarchy_context",
    # Attachment info
    "is_attachment",
    "parent_document_id",
    "parent_document_title",
    "attachment_id",
    "original_filename",
    "file_size",
    "mime_type",
    "attachment_author",
    "attachment_context",
    # Section info
    "section_title",
    "section_type",
    "section_level",
    "section_anchor",
    "section_breadcrumb",
    "section_depth",
    # Content analysis
    "has_code_blocks",
    "has_tables",
    "has_images",
    "has_links",
    "word_count",
    "char_count",
    "estimated_read_time",
    "paragraph_count",
    # Semantic analysis
    "entities",
    "topics",
    "key_phrases",
    "pos_tags",
    # Navigation context
    "previous_section",
    "next_section",
    "sibling_sections",
    "subsections",
    "document_hierarchy",
    # Chunking context
    "chunk_index",
    "total_chunks",
    "chunking_strategy",
    # Conversion info
    "original_file_type",
    "conversion_method",
    "is_excel_sheet",
    "is_converted",
    # Cross-reference info
    "cross_references",
    "topic_analysis",
    "content_type_context",
)

# Expected keys whose missing-value default is False.
_BOOLEAN_METADATA_KEYS = frozenset(
    {
        "is_attachment",
        "has_code_blocks",
        "has_tables",
        "has_images",
        "has_links",
        "is_excel_sheet",
        "is_converted",
    }
)

# Expected keys whose missing-value default is a fresh empty list.
_LIST_METADATA_KEYS = frozenset(
    {
        "entities",
        "topics",
        "key_phrases",
        "pos_tags",
        "sibling_sections",
        "subsections",
        "document_hierarchy",
        "cross_references",
    }
)


def _component_to_dict(component: Any) -> dict:
    """Return one extracted component as a mapping of its fields.

    Unsupported component types (and ``None``) yield an empty dict so the
    caller can merge the result unconditionally.
    """
    if component is None:
        return {}
    # is_dataclass() is also true for dataclass *types*, on which asdict()
    # raises TypeError, so only convert actual instances here.
    if is_dataclass(component) and not isinstance(component, type):
        return asdict(component)
    # Check dict before __dict__: instances of dict subclasses carry a
    # __dict__ too, and inspecting it would drop the mapping's real contents.
    if isinstance(component, dict):
        return component
    if hasattr(component, "__dict__"):
        # Regular objects: keep public, non-callable attributes only.
        return {
            name: value
            for name, value in vars(component).items()
            if not name.startswith("_") and not callable(value)
        }
    return {}


def extract_metadata_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Extract and flatten metadata using the provided metadata_extractor.

    Calls ``metadata_extractor.extract_all_metadata(metadata)`` and merges the
    fields of every returned component (dataclass instance, dict, or object
    with ``__dict__``) into a single flat dict. When two components provide
    the same key, the value from the first one wins.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_all_metadata`` attribute.
        metadata: Raw metadata passed through to the extractor unchanged.

    Returns:
        ``{}`` if the extractor lacks a callable ``extract_all_metadata`` or
        that call raises. Otherwise the flattened dict, in which every
        expected key is present: missing boolean flags default to ``False``,
        missing collection keys to a new empty list, and all others to
        ``None``.
    """
    # Validate extractor interface defensively before calling into it.
    if not hasattr(metadata_extractor, "extract_all_metadata"):
        logger.warning(
            "Metadata extractor missing 'extract_all_metadata'; returning empty metadata"
        )
        return {}
    extract_callable = metadata_extractor.extract_all_metadata
    if not callable(extract_callable):
        logger.warning(
            "Metadata extractor 'extract_all_metadata' is not callable; returning empty metadata"
        )
        return {}
    try:
        components = extract_callable(metadata)
    except Exception:
        # Log full traceback and return safe default
        logger.exception("Error calling extract_all_metadata; returning empty metadata")
        return {}

    # A non-dict result carries no components; only the defaults are emitted.
    if not isinstance(components, dict):
        components = {}

    flattened: dict[str, Any] = {}
    for component in components.values():
        for key, value in _component_to_dict(component).items():
            # Merge without overwriting keys set by an earlier component.
            flattened.setdefault(key, value)

    for key in _EXPECTED_METADATA_KEYS:
        if key in flattened:
            continue
        if key in _BOOLEAN_METADATA_KEYS:
            flattened[key] = False
        elif key in _LIST_METADATA_KEYS:
            # A new list per key and per call, so results never share state.
            flattened[key] = []
        else:
            flattened[key] = None

    return flattened

153 

154 

def extract_project_info(metadata_extractor: Any, metadata: dict) -> dict:
    """Safely extract project info using a provided extractor.

    Ensures the extractor has a callable ``extract_project_info`` attribute
    and guards against exceptions thrown by it. Always returns a mapping with
    exactly the keys ``project_id``, ``project_name``, ``project_description``
    and ``collection_name``.

    Args:
        metadata_extractor: Object expected to expose a callable
            ``extract_project_info`` attribute.
        metadata: Raw metadata passed through to the extractor unchanged.

    Returns:
        A dict with the four project keys. Each value is taken from the
        extractor's result (a dict, or an object's ``__dict__``) and is
        ``None`` when the result does not provide it, the extractor is
        unusable, or the extractor raises.
    """
    project_keys = (
        "project_id",
        "project_name",
        "project_description",
        "collection_name",
    )
    # Default safe shape: every expected key mapped to None.
    safe_empty: dict[str, Any] = dict.fromkeys(project_keys)

    # Validate extractor interface; a missing attribute resolves to None,
    # which is not callable.
    extract_callable = getattr(metadata_extractor, "extract_project_info", None)
    if not callable(extract_callable):
        return safe_empty

    try:
        project_info = extract_callable(metadata)
    except Exception:
        # Fail closed to the safe shape, but keep the traceback in the logs
        # instead of swallowing the error silently.
        logger.exception(
            "Error calling extract_project_info; returning empty project info"
        )
        return safe_empty

    if not project_info:
        return safe_empty

    if isinstance(project_info, dict):
        data = project_info
    else:
        data = getattr(project_info, "__dict__", None) or {}

    # Read only the known keys. Iterating every key to filter out private
    # names would raise on non-string keys and cannot affect these four.
    return {key: data.get(key) for key in project_keys}