Coverage for src/qdrant_loader_mcp_server/search/models.py: 65%

178 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:38 +0000

1"""Search result models.""" 

2 

3from pydantic import BaseModel 

4 

5 

class SearchResult(BaseModel):
    """Search result model with comprehensive metadata.

    Besides the four required fields (``score``, ``text``, ``source_type``
    and ``source_title``) every attribute is optional and defaults to
    ``None``, ``False`` or an empty list.  The ``get_*`` / ``is_*`` helpers
    turn those raw attributes into short display strings or boolean checks;
    none of them mutates the model.
    """

    score: float
    text: str
    source_type: str
    source_title: str
    source_url: str | None = None
    file_path: str | None = None
    repo_name: str | None = None

    # Project information (for multi-project support)
    project_id: str | None = None
    project_name: str | None = None
    project_description: str | None = None
    collection_name: str | None = None

    # Hierarchy information (primarily for Confluence)
    parent_id: str | None = None
    parent_title: str | None = None
    breadcrumb_text: str | None = None
    depth: int | None = None
    children_count: int | None = None
    hierarchy_context: str | None = None

    # Attachment information (for files attached to documents)
    is_attachment: bool = False
    parent_document_id: str | None = None
    parent_document_title: str | None = None
    attachment_id: str | None = None
    original_filename: str | None = None
    file_size: int | None = None
    mime_type: str | None = None
    attachment_author: str | None = None
    attachment_context: str | None = None

    # Section-level intelligence
    section_title: str | None = None
    section_type: str | None = None  # e.g., "h1", "h2", "content"
    section_level: int | None = None
    section_anchor: str | None = None
    section_breadcrumb: str | None = None
    section_depth: int | None = None

    # Content analysis
    has_code_blocks: bool = False
    has_tables: bool = False
    has_images: bool = False
    has_links: bool = False
    word_count: int | None = None
    char_count: int | None = None
    estimated_read_time: int | None = None  # minutes
    paragraph_count: int | None = None

    # Semantic analysis (NLP results)
    # NOTE: pydantic copies field defaults for each instance, so the bare
    # ``[]`` literals below are not shared between models.
    entities: list[dict | str] = []  # Handle both dict and string formats
    topics: list[dict | str] = []  # Handle both dict and string formats
    key_phrases: list[dict | str] = []
    pos_tags: list[dict] = []

    # Navigation context
    previous_section: str | None = None
    next_section: str | None = None
    sibling_sections: list[str] = []
    subsections: list[str] = []
    document_hierarchy: list[str] = []

    # Chunking context
    chunk_index: int | None = None
    total_chunks: int | None = None
    chunking_strategy: str | None = None

    # File conversion intelligence
    original_file_type: str | None = None
    conversion_method: str | None = None
    is_excel_sheet: bool = False
    is_converted: bool = False

    # Cross-references and enhanced context
    cross_references: list[dict] = []
    topic_analysis: dict | None = None
    content_type_context: str | None = None  # Human-readable content description

    def get_display_title(self) -> str:
        """Get the display title with enhanced hierarchy context.

        Precedence:
            1. ``section_breadcrumb`` set: section title (or source title when
               there is no section title) followed by the breadcrumb in
               parentheses.
            2. Confluence result with ``breadcrumb_text``: source title
               followed by that breadcrumb in parentheses.
            3. A section title that differs from the source title:
               ``"<source_title> > <section_title>"``.
            4. Otherwise the plain source title.
        """
        if self.section_breadcrumb:
            return f"{self.section_title or self.source_title} ({self.section_breadcrumb})"
        elif self.breadcrumb_text and self.source_type == "confluence":
            return f"{self.source_title} ({self.breadcrumb_text})"
        elif self.section_title and self.section_title != self.source_title:
            return f"{self.source_title} > {self.section_title}"
        return self.source_title

    def get_project_info(self) -> str | None:
        """Get formatted project information for display.

        Returns:
            ``None`` when ``project_id`` is empty; otherwise a string starting
            with ``"Project: <name or id>"``, extended with the description
            and the collection name when those are set.
        """
        if not self.project_id:
            return None

        project_info = f"Project: {self.project_name or self.project_id}"
        if self.project_description:
            project_info += f" - {self.project_description}"
        if self.collection_name:
            project_info += f" (Collection: {self.collection_name})"
        return project_info

    def get_hierarchy_info(self) -> str | None:
        """Get formatted hierarchy information for display.

        Returns:
            ``None`` for non-Confluence sources or when nothing is available;
            otherwise the hierarchy context, section breadcrumb and chunk
            position joined with ``" | "``.
        """
        # Only return hierarchy info for Confluence sources
        if self.source_type != "confluence":
            return None

        parts = []

        if self.hierarchy_context:
            parts.append(self.hierarchy_context)

        if self.section_breadcrumb:
            parts.append(f"Section: {self.section_breadcrumb}")

        # Explicit None checks so that a chunk_index of 0 is still reported;
        # the index is shown incremented by one.
        if self.chunk_index is not None and self.total_chunks is not None:
            parts.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")

        return " | ".join(parts) if parts else None

    def get_content_info(self) -> str | None:
        """Get formatted content analysis information.

        Returns:
            ``None`` when none of the ``has_*`` flags is set; otherwise a
            ``"Contains: ..."`` summary, followed by the word count and the
            estimated read time when those are truthy.
        """
        if not any([self.has_code_blocks, self.has_tables, self.has_images, self.has_links]):
            return None

        content_parts = []
        if self.has_code_blocks:
            content_parts.append("Code")
        if self.has_tables:
            content_parts.append("Tables")
        if self.has_images:
            content_parts.append("Images")
        if self.has_links:
            content_parts.append("Links")

        content_info = f"Contains: {', '.join(content_parts)}"

        if self.word_count:
            content_info += f" | {self.word_count} words"
        if self.estimated_read_time:
            content_info += f" | ~{self.estimated_read_time}min read"

        return content_info

    def get_semantic_info(self) -> str | None:
        """Get formatted semantic analysis information.

        Returns:
            ``None`` when there are no entities, topics or key phrases;
            otherwise the entity count, up to three topic labels (with a
            ``(+N more)`` suffix) and the key-phrase count joined with
            ``" | "``.
        """
        parts = []

        if self.entities:
            entity_count = len(self.entities)
            parts.append(f"{entity_count} entities")

        if self.topics:
            # ``topics`` is declared as list[dict | str]; str.join() only
            # accepts strings, so every entry is rendered to a label first.
            topic_labels = []
            for topic in self.topics[:3]:  # Show first 3 topics
                if isinstance(topic, str):
                    topic_labels.append(topic)
                elif isinstance(topic, dict):
                    # NOTE(review): "text" is assumed to be the label key of
                    # dict-shaped topics -- confirm against the producer.
                    # Falls back to the dict's string form when it is absent.
                    topic_labels.append(str(topic.get("text", topic)))
                else:
                    topic_labels.append(str(topic))
            topic_list = ", ".join(topic_labels)
            if len(self.topics) > 3:
                topic_list += f" (+{len(self.topics) - 3} more)"
            parts.append(f"Topics: {topic_list}")

        if self.key_phrases:
            phrase_count = len(self.key_phrases)
            parts.append(f"{phrase_count} key phrases")

        return " | ".join(parts) if parts else None

    def get_navigation_info(self) -> str | None:
        """Get formatted navigation context.

        Returns:
            ``None`` when no navigation data is set; otherwise the previous
            and next section names plus sibling and subsection counts joined
            with ``" | "``.
        """
        parts = []

        if self.previous_section:
            parts.append(f"Previous: {self.previous_section}")
        if self.next_section:
            parts.append(f"Next: {self.next_section}")
        if self.sibling_sections:
            sibling_count = len(self.sibling_sections)
            parts.append(f"{sibling_count} siblings")
        if self.subsections:
            subsection_count = len(self.subsections)
            parts.append(f"{subsection_count} subsections")

        return " | ".join(parts) if parts else None

    def is_root_document(self) -> bool:
        """Check if this is a root document (no parent)."""
        return self.parent_id is None and self.parent_document_id is None

    def has_children(self) -> bool:
        """Check if this document has children.

        True when ``children_count`` is a positive number or when
        ``subsections`` is non-empty.
        """
        return (self.children_count is not None and self.children_count > 0) or bool(self.subsections)

    def get_attachment_info(self) -> str | None:
        """Get formatted attachment information for display.

        Returns:
            ``attachment_context`` when this result is an attachment and the
            context is non-empty, otherwise ``None``.
        """
        if not self.is_attachment or not self.attachment_context:
            return None
        return self.attachment_context

    def is_file_attachment(self) -> bool:
        """Check if this is a file attachment."""
        return self.is_attachment

    def get_file_type(self) -> str | None:
        """Get the file type from conversion info, MIME type or filename.

        Precedence:
            1. ``original_file_type``, suffixed with
               ``" (converted via <method>)"`` when the result is marked as
               converted and a conversion method is set.
            2. ``mime_type``.
            3. The lower-cased extension of ``original_filename`` without the
               leading dot.

        Returns:
            The file type string, or ``None`` when none of the above yields
            a value.
        """
        if self.original_file_type:
            file_type = self.original_file_type
            if self.is_converted and self.conversion_method:
                file_type += f" (converted via {self.conversion_method})"
            return file_type
        elif self.mime_type:
            return self.mime_type
        elif self.original_filename:
            # Extract extension from filename
            import os

            _, ext = os.path.splitext(self.original_filename)
            return ext.lower().lstrip(".") if ext else None
        return None

    def belongs_to_project(self, project_id: str) -> bool:
        """Check if this result belongs to a specific project."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Check if this result belongs to any of the specified projects.

        A result without a ``project_id`` never matches.
        """
        return self.project_id is not None and self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Check if this result contains code.

        True when ``has_code_blocks`` is set or ``section_type`` is ``"code"``.
        """
        return self.has_code_blocks or self.section_type == "code"

    def is_documentation(self) -> bool:
        """Check if this result is documentation content.

        True for ``"confluence"`` and ``"localfile"`` sources that have no
        code blocks.
        """
        return self.source_type in ["confluence", "localfile"] and not self.has_code_blocks

    def is_structured_data(self) -> bool:
        """Check if this result contains structured data.

        True when the content has tables or comes from an Excel sheet.
        """
        return self.has_tables or self.is_excel_sheet

    def get_section_context(self) -> str | None:
        """Get section context for enhanced display.

        Returns:
            ``None`` when there is no section title; otherwise the title,
            prefixed with ``"[<SECTION_TYPE>] "`` when both ``section_type``
            and ``section_level`` are truthy, and suffixed with
            ``" (#<anchor>)"`` when ``section_anchor`` is set.
        """
        if not self.section_title:
            return None

        context = self.section_title
        if self.section_type and self.section_level:
            context = f"[{self.section_type.upper()}] {context}"
        if self.section_anchor:
            context += f" (#{self.section_anchor})"

        return context

    def get_comprehensive_context(self) -> dict[str, str | None]:
        """Get all available context information organized by type.

        Returns:
            A dict with the keys ``project``, ``hierarchy``, ``content``,
            ``semantic``, ``navigation``, ``section``, ``attachment`` and
            ``file_type``; each value is the result of the matching helper
            method and may be ``None``.
        """
        return {
            "project": self.get_project_info(),
            "hierarchy": self.get_hierarchy_info(),
            "content": self.get_content_info(),
            "semantic": self.get_semantic_info(),
            "navigation": self.get_navigation_info(),
            "section": self.get_section_context(),
            "attachment": self.get_attachment_info(),
            "file_type": self.get_file_type(),
        }