Coverage for src/qdrant_loader_mcp_server/search/models.py: 94%

196 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:06 +0000

1"""Search result models.""" 

2 

3from pydantic import BaseModel 

4 

5 

class SearchResult(BaseModel):
    """Search result model with comprehensive metadata.

    Only ``score``, ``text``, ``source_type`` and ``source_title`` are
    required; every other field is optional metadata with a neutral default
    (``None``, ``False`` or an empty list).  The ``get_*`` / ``is_*`` helper
    methods derive display strings and simple classifications from those
    fields and never mutate the model.

    Note: the list-valued fields use literal ``[]`` defaults.  This relies on
    pydantic copying mutable field defaults for each instance, so the lists
    are not shared between results.
    """

    # Core result payload
    score: float
    text: str
    source_type: str
    source_title: str
    source_url: str | None = None
    file_path: str | None = None
    repo_name: str | None = None

    # Document identification
    document_id: str | None = None
    created_at: str | None = None
    last_modified: str | None = None

    # Project information (for multi-project support)
    project_id: str | None = None
    project_name: str | None = None
    project_description: str | None = None
    collection_name: str | None = None

    # Hierarchy information (primarily for Confluence)
    parent_id: str | None = None
    parent_title: str | None = None
    breadcrumb_text: str | None = None
    depth: int | None = None
    children_count: int | None = None
    hierarchy_context: str | None = None

    # Attachment information (for files attached to documents)
    is_attachment: bool = False
    parent_document_id: str | None = None
    parent_document_title: str | None = None
    attachment_id: str | None = None
    original_filename: str | None = None
    file_size: int | None = None
    mime_type: str | None = None
    attachment_author: str | None = None
    attachment_context: str | None = None

    # Section-level intelligence
    section_title: str | None = None
    section_type: str | None = None  # e.g., "h1", "h2", "content"
    section_level: int | None = None
    section_anchor: str | None = None
    section_breadcrumb: str | None = None
    section_depth: int | None = None

    # Content analysis
    has_code_blocks: bool = False
    has_tables: bool = False
    has_images: bool = False
    has_links: bool = False
    word_count: int | None = None
    char_count: int | None = None
    estimated_read_time: int | None = None  # minutes
    paragraph_count: int | None = None

    # Semantic analysis (NLP results)
    entities: list[dict | str] = []  # Handle both dict and string formats
    topics: list[dict | str] = []  # Handle both dict and string formats
    key_phrases: list[dict | str] = []
    pos_tags: list[dict] = []

    # Navigation context
    previous_section: str | None = None
    next_section: str | None = None
    sibling_sections: list[str] = []
    subsections: list[str] = []
    document_hierarchy: list[str] = []

    # Chunking context
    chunk_index: int | None = None
    total_chunks: int | None = None
    chunking_strategy: str | None = None

    # File conversion intelligence
    original_file_type: str | None = None
    conversion_method: str | None = None
    is_excel_sheet: bool = False
    is_converted: bool = False

    # Cross-references and enhanced context
    cross_references: list[dict] = []
    topic_analysis: dict | None = None
    content_type_context: str | None = None  # Human-readable content description

    def get_display_title(self) -> str:
        """Get the display title with enhanced hierarchy context.

        The base title is ``source_title``; when that is empty or only
        whitespace it falls back, in order, to the basename of ``file_path``,
        to ``repo_name``, and finally to ``"Untitled"``.

        Returns:
            The base title decorated with the most specific context
            available: the section breadcrumb, the Confluence breadcrumb, or
            the section title.  The plain base title if none applies.
        """
        base_title = self.source_title
        if not base_title or not base_title.strip():
            import os

            # os.path.basename() yields "" for paths ending in a separator;
            # keep falling back in that case instead of returning an empty
            # title.
            from_path = os.path.basename(self.file_path) if self.file_path else ""
            base_title = from_path or self.repo_name or "Untitled"

        # Prefer the section breadcrumb: it gives the most precise location.
        if self.section_breadcrumb:
            return f"{self.section_title or base_title} ({self.section_breadcrumb})"
        if self.breadcrumb_text and self.source_type == "confluence":
            return f"{base_title} ({self.breadcrumb_text})"
        if self.section_title and self.section_title != base_title:
            return f"{base_title} > {self.section_title}"
        return base_title

    def get_project_info(self) -> str | None:
        """Get formatted project information for display.

        Returns:
            ``None`` when ``project_id`` is unset or empty; otherwise a string
            starting with ``"Project: <name or id>"`` followed by the optional
            description and collection name.
        """
        if not self.project_id:
            return None

        project_info = f"Project: {self.project_name or self.project_id}"
        if self.project_description:
            project_info += f" - {self.project_description}"
        if self.collection_name:
            project_info += f" (Collection: {self.collection_name})"
        return project_info

    def get_hierarchy_info(self) -> str | None:
        """Get formatted hierarchy information for display.

        Returns:
            ``None`` for non-Confluence sources or when no hierarchy data is
            present; otherwise the available parts (hierarchy context,
            section breadcrumb, 1-based chunk position) joined by ``" | "``.
        """
        # Only return hierarchy info for Confluence sources
        if self.source_type != "confluence":
            return None

        parts = []
        if self.hierarchy_context:
            parts.append(self.hierarchy_context)
        if self.section_breadcrumb:
            parts.append(f"Section: {self.section_breadcrumb}")
        if self.chunk_index is not None and self.total_chunks is not None:
            # chunk_index is shown 1-based.
            parts.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")

        return " | ".join(parts) if parts else None

    def get_content_info(self) -> str | None:
        """Get formatted content analysis information.

        Returns:
            ``None`` when none of the ``has_*`` flags is set; otherwise
            ``"Contains: ..."`` listing the flagged content kinds, followed by
            the word count and estimated read time when those are non-zero.
        """
        flagged = (
            (self.has_code_blocks, "Code"),
            (self.has_tables, "Tables"),
            (self.has_images, "Images"),
            (self.has_links, "Links"),
        )
        content_parts = [label for present, label in flagged if present]
        if not content_parts:
            return None

        content_info = f"Contains: {', '.join(content_parts)}"
        if self.word_count:
            content_info += f" | {self.word_count} words"
        if self.estimated_read_time:
            content_info += f" | ~{self.estimated_read_time}min read"
        return content_info

    def get_semantic_info(self) -> str | None:
        """Get formatted semantic analysis information.

        Returns:
            ``None`` when there are no entities, topics or key phrases;
            otherwise a ``" | "``-joined summary: entity count, up to three
            topic labels (with a "+N more" suffix), and key-phrase count.
        """
        parts = []

        if self.entities:
            parts.append(f"{len(self.entities)} entities")

        if self.topics:
            # Topics may be plain strings or dicts carrying a "text" entry.
            topic_texts = []
            for topic in self.topics[:3]:
                if isinstance(topic, str):
                    topic_texts.append(topic)
                    continue
                label = topic.get("text") if isinstance(topic, dict) else None
                # Anything without a string label is rendered via str() so
                # that the join below cannot fail on a non-string item.
                topic_texts.append(label if isinstance(label, str) else str(topic))

            topic_list = ", ".join(topic_texts)
            if len(self.topics) > 3:
                topic_list += f" (+{len(self.topics) - 3} more)"
            parts.append(f"Topics: {topic_list}")

        if self.key_phrases:
            parts.append(f"{len(self.key_phrases)} key phrases")

        return " | ".join(parts) if parts else None

    def get_navigation_info(self) -> str | None:
        """Get formatted navigation context.

        Returns:
            ``None`` when no navigation data is present; otherwise the
            previous/next section titles and sibling/subsection counts joined
            by ``" | "``.
        """
        parts = []

        if self.previous_section:
            parts.append(f"Previous: {self.previous_section}")
        if self.next_section:
            parts.append(f"Next: {self.next_section}")
        if self.sibling_sections:
            parts.append(f"{len(self.sibling_sections)} siblings")
        if self.subsections:
            parts.append(f"{len(self.subsections)} subsections")

        return " | ".join(parts) if parts else None

    def is_root_document(self) -> bool:
        """Check if this is a root document (no parent)."""
        return self.parent_id is None and self.parent_document_id is None

    def has_children(self) -> bool:
        """Check if this document has children.

        True when ``children_count`` is positive or ``subsections`` is
        non-empty.
        """
        return (self.children_count is not None and self.children_count > 0) or bool(
            self.subsections
        )

    def get_attachment_info(self) -> str | None:
        """Get formatted attachment information for display.

        Returns:
            ``attachment_context`` when this result is an attachment and the
            context is non-empty; otherwise ``None``.
        """
        if not self.is_attachment or not self.attachment_context:
            return None
        return self.attachment_context

    def is_file_attachment(self) -> bool:
        """Check if this is a file attachment."""
        return self.is_attachment

    def get_file_type(self) -> str | None:
        """Get the file type from MIME type or filename.

        Resolution order: ``original_file_type`` (annotated with the
        conversion method when the file was converted), then ``mime_type``,
        then the lower-cased extension of ``original_filename``.

        Returns:
            The file type string, or ``None`` when none can be determined.
        """
        if self.original_file_type:
            file_type = self.original_file_type
            if self.is_converted and self.conversion_method:
                file_type += f" (converted via {self.conversion_method})"
            return file_type
        if self.mime_type:
            return self.mime_type
        if self.original_filename:
            import os

            _, ext = os.path.splitext(self.original_filename)
            # A bare trailing dot ("name.") gives ext == "."; report None
            # rather than an empty file type in that case.
            return ext.lower().lstrip(".") or None
        return None

    def belongs_to_project(self, project_id: str) -> bool:
        """Check if this result belongs to a specific project."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Check if this result belongs to any of the specified projects.

        A result without a ``project_id`` never matches.
        """
        return self.project_id is not None and self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Check if this result contains code."""
        return self.has_code_blocks or self.section_type == "code"

    def is_documentation(self) -> bool:
        """Check if this result is documentation content.

        True for ``confluence`` or ``localfile`` sources without code blocks.
        """
        return (
            self.source_type in ["confluence", "localfile"] and not self.has_code_blocks
        )

    def is_structured_data(self) -> bool:
        """Check if this result contains structured data."""
        return self.has_tables or self.is_excel_sheet

    def get_section_context(self) -> str | None:
        """Get section context for enhanced display.

        Returns:
            ``None`` when there is no section title; otherwise the title,
            prefixed with ``[TYPE]`` when both ``section_type`` and a
            non-zero ``section_level`` are set, and suffixed with
            ``(#anchor)`` when an anchor is available.
        """
        if not self.section_title:
            return None

        context = self.section_title
        # The type tag is shown only when a (truthy) level is known as well.
        if self.section_type and self.section_level:
            context = f"[{self.section_type.upper()}] {context}"
        if self.section_anchor:
            context += f" (#{self.section_anchor})"

        return context

    def get_comprehensive_context(self) -> dict[str, str | None]:
        """Get all available context information organized by type.

        Returns:
            A dict mapping each context category to the string produced by
            the corresponding ``get_*`` helper, or ``None`` where that helper
            has nothing to report.
        """
        return {
            "project": self.get_project_info(),
            "hierarchy": self.get_hierarchy_info(),
            "content": self.get_content_info(),
            "semantic": self.get_semantic_info(),
            "navigation": self.get_navigation_info(),
            "section": self.get_section_context(),
            "attachment": self.get_attachment_info(),
            "file_type": self.get_file_type(),
        }

310 "attachment": self.get_attachment_info(), 

311 "file_type": self.get_file_type(), 

312 }