Coverage for src/qdrant_loader_mcp_server/search/models.py: 94%

1"""Search result models."""

3from pydantic import BaseModel

class SearchResult(BaseModel):
    """Search result model with comprehensive metadata.

    Only ``score``, ``text``, ``source_type`` and ``source_title`` are
    required; every other field is optional and defaults to ``None``,
    ``False`` or an empty list.  The ``get_*`` / ``is_*`` helpers turn the
    raw metadata into short display strings or boolean classifications and
    never mutate the instance.

    NOTE: the ``= []`` defaults below rely on pydantic copying field defaults
    for each new instance (unlike plain class attributes) -- see the pydantic
    field documentation.
    """

    # Core result data
    score: float
    text: str
    source_type: str
    source_title: str
    source_url: str | None = None
    file_path: str | None = None
    repo_name: str | None = None

    # Document identification
    document_id: str | None = None
    created_at: str | None = None
    last_modified: str | None = None

    # Project information (for multi-project support)
    project_id: str | None = None
    project_name: str | None = None
    project_description: str | None = None
    collection_name: str | None = None

    # Hierarchy information (primarily for Confluence)
    parent_id: str | None = None
    parent_title: str | None = None
    breadcrumb_text: str | None = None
    depth: int | None = None
    children_count: int | None = None
    hierarchy_context: str | None = None

    # Attachment information (for files attached to documents)
    is_attachment: bool = False
    parent_document_id: str | None = None
    parent_document_title: str | None = None
    attachment_id: str | None = None
    original_filename: str | None = None
    file_size: int | None = None
    mime_type: str | None = None
    attachment_author: str | None = None
    attachment_context: str | None = None

    # Section-level information
    section_title: str | None = None
    section_type: str | None = None  # e.g., "h1", "h2", "content"
    section_level: int | None = None
    section_anchor: str | None = None
    section_breadcrumb: str | None = None
    section_depth: int | None = None

    # Content analysis
    has_code_blocks: bool = False
    has_tables: bool = False
    has_images: bool = False
    has_links: bool = False
    word_count: int | None = None
    char_count: int | None = None
    estimated_read_time: int | None = None  # minutes
    paragraph_count: int | None = None

    # Semantic analysis (NLP results); items may be dicts or plain strings
    entities: list[dict | str] = []
    topics: list[dict | str] = []
    key_phrases: list[dict | str] = []
    pos_tags: list[dict] = []

    # Navigation context
    previous_section: str | None = None
    next_section: str | None = None
    sibling_sections: list[str] = []
    subsections: list[str] = []
    document_hierarchy: list[str] = []

    # Chunking context
    chunk_index: int | None = None
    total_chunks: int | None = None
    chunking_strategy: str | None = None

    # File conversion information
    original_file_type: str | None = None
    conversion_method: str | None = None
    is_excel_sheet: bool = False
    is_converted: bool = False

    # Cross-references and enhanced context
    cross_references: list[dict] = []
    topic_analysis: dict | None = None
    content_type_context: str | None = None  # Human-readable content description

    def get_display_title(self) -> str:
        """Get the display title with enhanced hierarchy context.

        The base title is ``source_title``.  When that is empty or
        whitespace-only, the first non-empty value of the file name from
        ``file_path``, ``repo_name`` and the literal ``"Untitled"`` is used.

        Returns:
            ``"<section_title or base> (<section_breadcrumb>)"`` when a
            section breadcrumb is set; otherwise
            ``"<base> (<breadcrumb_text>)"`` for Confluence results that have
            a breadcrumb; otherwise ``"<base> > <section_title>"`` when a
            section title differs from the base; otherwise the base title.
        """
        base_title = self.source_title
        if not base_title or not base_title.strip():
            file_name = ""
            if self.file_path:
                import os

                # basename() is "" for a path that ends with a separator, so
                # an empty result must fall through to the next candidate
                # instead of becoming an empty title.
                file_name = os.path.basename(self.file_path)
            base_title = file_name or self.repo_name or "Untitled"

        # The section breadcrumb takes priority over the page breadcrumb.
        if self.section_breadcrumb:
            return f"{self.section_title or base_title} ({self.section_breadcrumb})"
        if self.breadcrumb_text and self.source_type == "confluence":
            return f"{base_title} ({self.breadcrumb_text})"
        if self.section_title and self.section_title != base_title:
            return f"{base_title} > {self.section_title}"
        return base_title

    def get_project_info(self) -> str | None:
        """Get formatted project information for display.

        Returns:
            ``None`` when ``project_id`` is unset or empty; otherwise
            ``"Project: <name or id>"`` followed by the optional description
            and collection name.
        """
        if not self.project_id:
            return None

        project_info = f"Project: {self.project_name or self.project_id}"
        if self.project_description:
            project_info += f" - {self.project_description}"
        if self.collection_name:
            project_info += f" (Collection: {self.collection_name})"
        return project_info

    def get_hierarchy_info(self) -> str | None:
        """Get formatted hierarchy information for display.

        Returns:
            ``None`` for any source type other than ``"confluence"`` or when
            no part is available; otherwise the hierarchy context, section
            breadcrumb and 1-based chunk position joined with ``" | "``.
        """
        # Only return hierarchy info for Confluence sources
        if self.source_type != "confluence":
            return None

        parts = []
        if self.hierarchy_context:
            parts.append(self.hierarchy_context)
        if self.section_breadcrumb:
            parts.append(f"Section: {self.section_breadcrumb}")
        if self.chunk_index is not None and self.total_chunks is not None:
            # chunk_index is displayed 1-based
            parts.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")

        return " | ".join(parts) if parts else None

    def get_content_info(self) -> str | None:
        """Get formatted content analysis information.

        Returns:
            ``None`` when none of the ``has_*`` content flags is set;
            otherwise ``"Contains: ..."`` listing the flagged content kinds,
            followed by the word count and estimated read time when those
            are set and non-zero.
        """
        flagged = (
            (self.has_code_blocks, "Code"),
            (self.has_tables, "Tables"),
            (self.has_images, "Images"),
            (self.has_links, "Links"),
        )
        content_parts = [label for is_set, label in flagged if is_set]
        if not content_parts:
            return None

        content_info = f"Contains: {', '.join(content_parts)}"
        if self.word_count:
            content_info += f" | {self.word_count} words"
        if self.estimated_read_time:
            content_info += f" | ~{self.estimated_read_time}min read"
        return content_info

    def get_semantic_info(self) -> str | None:
        """Get formatted semantic analysis information.

        Returns:
            ``None`` when there are no entities, topics or key phrases;
            otherwise the entity count, up to three topic labels (with a
            ``(+N more)`` suffix for the rest) and the key-phrase count,
            joined with ``" | "``.
        """
        parts = []

        if self.entities:
            parts.append(f"{len(self.entities)} entities")

        if self.topics:
            topic_texts = []
            for topic in self.topics[:3]:
                # Dict topics are labelled by their "text" entry, falling
                # back to the dict itself when that key is missing.
                if isinstance(topic, dict):
                    topic = topic.get("text", topic)
                # str() keeps join() from raising TypeError when a label is
                # not a string (e.g. a None or numeric "text" value).
                topic_texts.append(str(topic))

            topic_list = ", ".join(topic_texts)
            if len(self.topics) > 3:
                topic_list += f" (+{len(self.topics) - 3} more)"
            parts.append(f"Topics: {topic_list}")

        if self.key_phrases:
            parts.append(f"{len(self.key_phrases)} key phrases")

        return " | ".join(parts) if parts else None

    def get_navigation_info(self) -> str | None:
        """Get formatted navigation context.

        Returns:
            ``None`` when no navigation data is set; otherwise the previous
            and next section names and the sibling / subsection counts,
            joined with ``" | "``.
        """
        parts = []
        if self.previous_section:
            parts.append(f"Previous: {self.previous_section}")
        if self.next_section:
            parts.append(f"Next: {self.next_section}")
        if self.sibling_sections:
            parts.append(f"{len(self.sibling_sections)} siblings")
        if self.subsections:
            parts.append(f"{len(self.subsections)} subsections")

        return " | ".join(parts) if parts else None

    def is_root_document(self) -> bool:
        """Check if this is a root document (no parent).

        Both ``parent_id`` and ``parent_document_id`` must be ``None``.
        """
        return self.parent_id is None and self.parent_document_id is None

    def has_children(self) -> bool:
        """Check if this document has children.

        True when ``children_count`` is positive or ``subsections`` is
        non-empty.
        """
        has_child_pages = self.children_count is not None and self.children_count > 0
        return has_child_pages or bool(self.subsections)

    def get_attachment_info(self) -> str | None:
        """Get formatted attachment information for display.

        Returns:
            ``attachment_context`` when this result is an attachment and the
            context is non-empty; otherwise ``None``.
        """
        if not self.is_attachment or not self.attachment_context:
            return None
        return self.attachment_context

    def is_file_attachment(self) -> bool:
        """Check if this is a file attachment."""
        return self.is_attachment

    def get_file_type(self) -> str | None:
        """Get the file type from conversion info, MIME type or filename.

        Returns:
            ``original_file_type`` (suffixed with the conversion method when
            the file was converted) if set; otherwise ``mime_type``;
            otherwise the lower-cased extension of ``original_filename``
            without the leading dot; ``None`` when nothing is available.
        """
        if self.original_file_type:
            file_type = self.original_file_type
            if self.is_converted and self.conversion_method:
                file_type += f" (converted via {self.conversion_method})"
            return file_type
        if self.mime_type:
            return self.mime_type
        if self.original_filename:
            import os

            _, ext = os.path.splitext(self.original_filename)
            # A name ending in a bare dot yields ext == ".", which strips to
            # ""; report that as "no file type" rather than an empty string.
            return ext.lower().lstrip(".") or None
        return None

    def belongs_to_project(self, project_id: str) -> bool:
        """Check if this result belongs to a specific project."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Check if this result belongs to any of the specified projects.

        Always False when this result has no ``project_id``.
        """
        return self.project_id is not None and self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Check if this result contains code.

        True when code blocks were detected or the section type is
        ``"code"``.
        """
        return self.has_code_blocks or self.section_type == "code"

    def is_documentation(self) -> bool:
        """Check if this result is documentation content.

        True for ``"confluence"`` / ``"localfile"`` sources that contain no
        code blocks.
        """
        return (
            self.source_type in ["confluence", "localfile"] and not self.has_code_blocks
        )

    def is_structured_data(self) -> bool:
        """Check if this result contains structured data (tables or Excel)."""
        return self.has_tables or self.is_excel_sheet

    def get_section_context(self) -> str | None:
        """Get section context for enhanced display.

        Returns:
            ``None`` when there is no section title; otherwise the title,
            prefixed with ``"[<SECTION_TYPE>] "`` when both section type and
            section level are set, and suffixed with ``" (#<anchor>)"`` when
            an anchor is set.
        """
        if not self.section_title:
            return None

        context = self.section_title
        # NOTE(review): a section_level of 0 is falsy and therefore omits the
        # type prefix -- confirm whether level 0 can occur.
        if self.section_type and self.section_level:
            context = f"[{self.section_type.upper()}] {context}"
        if self.section_anchor:
            context += f" (#{self.section_anchor})"
        return context

    def get_comprehensive_context(self) -> dict[str, str | None]:
        """Get all available context information organized by type.

        Returns:
            A dict with the keys ``project``, ``hierarchy``, ``content``,
            ``semantic``, ``navigation``, ``section``, ``attachment`` and
            ``file_type``, each mapped to the corresponding helper's result
            (``None`` where that helper has nothing to report).
        """
        return {
            "project": self.get_project_info(),
            "hierarchy": self.get_hierarchy_info(),
            "content": self.get_content_info(),
            "semantic": self.get_semantic_info(),
            "navigation": self.get_navigation_info(),
            "section": self.get_section_context(),
            "attachment": self.get_attachment_info(),
            "file_type": self.get_file_type(),
        }

312 }