Coverage for src/qdrant_loader_mcp_server/search/models.py: 94%
196 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:06 +0000
1"""Search result models."""
3from pydantic import BaseModel
class SearchResult(BaseModel):
    """One search hit together with the metadata used to present it.

    Only ``score``, ``text``, ``source_type`` and ``source_title`` are
    required; every other field is optional and defaults to ``None``,
    ``False`` or an empty list. The methods are read-only helpers that turn
    the stored fields into display strings or boolean classifications.
    """

    # Required core fields.
    score: float
    text: str
    source_type: str
    source_title: str
    source_url: str | None = None
    file_path: str | None = None
    repo_name: str | None = None

    # Document identity and timestamps.
    document_id: str | None = None
    created_at: str | None = None
    last_modified: str | None = None

    # Project the result belongs to (multi-project support).
    project_id: str | None = None
    project_name: str | None = None
    project_description: str | None = None
    collection_name: str | None = None

    # Page hierarchy (primarily populated for Confluence).
    parent_id: str | None = None
    parent_title: str | None = None
    breadcrumb_text: str | None = None
    depth: int | None = None
    children_count: int | None = None
    hierarchy_context: str | None = None

    # Attachment details (files attached to documents).
    is_attachment: bool = False
    parent_document_id: str | None = None
    parent_document_title: str | None = None
    attachment_id: str | None = None
    original_filename: str | None = None
    file_size: int | None = None
    mime_type: str | None = None
    attachment_author: str | None = None
    attachment_context: str | None = None

    # Section-level metadata.
    section_title: str | None = None
    section_type: str | None = None  # e.g., "h1", "h2", "content"
    section_level: int | None = None
    section_anchor: str | None = None
    section_breadcrumb: str | None = None
    section_depth: int | None = None

    # Content analysis flags and counters.
    has_code_blocks: bool = False
    has_tables: bool = False
    has_images: bool = False
    has_links: bool = False
    word_count: int | None = None
    char_count: int | None = None
    estimated_read_time: int | None = None  # minutes
    paragraph_count: int | None = None

    # Semantic analysis (NLP results); items may be dicts or plain strings.
    # NOTE(review): the list defaults below rely on pydantic copying field
    # defaults per instance - confirm for the pinned pydantic version.
    entities: list[dict | str] = []
    topics: list[dict | str] = []
    key_phrases: list[dict | str] = []
    pos_tags: list[dict] = []

    # Navigation context around the matched section.
    previous_section: str | None = None
    next_section: str | None = None
    sibling_sections: list[str] = []
    subsections: list[str] = []
    document_hierarchy: list[str] = []

    # Chunking context.
    chunk_index: int | None = None
    total_chunks: int | None = None
    chunking_strategy: str | None = None

    # File conversion metadata.
    original_file_type: str | None = None
    conversion_method: str | None = None
    is_excel_sheet: bool = False
    is_converted: bool = False

    # Cross-references and additional context.
    cross_references: list[dict] = []
    topic_analysis: dict | None = None
    content_type_context: str | None = None  # Human-readable content description

    def get_display_title(self) -> str:
        """Return the title to show for this result.

        The base title is ``source_title``; when that is empty or only
        whitespace it falls back to the basename of ``file_path``, then to
        ``repo_name``, then to ``"Untitled"``. The base title is then
        decorated, in priority order, with the section breadcrumb, the
        Confluence breadcrumb, or the section title.
        """
        title = self.source_title
        if not (title and title.strip()):
            # No usable source title: derive one from whatever is available.
            if self.file_path:
                import os

                title = os.path.basename(self.file_path)
            else:
                title = self.repo_name or "Untitled"

        if self.section_breadcrumb:
            heading = self.section_title or title
            return f"{heading} ({self.section_breadcrumb})"
        if self.breadcrumb_text and self.source_type == "confluence":
            return f"{title} ({self.breadcrumb_text})"
        if self.section_title and self.section_title != title:
            return f"{title} > {self.section_title}"
        return title

    def get_project_info(self) -> str | None:
        """Return a one-line project description, or ``None`` without a project id."""
        if not self.project_id:
            return None

        label = self.project_name or self.project_id
        segments = [f"Project: {label}"]
        if self.project_description:
            segments.append(f" - {self.project_description}")
        if self.collection_name:
            segments.append(f" (Collection: {self.collection_name})")
        return "".join(segments)

    def get_hierarchy_info(self) -> str | None:
        """Return hierarchy details joined by ``" | "``.

        Only Confluence results produce a value; any other source type, or a
        Confluence result with nothing to report, yields ``None``.
        """
        if self.source_type != "confluence":
            return None

        pieces = [self.hierarchy_context] if self.hierarchy_context else []
        if self.section_breadcrumb:
            pieces.append(f"Section: {self.section_breadcrumb}")

        chunk_known = self.chunk_index is not None and self.total_chunks is not None
        if chunk_known:
            # chunk_index is shown 1-based.
            pieces.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")

        if not pieces:
            return None
        return " | ".join(pieces)

    def get_content_info(self) -> str | None:
        """Summarise which content kinds are present.

        Returns ``None`` when none of the ``has_*`` flags is set; word count
        and read time are appended only when a summary is produced.
        """
        flagged = (
            ("Code", self.has_code_blocks),
            ("Tables", self.has_tables),
            ("Images", self.has_images),
            ("Links", self.has_links),
        )
        labels = [label for label, present in flagged if present]
        if not labels:
            return None

        joined = ", ".join(labels)
        summary = f"Contains: {joined}"
        if self.word_count:
            summary += f" | {self.word_count} words"
        if self.estimated_read_time:
            summary += f" | ~{self.estimated_read_time}min read"
        return summary

    def get_semantic_info(self) -> str | None:
        """Summarise entities, topics and key phrases, or ``None`` if all are empty."""

        def label_of(topic):
            # Topics are either plain strings or dicts carrying a "text" key.
            if isinstance(topic, str):
                return topic
            if isinstance(topic, dict):
                return topic.get("text", str(topic))
            return str(topic)

        summary = []

        if self.entities:
            summary.append(f"{len(self.entities)} entities")

        if self.topics:
            # Show at most three topics and note how many were left out.
            shown = ", ".join([label_of(topic) for topic in self.topics[:3]])
            hidden = len(self.topics) - 3
            if hidden > 0:
                shown += f" (+{hidden} more)"
            summary.append(f"Topics: {shown}")

        if self.key_phrases:
            summary.append(f"{len(self.key_phrases)} key phrases")

        if not summary:
            return None
        return " | ".join(summary)

    def get_navigation_info(self) -> str | None:
        """Describe neighbouring sections, or ``None`` when none are recorded."""
        entries = []

        if self.previous_section:
            entries.append(f"Previous: {self.previous_section}")
        if self.next_section:
            entries.append(f"Next: {self.next_section}")
        if self.sibling_sections:
            entries.append(f"{len(self.sibling_sections)} siblings")
        if self.subsections:
            entries.append(f"{len(self.subsections)} subsections")

        if not entries:
            return None
        return " | ".join(entries)

    def is_root_document(self) -> bool:
        """Return ``True`` when neither ``parent_id`` nor ``parent_document_id`` is set."""
        has_parent = (
            self.parent_id is not None or self.parent_document_id is not None
        )
        return not has_parent

    def has_children(self) -> bool:
        """Return ``True`` for a positive ``children_count`` or any subsections."""
        if self.children_count is not None and self.children_count > 0:
            return True
        return bool(self.subsections)

    def get_attachment_info(self) -> str | None:
        """Return ``attachment_context`` for attachments that have one, else ``None``."""
        if self.is_attachment and self.attachment_context:
            return self.attachment_context
        return None

    def is_file_attachment(self) -> bool:
        """Return the ``is_attachment`` flag."""
        return self.is_attachment

    def get_file_type(self) -> str | None:
        """Return the best available file type description.

        Preference order: ``original_file_type`` (annotated with the
        conversion method when the file was converted), then ``mime_type``,
        then the lower-cased extension of ``original_filename``. Returns
        ``None`` when none of these is available.
        """
        if self.original_file_type:
            described = self.original_file_type
            if self.is_converted and self.conversion_method:
                described = f"{described} (converted via {self.conversion_method})"
            return described

        if self.mime_type:
            return self.mime_type

        if not self.original_filename:
            return None

        import os

        suffix = os.path.splitext(self.original_filename)[1]
        if not suffix:
            return None
        return suffix.lower().lstrip(".")

    def belongs_to_project(self, project_id: str) -> bool:
        """Return ``True`` when this result's ``project_id`` equals *project_id*."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Return ``True`` when this result has a project id listed in *project_ids*."""
        if self.project_id is None:
            return False
        return self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Return ``True`` for results with code blocks or a ``"code"`` section type."""
        if self.has_code_blocks:
            return True
        return self.section_type == "code"

    def is_documentation(self) -> bool:
        """Return ``True`` for Confluence/localfile results without code blocks."""
        if self.has_code_blocks:
            return False
        return self.source_type in ("confluence", "localfile")

    def is_structured_data(self) -> bool:
        """Return ``True`` when the result has tables or is an Excel sheet."""
        return self.has_tables or self.is_excel_sheet

    def get_section_context(self) -> str | None:
        """Return the section title with optional type tag and anchor.

        Returns ``None`` without a section title. The ``[TYPE]`` prefix is
        added only when both ``section_type`` and ``section_level`` are
        truthy.
        """
        if not self.section_title:
            return None

        tagged = self.section_type and self.section_level
        prefix = f"[{self.section_type.upper()}] " if tagged else ""
        suffix = f" (#{self.section_anchor})" if self.section_anchor else ""
        return f"{prefix}{self.section_title}{suffix}"

    def get_comprehensive_context(self) -> dict[str, str | None]:
        """Collect every formatted context string, keyed by category."""
        overview = {
            "project": self.get_project_info(),
            "hierarchy": self.get_hierarchy_info(),
            "content": self.get_content_info(),
            "semantic": self.get_semantic_info(),
            "navigation": self.get_navigation_info(),
            "section": self.get_section_context(),
            "attachment": self.get_attachment_info(),
            "file_type": self.get_file_type(),
        }
        return overview