Coverage for src/qdrant_loader_mcp_server/search/models.py: 65%
178 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:38 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:38 +0000
1"""Search result models."""
3from pydantic import BaseModel
class SearchResult(BaseModel):
    """Search result model with comprehensive metadata.

    Only ``score``, ``text``, ``source_type`` and ``source_title`` are
    required; every other field is optional and defaults to ``None``,
    ``False`` or an empty list. The ``get_*`` helpers turn groups of fields
    into short display strings and return ``None`` when there is nothing to
    show; the ``is_*`` / ``has_*`` / ``belongs_*`` helpers are boolean
    predicates over the same fields.
    """

    score: float
    text: str
    source_type: str
    source_title: str
    source_url: str | None = None
    file_path: str | None = None
    repo_name: str | None = None

    # Project information (for multi-project support)
    project_id: str | None = None
    project_name: str | None = None
    project_description: str | None = None
    collection_name: str | None = None

    # Hierarchy information (primarily for Confluence)
    parent_id: str | None = None
    parent_title: str | None = None
    breadcrumb_text: str | None = None
    depth: int | None = None
    children_count: int | None = None
    hierarchy_context: str | None = None

    # Attachment information (for files attached to documents)
    is_attachment: bool = False
    parent_document_id: str | None = None
    parent_document_title: str | None = None
    attachment_id: str | None = None
    original_filename: str | None = None
    file_size: int | None = None
    mime_type: str | None = None
    attachment_author: str | None = None
    attachment_context: str | None = None

    # Section-level intelligence
    section_title: str | None = None
    section_type: str | None = None  # e.g., "h1", "h2", "content"
    section_level: int | None = None
    section_anchor: str | None = None
    section_breadcrumb: str | None = None
    section_depth: int | None = None

    # Content analysis
    has_code_blocks: bool = False
    has_tables: bool = False
    has_images: bool = False
    has_links: bool = False
    word_count: int | None = None
    char_count: int | None = None
    estimated_read_time: int | None = None  # minutes
    paragraph_count: int | None = None

    # Semantic analysis (NLP results)
    entities: list[dict | str] = []  # Handle both dict and string formats
    topics: list[dict | str] = []  # Handle both dict and string formats
    key_phrases: list[dict | str] = []
    pos_tags: list[dict] = []

    # Navigation context
    previous_section: str | None = None
    next_section: str | None = None
    sibling_sections: list[str] = []
    subsections: list[str] = []
    document_hierarchy: list[str] = []

    # Chunking context
    chunk_index: int | None = None
    total_chunks: int | None = None
    chunking_strategy: str | None = None

    # File conversion intelligence
    original_file_type: str | None = None
    conversion_method: str | None = None
    is_excel_sheet: bool = False
    is_converted: bool = False

    # Cross-references and enhanced context
    cross_references: list[dict] = []
    topic_analysis: dict | None = None
    content_type_context: str | None = None  # Human-readable content description

    def get_display_title(self) -> str:
        """Get the display title with enhanced hierarchy context.

        Precedence: the section breadcrumb wins when present; otherwise the
        Confluence breadcrumb; otherwise ``source_title > section_title``
        when the section title differs from the source title; otherwise the
        bare ``source_title``.
        """
        if self.section_breadcrumb:
            return f"{self.section_title or self.source_title} ({self.section_breadcrumb})"
        elif self.breadcrumb_text and self.source_type == "confluence":
            return f"{self.source_title} ({self.breadcrumb_text})"
        elif self.section_title and self.section_title != self.source_title:
            return f"{self.source_title} > {self.section_title}"
        return self.source_title

    def get_project_info(self) -> str | None:
        """Get formatted project information for display.

        Returns:
            ``None`` when ``project_id`` is unset or empty; otherwise a
            string starting with the project name (or the id when no name is
            set), followed by the description and collection name when
            available.
        """
        if not self.project_id:
            return None

        project_info = f"Project: {self.project_name or self.project_id}"
        if self.project_description:
            project_info += f" - {self.project_description}"
        if self.collection_name:
            project_info += f" (Collection: {self.collection_name})"
        return project_info

    def get_hierarchy_info(self) -> str | None:
        """Get formatted hierarchy information for display.

        Returns:
            ``None`` for non-Confluence sources or when no hierarchy data is
            set; otherwise the available parts (hierarchy context, section
            breadcrumb, chunk position) joined with ``" | "``.
        """
        # Only return hierarchy info for Confluence sources
        if self.source_type != "confluence":
            return None

        parts = []

        if self.hierarchy_context:
            parts.append(self.hierarchy_context)

        if self.section_breadcrumb:
            parts.append(f"Section: {self.section_breadcrumb}")

        # chunk_index is shown 1-based, hence the +1.
        if self.chunk_index is not None and self.total_chunks is not None:
            parts.append(f"Chunk: {self.chunk_index + 1}/{self.total_chunks}")

        return " | ".join(parts) if parts else None

    def get_content_info(self) -> str | None:
        """Get formatted content analysis information.

        Returns:
            ``None`` when none of the ``has_*`` content flags is set;
            otherwise ``"Contains: ..."`` listing the flagged content kinds,
            followed by the word count and estimated read time when those
            are set and non-zero.
        """
        content_parts = []
        if self.has_code_blocks:
            content_parts.append("Code")
        if self.has_tables:
            content_parts.append("Tables")
        if self.has_images:
            content_parts.append("Images")
        if self.has_links:
            content_parts.append("Links")

        # No flag set: word count / read time alone are not reported.
        if not content_parts:
            return None

        content_info = f"Contains: {', '.join(content_parts)}"

        if self.word_count:
            content_info += f" | {self.word_count} words"
        if self.estimated_read_time:
            content_info += f" | ~{self.estimated_read_time}min read"

        return content_info

    def get_semantic_info(self) -> str | None:
        """Get formatted semantic analysis information.

        Returns:
            ``None`` when entities, topics and key phrases are all empty;
            otherwise a ``" | "``-joined summary with the entity count, up
            to three topics (plus a "+N more" suffix), and the key-phrase
            count.
        """
        parts = []

        if self.entities:
            entity_count = len(self.entities)
            parts.append(f"{entity_count} entities")

        if self.topics:
            # ``topics`` is declared list[dict | str]; str.join raises
            # TypeError on non-string items, so each topic is converted to
            # text first.
            topic_labels = []
            for topic in self.topics[:3]:  # Show first 3 topics
                if isinstance(topic, str):
                    topic_labels.append(topic)
                elif isinstance(topic, dict):
                    # NOTE(review): assumes dict topics keep their label
                    # under "text" - confirm against the producer; falls
                    # back to the dict's string form otherwise.
                    topic_labels.append(str(topic.get("text", topic)))
                else:
                    topic_labels.append(str(topic))
            topic_list = ", ".join(topic_labels)
            if len(self.topics) > 3:
                topic_list += f" (+{len(self.topics) - 3} more)"
            parts.append(f"Topics: {topic_list}")

        if self.key_phrases:
            phrase_count = len(self.key_phrases)
            parts.append(f"{phrase_count} key phrases")

        return " | ".join(parts) if parts else None

    def get_navigation_info(self) -> str | None:
        """Get formatted navigation context.

        Returns:
            ``None`` when no navigation field is set; otherwise the
            previous/next section titles and the sibling/subsection counts
            joined with ``" | "``.
        """
        parts = []

        if self.previous_section:
            parts.append(f"Previous: {self.previous_section}")
        if self.next_section:
            parts.append(f"Next: {self.next_section}")
        if self.sibling_sections:
            sibling_count = len(self.sibling_sections)
            parts.append(f"{sibling_count} siblings")
        if self.subsections:
            subsection_count = len(self.subsections)
            parts.append(f"{subsection_count} subsections")

        return " | ".join(parts) if parts else None

    def is_root_document(self) -> bool:
        """Check if this is a root document (no parent).

        True only when both ``parent_id`` and ``parent_document_id`` are
        ``None``.
        """
        return self.parent_id is None and self.parent_document_id is None

    def has_children(self) -> bool:
        """Check if this document has children.

        True when ``children_count`` is a positive number or when
        ``subsections`` is non-empty.
        """
        return (self.children_count is not None and self.children_count > 0) or bool(
            self.subsections
        )

    def get_attachment_info(self) -> str | None:
        """Get formatted attachment information for display.

        Returns:
            ``attachment_context`` when this result is an attachment and the
            context is set; ``None`` otherwise.
        """
        if not self.is_attachment or not self.attachment_context:
            return None
        return self.attachment_context

    def is_file_attachment(self) -> bool:
        """Check if this is a file attachment."""
        return self.is_attachment

    def get_file_type(self) -> str | None:
        """Get the file type from conversion info, MIME type or filename.

        Precedence: ``original_file_type`` (annotated with the conversion
        method when the file was converted), then ``mime_type``, then the
        lower-cased extension of ``original_filename``.

        Returns:
            The file type string, or ``None`` when none of the sources is
            available.
        """
        if self.original_file_type:
            file_type = self.original_file_type
            if self.is_converted and self.conversion_method:
                file_type += f" (converted via {self.conversion_method})"
            return file_type
        elif self.mime_type:
            return self.mime_type
        elif self.original_filename:
            # Extract extension from filename
            import os

            _, ext = os.path.splitext(self.original_filename)
            return ext.lower().lstrip(".") if ext else None
        return None

    def belongs_to_project(self, project_id: str) -> bool:
        """Check if this result belongs to a specific project."""
        return self.project_id == project_id

    def belongs_to_any_project(self, project_ids: list[str]) -> bool:
        """Check if this result belongs to any of the specified projects.

        Always False when this result has no ``project_id``.
        """
        return self.project_id is not None and self.project_id in project_ids

    def is_code_content(self) -> bool:
        """Check if this result contains code.

        True when code blocks were detected or the section type is
        ``"code"``.
        """
        return self.has_code_blocks or self.section_type == "code"

    def is_documentation(self) -> bool:
        """Check if this result is documentation content.

        True for ``"confluence"`` and ``"localfile"`` sources that contain
        no code blocks.
        """
        return self.source_type in ["confluence", "localfile"] and not self.has_code_blocks

    def is_structured_data(self) -> bool:
        """Check if this result contains structured data.

        True when tables were detected or the result is an Excel sheet.
        """
        return self.has_tables or self.is_excel_sheet

    def get_section_context(self) -> str | None:
        """Get section context for enhanced display.

        Returns:
            ``None`` when there is no section title; otherwise the title,
            prefixed with the upper-cased section type when both
            ``section_type`` and ``section_level`` are set and truthy, and
            suffixed with the anchor when one is set.
        """
        if not self.section_title:
            return None

        context = self.section_title
        if self.section_type and self.section_level:
            context = f"[{self.section_type.upper()}] {context}"
        if self.section_anchor:
            context += f" (#{self.section_anchor})"

        return context

    def get_comprehensive_context(self) -> dict[str, str | None]:
        """Get all available context information organized by type.

        Returns:
            A dict with the keys ``project``, ``hierarchy``, ``content``,
            ``semantic``, ``navigation``, ``section``, ``attachment`` and
            ``file_type``, each holding the matching helper's result
            (``None`` when that helper has nothing to report).
        """
        return {
            "project": self.get_project_info(),
            "hierarchy": self.get_hierarchy_info(),
            "content": self.get_content_info(),
            "semantic": self.get_semantic_info(),
            "navigation": self.get_navigation_info(),
            "section": self.get_section_context(),
            "attachment": self.get_attachment_info(),
            "file_type": self.get_file_type(),
        }