Coverage for src/qdrant_loader/core/chunking/strategy/markdown/metadata_extractor.py: 87%
90 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
"""Metadata extraction for markdown chunks."""

import re
from typing import Any

import structlog

logger = structlog.get_logger(__name__)
class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Inline markdown link: [text](url). Compiled once at class level so
    # repeated calls do not re-do the pattern lookup.
    _LINK_PATTERN = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (inline markdown links) from text.

        A single regex scan replaces the previous line-splitting approach,
        which skipped lines containing more than one link, picked the wrong
        text for nested "[" characters, and raised IndexError when "["
        appeared only after "](" on a line (e.g. "a](b) [c").

        Args:
            text: Text to analyze

        Returns:
            List of cross-references, each a dict with "text" and "url" keys.
        """
        return [
            {"text": link_text, "url": url}
            for link_text, url in CrossReferenceExtractor._LINK_PATTERN.findall(text)
        ]
class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Heuristic: consecutive whitespace-separated tokens whose first
        character is uppercase are grouped into one entity.

        Args:
            text: Text to analyze

        Returns:
            List of entities, each a dict with "text" and "type" keys.
        """
        entities: list[dict[str, str]] = []
        run: list[str] = []

        def flush() -> None:
            # Emit the buffered run of capitalized tokens, if any.
            if run:
                entities.append({"text": " ".join(run), "type": "UNKNOWN"})
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                flush()
        flush()  # Trailing run at end of text.

        return entities
class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Args:
            text: Text to analyze

        Returns:
            Nested dictionary keyed by heading titles, mirroring the
            markdown heading structure of the text.
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            if not line.startswith("#"):
                continue

            # Count leading "#" characters directly. The previous
            # len(line.split()[0]) approach miscounted headings written
            # without a space after the hashes ("#Title" -> level 6).
            level = len(line) - len(line.lstrip("#"))
            title = line.lstrip("#").strip()

            # Truncate the path back to the parent of this heading level.
            while len(current_path) >= level:
                current_path.pop()
            current_path.append(title)

            # Walk/create nested dicts down to this heading's parent,
            # then register the heading itself as an (empty) subtree.
            node = hierarchy
            for part in current_path[:-1]:
                node = node.setdefault(part, {})
            node[current_path[-1]] = {}

        return hierarchy
class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results: a list of topic
            labels under "topics" and a "coherence" score.
        """
        # Placeholder analysis: one generic topic with a neutral coherence
        # score. Could be enhanced with LDA / topic coherence metrics.
        topics: list[str] = ["general"]
        coherence = 0.5
        return {"topics": topics, "coherence": coherence}
class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self):
        """Wire up the individual extraction components."""
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(self, chunk_content: str, chunk_meta: dict[str, Any]) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary (a copy; chunk_meta is not mutated)
        """
        enriched = dict(chunk_meta)
        enriched["cross_references"] = (
            self.cross_reference_extractor.extract_cross_references(chunk_content)
        )
        enriched["entities"] = self.entity_extractor.extract_entities(chunk_content)
        enriched["hierarchy"] = (
            self.hierarchy_extractor.map_hierarchical_relationships(chunk_content)
        )
        enriched["topic_analysis"] = self.topic_analyzer.analyze_topic(chunk_content)
        return enriched

    def extract_hierarchical_metadata(self, chunk_content: str, chunk_meta: dict[str, Any], document_context) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (must expose
                `id`, `title`, and `url` attributes)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        section_path = chunk_meta.get("path", [])
        section_title = chunk_meta.get("title", "")
        section_level = chunk_meta.get("level", 0)
        word_total = len(chunk_content.split())

        # 🔥 JIRA-style relationship metadata: parent-document linkage.
        metadata["parent_document_id"] = document_context.id
        metadata["parent_document_title"] = document_context.title
        metadata["parent_document_url"] = document_context.url

        # Enhanced hierarchical context.
        metadata["section_breadcrumb"] = " > ".join(section_path + [section_title])
        metadata["section_depth"] = len(section_path) + 1
        metadata["section_anchor"] = self._generate_anchor(section_title)

        # Content type analysis.
        metadata["content_type_analysis"] = {
            "has_code_blocks": bool(re.search(r"```", chunk_content)),
            "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
            "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
            "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
            "word_count": word_total,
            "char_count": len(chunk_content),
            "estimated_read_time": max(1, word_total // 200),  # minutes
            "paragraph_count": sum(
                1 for p in chunk_content.split("\n\n") if p.strip()
            ),
        }

        # Document hierarchy for search filtering.
        metadata["document_hierarchy"] = section_path + [section_title]

        # Section type classification.
        metadata["section_type"] = (
            f"h{section_level}" if section_level > 0 else "content"
        )
        metadata["section_level"] = section_level
        metadata["section_title"] = section_title

        # Excel-specific metadata.
        metadata["is_excel_sheet"] = chunk_meta.get("is_excel_sheet", False)

        # Navigation hints (to be enhanced by the caller with sibling info).
        metadata["has_subsections"] = False
        metadata["total_subsections"] = 0

        return metadata

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string
        """
        if not title:
            return ""

        # Lowercase, drop everything but word chars / spaces / hyphens,
        # then collapse runs of spaces and hyphens into single hyphens.
        cleaned = re.sub(r'[^\w\s-]', '', title.lower())
        hyphenated = re.sub(r'[-\s]+', '-', cleaned)
        return hyphenated.strip('-')

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Prefer a markdown header at the very start of the chunk.
        header = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header:
            return header.group(2).strip()

        # Otherwise fall back to the first sentence, truncated if long.
        sentence = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if not sentence:
            return "Untitled Section"
        title = sentence.group(1).strip()
        return title if len(title) <= 50 else title[:50] + "..."