Coverage for src/qdrant_loader/core/chunking/strategy/markdown/metadata_extractor.py: 79% (92 statements)
1"""Metadata extraction for markdown chunks."""
3import re
4from typing import Any
6import structlog
8logger = structlog.get_logger(__name__)
11class CrossReferenceExtractor:
12 """Extracts cross-references from markdown text."""
14 @staticmethod
15 def extract_cross_references(text: str) -> list[dict[str, str]]:
16 """Extract cross-references from text.
18 Args:
19 text: Text to analyze
21 Returns:
22 List of cross-references
23 """
24 # Simple implementation - extract markdown links
25 references = []
26 lines = text.split("\n")
27 for line in lines:
28 if "[" in line and "](" in line:
29 # Extract link text and URL
30 parts = line.split("](")
31 if len(parts) == 2:
32 link_text = parts[0].split("[")[1]
33 url = parts[1].split(")")[0]
34 references.append({"text": link_text, "url": url})
35 return references
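
    # Illustrative usage sketch (comment only; not part of the original
    # module), showing the expected shape of the output:
    #
    #     extract_cross_references("See [docs](https://example.com) first.")
    #     -> [{"text": "docs", "url": "https://example.com"}]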


class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Args:
            text: Text to analyze

        Returns:
            List of entities
        """
        # Simple implementation - extract capitalized phrases
        entities = []
        words = text.split()
        current_entity = []

        for word in words:
            if word[0].isupper():
                current_entity.append(word)
            elif current_entity:
                entities.append(
                    {
                        "text": " ".join(current_entity),
                        "type": "UNKNOWN",  # Could be enhanced with NER
                    }
                )
                current_entity = []

        if current_entity:
            entities.append({"text": " ".join(current_entity), "type": "UNKNOWN"})

        return entities
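
    # Illustrative sketch (comment only; not part of the original module):
    # consecutive capitalized words are merged into a single entity.
    #
    #     extract_entities("The Qdrant Loader parses markdown files.")
    #     -> [{"text": "The Qdrant Loader", "type": "UNKNOWN"}]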


class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary of hierarchical relationships
        """
        hierarchy = {}
        current_path = []

        lines = text.split("\n")
        for line in lines:
            if line.startswith("#"):
                # Count leading "#" characters to get the heading level
                # (splitting on whitespace miscounts headings like "#Title")
                level = len(line) - len(line.lstrip("#"))
                title = line.lstrip("#").strip()

                # Update current path
                while len(current_path) >= level:
                    current_path.pop()
                current_path.append(title)

                # Add to hierarchy
                current = hierarchy
                for part in current_path[:-1]:
                    if part not in current:
                        current[part] = {}
                    current = current[part]
                current[current_path[-1]] = {}

        return hierarchy
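
    # Illustrative sketch (comment only; not part of the original module):
    # headings become a nested dict keyed by title.
    #
    #     map_hierarchical_relationships("# A\n## B\n## C")
    #     -> {"A": {"B": {}, "C": {}}}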


class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        # Simple implementation - return basic topic info
        return {
            "topics": ["general"],  # Could be enhanced with LDA
            "coherence": 0.5,  # Could be enhanced with topic coherence metrics
        }


class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self, settings=None):
        """Initialize the metadata extractor.

        Args:
            settings: Configuration settings containing markdown strategy config
        """
        self.settings = settings
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any]
    ) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary
        """
        metadata = chunk_meta.copy()

        # Extract cross-references
        metadata["cross_references"] = (
            self.cross_reference_extractor.extract_cross_references(chunk_content)
        )

        # Extract entities
        metadata["entities"] = self.entity_extractor.extract_entities(chunk_content)

        # Extract hierarchical relationships
        metadata["hierarchy"] = self.hierarchy_extractor.map_hierarchical_relationships(
            chunk_content
        )

        # Analyze topics
        metadata["topic_analysis"] = self.topic_analyzer.analyze_topic(chunk_content)

        return metadata
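
    # Illustrative sketch (comment only; not part of the original module):
    # the returned dict is a copy of chunk_meta plus four derived keys.
    #
    #     extract_all_metadata("See [docs](https://example.com).", {"title": "Intro"})
    #     -> {"title": "Intro", "cross_references": [...], "entities": [...],
    #         "hierarchy": {...}, "topic_analysis": {...}}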

    def extract_hierarchical_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any], document_context
    ) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Calculate reading speed from configuration
        words_per_minute = (
            self.settings.global_config.chunking.strategies.markdown.words_per_minute_reading
            if self.settings
            else 200  # Default fallback
        )

        # 🔥 JIRA-style relationship metadata
        metadata.update(
            {
                "parent_document_id": document_context.id,
                "parent_document_title": document_context.title,
                "parent_document_url": document_context.url,
                # Enhanced hierarchical context
                "section_breadcrumb": " > ".join(
                    chunk_meta.get("path", []) + [chunk_meta.get("title", "")]
                ),
                "section_depth": len(chunk_meta.get("path", [])) + 1,
                "section_anchor": self._generate_anchor(chunk_meta.get("title", "")),
                # Content type analysis
                "content_type_analysis": {
                    "has_code_blocks": bool(re.search(r"```", chunk_content)),
                    "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
                    "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
                    "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
                    "word_count": len(chunk_content.split()),
                    "char_count": len(chunk_content),
                    "estimated_read_time": max(
                        1, len(chunk_content.split()) // words_per_minute
                    ),  # minutes
                    "paragraph_count": len(
                        [p for p in chunk_content.split("\n\n") if p.strip()]
                    ),
                },
                # Document hierarchy for search filtering
                "document_hierarchy": chunk_meta.get("path", [])
                + [chunk_meta.get("title", "")],
                # Section type classification
                "section_type": (
                    f"h{chunk_meta.get('level', 0)}"
                    if chunk_meta.get("level", 0) > 0
                    else "content"
                ),
                "section_level": chunk_meta.get("level", 0),
                "section_title": chunk_meta.get("title", ""),
                # Excel-specific metadata
                "is_excel_sheet": chunk_meta.get("is_excel_sheet", False),
                # Navigation hints (to be enhanced by caller with sibling info)
                "has_subsections": False,  # Will be updated by caller
                "total_subsections": 0,  # Will be updated by caller
            }
        )

        return metadata
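
    # Illustrative sketch (comment only; not part of the original module),
    # using hypothetical chunk_meta = {"path": ["Guide"], "title": "Install",
    # "level": 2}; the derived fields would then include:
    #
    #     "section_breadcrumb": "Guide > Install"
    #     "section_depth": 2
    #     "section_anchor": "install"
    #     "document_hierarchy": ["Guide", "Install"]
    #     "section_type": "h2"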

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string
        """
        if not title:
            return ""

        # Convert to lowercase, replace spaces and special chars with hyphens
        anchor = re.sub(r"[^\w\s-]", "", title.lower())
        anchor = re.sub(r"[-\s]+", "-", anchor)
        return anchor.strip("-")
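
    # Illustrative sketch (comment only; not part of the original module):
    #
    #     _generate_anchor("Getting Started: A Guide!")
    #     -> "getting-started-a-guide"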

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Try to find header at the beginning of the chunk
        header_match = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header_match:
            return header_match.group(2).strip()

        # Try to find the first sentence if no header
        first_sentence_match = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if first_sentence_match:
            title = first_sentence_match.group(1).strip()
            # Truncate if too long
            if len(title) > 50:
                title = title[:50] + "..."
            return title

        return "Untitled Section"