Coverage for src / qdrant_loader / core / chunking / strategy / markdown / metadata_extractor.py: 82%
96 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
1"""Metadata extraction for markdown chunks."""
3import re
4from typing import Any
6import structlog
8logger = structlog.get_logger(__name__)
class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Inline markdown link: [text](url). Negated character classes keep
    # multiple links on one line from being merged into a single match.
    _LINK_PATTERN = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (inline markdown links) from text.

        Args:
            text: Text to analyze

        Returns:
            List of ``{"text": ..., "url": ...}`` dictionaries, one per link,
            in document order.
        """
        # Regex extraction finds every link, including several on one line;
        # the previous split("](")-based parsing silently dropped any line
        # containing more than one link and mis-parsed lines with stray "[".
        return [
            {"text": link_text, "url": url}
            for link_text, url in CrossReferenceExtractor._LINK_PATTERN.findall(text)
        ]
class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Runs of consecutive capitalized words are treated as one entity.

        Args:
            text: Text to analyze

        Returns:
            List of ``{"text": ..., "type": ...}`` dictionaries.
        """
        found: list[dict[str, str]] = []
        run: list[str] = []

        def _flush() -> None:
            # Emit the accumulated capitalized run, if any, as one entity.
            if run:
                found.append(
                    {
                        "text": " ".join(run),
                        "type": "UNKNOWN",  # Could be enhanced with NER
                    }
                )
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                _flush()
        _flush()  # trailing run at end of text

        return found
class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map header hierarchy in text to a nested dictionary.

        Args:
            text: Text to analyze

        Returns:
            Nested dictionary keyed by header titles, mirroring the
            header levels (h1 -> h2 -> ...).
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            match = re.match(r"^(#+)\s*(.*)$", line)
            if not match:
                continue
            # Count the leading '#' characters directly; the previous
            # token-length approach (len(line.split()[0])) miscounted
            # headers written without a space, e.g. "#Title" -> level 6.
            level = len(match.group(1))
            title = match.group(2).strip()

            # Trim the path back to this header's parent, then descend.
            del current_path[level - 1 :]
            current_path.append(title)

            # Walk/create the nested dicts along the current path;
            # setdefault preserves an existing subtree when a title repeats.
            node = hierarchy
            for part in current_path:
                node = node.setdefault(part, {})

        return hierarchy
class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        # Placeholder analysis: a fixed topic label and mid-range coherence.
        # Could be enhanced later with LDA and topic-coherence metrics.
        analysis: dict[str, Any] = {}
        analysis["topics"] = ["general"]
        analysis["coherence"] = 0.5
        return analysis
class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    # Fallback reading speed (words per minute) when configuration is absent.
    DEFAULT_WORDS_PER_MINUTE = 200

    def __init__(self, settings=None):
        """Initialize the metadata extractor.

        Args:
            settings: Configuration settings containing markdown strategy config
        """
        self.settings = settings
        # Honor the global semantic-analysis master switch; default to
        # enabled when the settings chain is absent or incomplete.
        self._semantic_analysis_enabled = bool(
            getattr(
                getattr(getattr(settings, "global_config", None), "chunking", None),
                "enable_semantic_analysis",
                True,
            )
        )
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any]
    ) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary
        """
        metadata = chunk_meta.copy()

        # Extract cross-references
        metadata["cross_references"] = (
            self.cross_reference_extractor.extract_cross_references(chunk_content)
        )

        # Respect global semantic-analysis master switch.
        if self._semantic_analysis_enabled:
            metadata["entities"] = self.entity_extractor.extract_entities(chunk_content)
            metadata["topic_analysis"] = self.topic_analyzer.analyze_topic(
                chunk_content
            )
        else:
            metadata["entities"] = []
            metadata["topic_analysis"] = {"topics": [], "coherence": 0.0}

        # Extract hierarchical relationships
        metadata["hierarchy"] = self.hierarchy_extractor.map_hierarchical_relationships(
            chunk_content
        )

        return metadata

    def _resolve_words_per_minute(self) -> int:
        """Resolve the configured reading speed, defensively.

        Walks ``settings.global_config.chunking.strategies.markdown`` with
        ``getattr`` so a partially configured settings object falls back to
        ``DEFAULT_WORDS_PER_MINUTE`` instead of raising ``AttributeError``
        (matching the defensive pattern used in ``__init__``).

        Returns:
            Words-per-minute reading speed.
        """
        node = self.settings
        for attr in ("global_config", "chunking", "strategies", "markdown"):
            node = getattr(node, attr, None)
            if node is None:
                return self.DEFAULT_WORDS_PER_MINUTE
        return getattr(
            node, "words_per_minute_reading", self.DEFAULT_WORDS_PER_MINUTE
        )

    def extract_hierarchical_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any], document_context
    ) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (must expose
                ``id``, ``title`` and ``url``)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Calculate reading speed from configuration (safe against a
        # partially populated settings object).
        words_per_minute = self._resolve_words_per_minute()

        # 🔥 JIRA-style relationship metadata
        metadata.update(
            {
                "parent_document_id": document_context.id,
                "parent_document_title": document_context.title,
                "parent_document_url": document_context.url,
                # Enhanced hierarchical context
                "section_breadcrumb": " > ".join(
                    chunk_meta.get("path", []) + [chunk_meta.get("title", "")]
                ),
                "section_depth": len(chunk_meta.get("path", [])) + 1,
                "section_anchor": self._generate_anchor(chunk_meta.get("title", "")),
                # Content type analysis
                "content_type_analysis": {
                    "has_code_blocks": bool(re.search(r"```", chunk_content)),
                    "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
                    "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
                    "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
                    "word_count": len(chunk_content.split()),
                    "char_count": len(chunk_content),
                    "estimated_read_time": max(
                        1, len(chunk_content.split()) // words_per_minute
                    ),  # minutes
                    "paragraph_count": len(
                        [p for p in chunk_content.split("\n\n") if p.strip()]
                    ),
                },
                # Document hierarchy for search filtering
                "document_hierarchy": chunk_meta.get("path", [])
                + [chunk_meta.get("title", "")],
                # Section type classification
                "section_type": (
                    f"h{chunk_meta.get('level', 0)}"
                    if chunk_meta.get("level", 0) > 0
                    else "content"
                ),
                "section_level": chunk_meta.get("level", 0),
                "section_title": chunk_meta.get("title", ""),
                # Excel-specific metadata
                "is_excel_sheet": chunk_meta.get("is_excel_sheet", False),
                # Navigation hints (to be enhanced by caller with sibling info)
                "has_subsections": False,  # Will be updated by caller
                "total_subsections": 0,  # Will be updated by caller
            }
        )

        return metadata

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string (lowercase, hyphen-separated)
        """
        if not title:
            return ""

        # Convert to lowercase, replace spaces and special chars with hyphens
        anchor = re.sub(r"[^\w\s-]", "", title.lower())
        anchor = re.sub(r"[-\s]+", "-", anchor)
        return anchor.strip("-")

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Try to find header at the beginning of the chunk
        header_match = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header_match:
            return header_match.group(2).strip()

        # Try to find the first sentence if no header
        first_sentence_match = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if first_sentence_match:
            title = first_sentence_match.group(1).strip()
            # Truncate if too long
            if len(title) > 50:
                title = title[:50] + "..."
            return title

        return "Untitled Section"