Coverage for src / qdrant_loader / core / chunking / strategy / markdown / metadata_extractor.py: 82%

96 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:40 +0000

1"""Metadata extraction for markdown chunks.""" 

2 

3import re 

4from typing import Any 

5 

6import structlog 

7 

8logger = structlog.get_logger(__name__) 

9 

10 

class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Matches markdown inline links: [text](url). Character classes are
    # non-greedy by construction so multiple links on one line all match.
    _LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (markdown inline links) from text.

        Args:
            text: Text to analyze

        Returns:
            List of cross-references, each a dict with "text" and "url" keys
        """
        # Regex-based extraction: the previous split("](")-based approach
        # only handled exactly one link per line and raised IndexError when
        # "](" appeared on a line before any "[" (e.g. 'a](b [c').
        return [
            {"text": match.group(1), "url": match.group(2)}
            for match in CrossReferenceExtractor._LINK_RE.finditer(text)
        ]

36 

37 

class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Args:
            text: Text to analyze

        Returns:
            List of entities, each a dict with "text" and "type" keys
        """
        # Heuristic: a maximal run of consecutive capitalized words is
        # treated as a single entity. Type is always "UNKNOWN" for now
        # (could be enhanced with NER).
        entities: list[dict[str, str]] = []
        run: list[str] = []

        def _flush() -> None:
            # Emit the accumulated capitalized-word run, if any, as one entity.
            if run:
                entities.append({"text": " ".join(run), "type": "UNKNOWN"})
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                _flush()
        _flush()

        return entities

72 

73 

class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Builds a nested dict keyed by header titles, where each header is
        nested under the closest preceding header of a smaller level.

        Args:
            text: Text to analyze

        Returns:
            Dictionary of hierarchical relationships
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            if not line.startswith("#"):
                continue

            # Count the leading '#' characters to get the header level.
            # The previous implementation used len(line.split()[0]), which
            # mis-computed the level for headers written without a space
            # after the hashes (e.g. "##Title" -> level 7, not 2).
            level = len(line) - len(line.lstrip("#"))
            title = line.lstrip("#").strip()

            # Trim the path back so the new header sits under its parent.
            while len(current_path) >= level:
                current_path.pop()
            current_path.append(title)

            # Walk/create the nested dicts down to the parent node, then
            # (re)register this header there.
            node = hierarchy
            for part in current_path[:-1]:
                node = node.setdefault(part, {})
            node[current_path[-1]] = {}

        return hierarchy

110 

111 

class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        # Placeholder analysis: every chunk gets the fixed "general" topic
        # and a neutral coherence score. Could be enhanced with LDA and
        # topic-coherence metrics.
        topics = ["general"]
        coherence = 0.5
        return {"topics": topics, "coherence": coherence}

130 

131 

class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self, settings=None):
        """Initialize the metadata extractor.

        Args:
            settings: Configuration settings containing markdown strategy config
        """
        self.settings = settings
        # Defensive getattr chain: settings (or any intermediate attribute)
        # may be absent; semantic analysis defaults to enabled.
        self._semantic_analysis_enabled = bool(
            getattr(
                getattr(getattr(settings, "global_config", None), "chunking", None),
                "enable_semantic_analysis",
                True,
            )
        )
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any]
    ) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary
        """
        metadata = chunk_meta.copy()

        # Extract cross-references
        metadata["cross_references"] = (
            self.cross_reference_extractor.extract_cross_references(chunk_content)
        )

        # Respect global semantic-analysis master switch.
        if self._semantic_analysis_enabled:
            metadata["entities"] = self.entity_extractor.extract_entities(chunk_content)
            metadata["topic_analysis"] = self.topic_analyzer.analyze_topic(
                chunk_content
            )
        else:
            metadata["entities"] = []
            metadata["topic_analysis"] = {"topics": [], "coherence": 0.0}

        # Extract hierarchical relationships
        metadata["hierarchy"] = self.hierarchy_extractor.map_hierarchical_relationships(
            chunk_content
        )

        return metadata

    def _reading_words_per_minute(self) -> int:
        """Return the configured reading speed, falling back to 200 wpm.

        Walks the config chain defensively (mirroring the getattr-based
        access in __init__) so a partially-populated settings object falls
        back to the default instead of raising AttributeError. The `or 200`
        also guards against a zero/None value, which would otherwise cause
        a ZeroDivisionError in the read-time estimate.
        """
        chunking = getattr(
            getattr(self.settings, "global_config", None), "chunking", None
        )
        markdown_cfg = getattr(getattr(chunking, "strategies", None), "markdown", None)
        return getattr(markdown_cfg, "words_per_minute_reading", None) or 200

    def extract_hierarchical_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any], document_context
    ) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (must expose
                ``id``, ``title`` and ``url`` attributes)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Calculate reading speed from configuration (robust to partially
        # populated settings; previously this dereferenced the full
        # settings.global_config.chunking.strategies.markdown chain and
        # could raise AttributeError).
        words_per_minute = self._reading_words_per_minute()

        # Compute word count once; it feeds both word_count and the
        # estimated read time below.
        word_count = len(chunk_content.split())

        # 🔥 JIRA-style relationship metadata
        metadata.update(
            {
                "parent_document_id": document_context.id,
                "parent_document_title": document_context.title,
                "parent_document_url": document_context.url,
                # Enhanced hierarchical context
                "section_breadcrumb": " > ".join(
                    chunk_meta.get("path", []) + [chunk_meta.get("title", "")]
                ),
                "section_depth": len(chunk_meta.get("path", [])) + 1,
                "section_anchor": self._generate_anchor(chunk_meta.get("title", "")),
                # Content type analysis
                "content_type_analysis": {
                    "has_code_blocks": bool(re.search(r"```", chunk_content)),
                    "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
                    "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
                    "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
                    "word_count": word_count,
                    "char_count": len(chunk_content),
                    "estimated_read_time": max(
                        1, word_count // words_per_minute
                    ),  # minutes
                    "paragraph_count": len(
                        [p for p in chunk_content.split("\n\n") if p.strip()]
                    ),
                },
                # Document hierarchy for search filtering
                "document_hierarchy": chunk_meta.get("path", [])
                + [chunk_meta.get("title", "")],
                # Section type classification
                "section_type": (
                    f"h{chunk_meta.get('level', 0)}"
                    if chunk_meta.get("level", 0) > 0
                    else "content"
                ),
                "section_level": chunk_meta.get("level", 0),
                "section_title": chunk_meta.get("title", ""),
                # Excel-specific metadata
                "is_excel_sheet": chunk_meta.get("is_excel_sheet", False),
                # Navigation hints (to be enhanced by caller with sibling info)
                "has_subsections": False,  # Will be updated by caller
                "total_subsections": 0,  # Will be updated by caller
            }
        )

        return metadata

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string
        """
        if not title:
            return ""

        # Convert to lowercase, replace spaces and special chars with hyphens
        anchor = re.sub(r"[^\w\s-]", "", title.lower())
        anchor = re.sub(r"[-\s]+", "-", anchor)
        return anchor.strip("-")

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Try to find header at the beginning of the chunk
        header_match = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header_match:
            return header_match.group(2).strip()

        # Try to find the first sentence if no header
        first_sentence_match = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if first_sentence_match:
            title = first_sentence_match.group(1).strip()
            # Truncate if too long
            if len(title) > 50:
                title = title[:50] + "..."
            return title

        return "Untitled Section"