Coverage for src/qdrant_loader/core/chunking/strategy/markdown/metadata_extractor.py: 79%

92 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Metadata extraction for markdown chunks.""" 

2 

3import re 

4from typing import Any 

5 

6import structlog 

7 

8logger = structlog.get_logger(__name__) 

9 

10 

class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Inline markdown link: [text](url). Non-greedy character classes so
    # several links on the same line are each captured separately.
    _LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (inline markdown links) from text.

        Uses a regex instead of manual string splitting: the previous
        split-based approach raised IndexError on lines containing a stray
        ``](`` before any ``[`` (e.g. ``"a](b) [c](d)"``) and silently
        dropped all but the first link on a line.

        Args:
            text: Text to analyze

        Returns:
            List of cross-references as ``{"text": ..., "url": ...}`` dicts
        """
        return [
            {"text": label, "url": url}
            for label, url in CrossReferenceExtractor._LINK_RE.findall(text)
        ]

36 

37 

class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Heuristic: runs of consecutive capitalized words are grouped into a
        single candidate entity. Entity type is always "UNKNOWN"; a real NER
        model could refine this later.

        Args:
            text: Text to analyze

        Returns:
            List of entities as ``{"text": ..., "type": ...}`` dicts
        """
        found: list[dict[str, str]] = []
        run: list[str] = []

        def flush() -> None:
            # Close out the current capitalized run, if any.
            if run:
                found.append({"text": " ".join(run), "type": "UNKNOWN"})
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                flush()
        flush()  # text may end mid-run

        return found

72 

73 

class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Args:
            text: Text to analyze

        Returns:
            Nested dictionary keyed by heading titles, mirroring the
            document's heading structure.
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            if not line.startswith("#"):
                continue
            # Count the leading '#' characters directly: the previous
            # token-based approach (len(line.split()[0])) miscounted the
            # level for headings written without a space, e.g. "##Title"
            # came out as level 7 and corrupted the nesting.
            level = len(line) - len(line.lstrip("#"))
            title = line.lstrip("#").strip()
            if not title:
                # A bare "#" line has no usable title; skip it instead of
                # inserting an empty-string key into the hierarchy.
                continue

            # Trim the path back to this heading's parent level.
            while len(current_path) >= level:
                current_path.pop()
            current_path.append(title)

            # Walk/create the nested dicts down to the new node.
            node = hierarchy
            for part in current_path[:-1]:
                node = node.setdefault(part, {})
            node[current_path[-1]] = {}

        return hierarchy

110 

111 

class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        # Placeholder analysis: a fixed topic label and a mid-range
        # coherence score until real topic modelling (e.g. LDA plus a
        # coherence metric) is wired in. The input text is accepted for
        # interface compatibility but not yet inspected.
        topics: list[str] = ["general"]
        coherence = 0.5
        return {"topics": topics, "coherence": coherence}

130 

131 

class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self, settings=None):
        """Initialize the metadata extractor.

        Args:
            settings: Configuration settings containing markdown strategy
                config (reading speed); optional — defaults apply when None.
        """
        self.settings = settings
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any]
    ) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary (the input dict is not mutated)
        """
        enriched = dict(chunk_meta)
        enriched.update(
            {
                "cross_references": self.cross_reference_extractor.extract_cross_references(
                    chunk_content
                ),
                "entities": self.entity_extractor.extract_entities(chunk_content),
                "hierarchy": self.hierarchy_extractor.map_hierarchical_relationships(
                    chunk_content
                ),
                "topic_analysis": self.topic_analyzer.analyze_topic(chunk_content),
            }
        )
        return enriched

    def extract_hierarchical_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any], document_context
    ) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (provides
                ``id``, ``title`` and ``url`` attributes)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Reading speed comes from configuration when available.
        if self.settings:
            words_per_minute = (
                self.settings.global_config.chunking.strategies.markdown.words_per_minute_reading
            )
        else:
            words_per_minute = 200  # Default fallback

        section_path = chunk_meta.get("path", [])
        section_title = chunk_meta.get("title", "")
        section_level = chunk_meta.get("level", 0)
        word_count = len(chunk_content.split())
        paragraphs = [p for p in chunk_content.split("\n\n") if p.strip()]

        # JIRA-style parent/child relationship metadata.
        metadata["parent_document_id"] = document_context.id
        metadata["parent_document_title"] = document_context.title
        metadata["parent_document_url"] = document_context.url

        # Enhanced hierarchical context.
        metadata["section_breadcrumb"] = " > ".join(section_path + [section_title])
        metadata["section_depth"] = len(section_path) + 1
        metadata["section_anchor"] = self._generate_anchor(section_title)

        # Content type analysis derived from markdown markers.
        metadata["content_type_analysis"] = {
            "has_code_blocks": bool(re.search(r"```", chunk_content)),
            "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
            "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
            "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
            "word_count": word_count,
            "char_count": len(chunk_content),
            # Floor division, clamped to at least one minute.
            "estimated_read_time": max(1, word_count // words_per_minute),
            "paragraph_count": len(paragraphs),
        }

        # Document hierarchy for search filtering.
        metadata["document_hierarchy"] = section_path + [section_title]

        # Section type classification: "hN" for headed sections, "content"
        # for body text without a heading level.
        metadata["section_type"] = (
            f"h{section_level}" if section_level > 0 else "content"
        )
        metadata["section_level"] = section_level
        metadata["section_title"] = section_title

        # Excel-specific metadata.
        metadata["is_excel_sheet"] = chunk_meta.get("is_excel_sheet", False)

        # Navigation hints — the caller fills these in with sibling info.
        metadata["has_subsections"] = False
        metadata["total_subsections"] = 0

        return metadata

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string
        """
        if not title:
            return ""

        # Lowercase, drop punctuation, then collapse whitespace/hyphen
        # runs into single hyphens.
        cleaned = re.sub(r"[^\w\s-]", "", title.lower())
        hyphenated = re.sub(r"[-\s]+", "-", cleaned)
        return hyphenated.strip("-")

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Prefer an ATX header at the very start of the chunk.
        header = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header is not None:
            return header.group(2).strip()

        # Fall back to the first sentence, truncated if over-long.
        sentence = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if sentence is not None:
            candidate = sentence.group(1).strip()
            return candidate if len(candidate) <= 50 else candidate[:50] + "..."

        return "Untitled Section"