Coverage for src/qdrant_loader/core/chunking/strategy/markdown/metadata_extractor.py: 87%

90 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""Metadata extraction for markdown chunks.""" 

2 

3import re 

4from typing import Any 

5 

6import structlog 

7 

8logger = structlog.get_logger(__name__) 

9 

10 

class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Matches a markdown link: [text](url). Non-greedy character classes keep
    # each match confined to a single link so several links per line all match.
    _LINK_PATTERN = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (markdown links) from text.

        Finds every ``[text](url)`` occurrence, including multiple links on
        the same line. (The previous line-splitting implementation required
        exactly one ``](`` per line and silently dropped lines containing
        more than one link.)

        Args:
            text: Text to analyze

        Returns:
            List of cross-references, each a ``{"text": ..., "url": ...}`` dict
        """
        return [
            {"text": link_text, "url": url}
            for link_text, url in CrossReferenceExtractor._LINK_PATTERN.findall(text)
        ]

36 

37 

class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Heuristic: consecutive capitalized words are grouped into a single
        candidate entity; a lowercase word terminates the current run.

        Args:
            text: Text to analyze

        Returns:
            List of entities
        """
        found: list[dict[str, str]] = []
        run: list[str] = []

        def _flush() -> None:
            # Emit the accumulated capitalized run, if any, and reset it.
            if run:
                found.append({"text": " ".join(run), "type": "UNKNOWN"})
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                _flush()
        _flush()  # a trailing capitalized run still counts

        return found

72 

73 

class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    # A header line: one or more leading '#' characters, then the title.
    _HEADER_PATTERN = re.compile(r"^(#+)\s*(.*)$")

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Builds a nested dict of header titles keyed by heading level.
        The level is the count of leading ``#`` characters; the previous
        implementation used ``len(line.split()[0])``, which miscomputed the
        level for headers written without a space (``#Title`` → level 6).

        Args:
            text: Text to analyze

        Returns:
            Dictionary of hierarchical relationships
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            match = HierarchyExtractor._HEADER_PATTERN.match(line)
            if match is None:
                continue
            level = len(match.group(1))
            title = match.group(2).strip()

            # Trim the path back to the parent of this heading level.
            while len(current_path) >= level:
                current_path.pop()
            current_path.append(title)

            # Walk/create the nested dicts down to the new node.
            node = hierarchy
            for part in current_path[:-1]:
                node = node.setdefault(part, {})
            node[current_path[-1]] = {}

        return hierarchy

110 

111 

class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Placeholder implementation: always reports a single "general" topic
        with a fixed coherence score. Could be enhanced with LDA / topic
        coherence metrics.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        analysis: dict[str, Any] = {
            "topics": ["general"],
            "coherence": 0.5,
        }
        return analysis

130 

131 

class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self):
        """Initialize one instance of each extraction component."""
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(self, chunk_content: str, chunk_meta: dict[str, Any]) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Runs every extraction component over the content and merges the
        results into a copy of the existing metadata.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary
        """
        enriched = dict(chunk_meta)
        enriched["cross_references"] = (
            self.cross_reference_extractor.extract_cross_references(chunk_content)
        )
        enriched["entities"] = self.entity_extractor.extract_entities(chunk_content)
        enriched["hierarchy"] = (
            self.hierarchy_extractor.map_hierarchical_relationships(chunk_content)
        )
        enriched["topic_analysis"] = self.topic_analyzer.analyze_topic(chunk_content)
        return enriched

    def extract_hierarchical_metadata(self, chunk_content: str, chunk_meta: dict[str, Any], document_context) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (must expose
                ``id``, ``title`` and ``url`` attributes)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Hoist the chunk_meta lookups used several times below.
        path = chunk_meta.get("path", [])
        title = chunk_meta.get("title", "")
        level = chunk_meta.get("level", 0)

        # JIRA-style relationship metadata.
        metadata["parent_document_id"] = document_context.id
        metadata["parent_document_title"] = document_context.title
        metadata["parent_document_url"] = document_context.url

        # Enhanced hierarchical context.
        metadata["section_breadcrumb"] = " > ".join(path + [title])
        metadata["section_depth"] = len(path) + 1
        metadata["section_anchor"] = self._generate_anchor(title)

        # Content type analysis.
        metadata["content_type_analysis"] = self._analyze_content(chunk_content)

        # Document hierarchy for search filtering.
        metadata["document_hierarchy"] = path + [title]

        # Section type classification.
        metadata["section_type"] = f"h{level}" if level > 0 else "content"
        metadata["section_level"] = level
        metadata["section_title"] = title

        # Excel-specific metadata.
        metadata["is_excel_sheet"] = chunk_meta.get("is_excel_sheet", False)

        # Navigation hints — the caller updates these with sibling info.
        metadata["has_subsections"] = False
        metadata["total_subsections"] = 0

        return metadata

    @staticmethod
    def _analyze_content(chunk_content: str) -> dict[str, Any]:
        """Summarize structural features of the chunk content."""
        words = chunk_content.split()
        paragraphs = [p for p in chunk_content.split('\n\n') if p.strip()]
        return {
            "has_code_blocks": re.search(r"```", chunk_content) is not None,
            "has_tables": re.search(r"\|.*\|", chunk_content) is not None,
            "has_images": re.search(r"!\[.*?\]\(.*?\)", chunk_content) is not None,
            "has_links": re.search(r"\[.*?\]\(.*?\)", chunk_content) is not None,
            "word_count": len(words),
            "char_count": len(chunk_content),
            "estimated_read_time": max(1, len(words) // 200),  # minutes
            "paragraph_count": len(paragraphs),
        }

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string (lowercase, hyphen-separated)
        """
        if not title:
            return ""

        # Lowercase, drop punctuation, then collapse whitespace/hyphen runs.
        lowered = title.lower()
        cleaned = re.sub(r'[^\w\s-]', '', lowered)
        hyphenated = re.sub(r'[-\s]+', '-', cleaned)
        return hyphenated.strip('-')

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Prefers a leading ATX header; falls back to the first sentence
        (truncated to 50 chars), then to a generic default.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        header = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header is not None:
            return header.group(2).strip()

        sentence = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if sentence is None:
            return "Untitled Section"

        title = sentence.group(1).strip()
        return title if len(title) <= 50 else title[:50] + "..."