Coverage for src/qdrant_loader/core/chunking/strategy/markdown/metadata_extractor.py: 79%

92 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Metadata extraction for markdown chunks.""" 

2 

3import re 

4from typing import Any 

5 

6import structlog 

7 

8logger = structlog.get_logger(__name__) 

9 

10 

class CrossReferenceExtractor:
    """Extracts cross-references from markdown text."""

    # Inline markdown link: [text](url). Non-greedy character classes so
    # several links on the same line are each captured separately.
    _LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]*)\)")

    @staticmethod
    def extract_cross_references(text: str) -> list[dict[str, str]]:
        """Extract cross-references (inline markdown links) from text.

        Uses a regex instead of manual string splitting: the previous
        split-based approach raised IndexError on lines containing a stray
        ``](`` before any ``[`` (e.g. ``"a](b) [c](d)"``) and silently
        dropped all but the first link on a line.

        Args:
            text: Text to analyze

        Returns:
            List of cross-references as ``{"text": ..., "url": ...}`` dicts
        """
        return [
            {"text": label, "url": url}
            for label, url in CrossReferenceExtractor._LINK_RE.findall(text)
        ]

36 

37 

class EntityExtractor:
    """Extracts named entities from markdown text."""

    @staticmethod
    def extract_entities(text: str) -> list[dict[str, str]]:
        """Extract named entities from text.

        Heuristic: runs of consecutive capitalized words are grouped into a
        single candidate entity. Entity type is always "UNKNOWN"; a real NER
        model could refine this later.

        Args:
            text: Text to analyze

        Returns:
            List of entities as ``{"text": ..., "type": ...}`` dicts
        """
        found: list[dict[str, str]] = []
        run: list[str] = []

        def flush() -> None:
            # Close out the current capitalized run, if any.
            if run:
                found.append({"text": " ".join(run), "type": "UNKNOWN"})
                run.clear()

        for token in text.split():
            if token[0].isupper():
                run.append(token)
            else:
                flush()
        flush()  # text may end mid-run

        return found

72 

73 

class HierarchyExtractor:
    """Extracts hierarchical relationships from markdown text."""

    @staticmethod
    def map_hierarchical_relationships(text: str) -> dict[str, Any]:
        """Map hierarchical relationships in text.

        Args:
            text: Text to analyze

        Returns:
            Nested dictionary keyed by heading titles, mirroring the
            document's heading structure.
        """
        hierarchy: dict[str, Any] = {}
        current_path: list[str] = []

        for line in text.split("\n"):
            if not line.startswith("#"):
                continue
            # Count the leading '#' characters directly: the previous
            # token-based approach (len(line.split()[0])) miscounted the
            # level for headings written without a space, e.g. "##Title"
            # came out as level 7 and corrupted the nesting.
            level = len(line) - len(line.lstrip("#"))
            title = line.lstrip("#").strip()
            if not title:
                # A bare "#" line has no usable title; skip it instead of
                # inserting an empty-string key into the hierarchy.
                continue

            # Trim the path back to this heading's parent level.
            while len(current_path) >= level:
                current_path.pop()
            current_path.append(title)

            # Walk/create the nested dicts down to the new node.
            node = hierarchy
            for part in current_path[:-1]:
                node = node.setdefault(part, {})
            node[current_path[-1]] = {}

        return hierarchy

110 

111 

class TopicAnalyzer:
    """Analyzes topics in markdown text."""

    @staticmethod
    def analyze_topic(text: str) -> dict[str, Any]:
        """Analyze topic of text.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with topic analysis results
        """
        # Placeholder analysis: a fixed topic label and a mid-range
        # coherence score until real topic modelling (e.g. LDA plus a
        # coherence metric) is wired in. The input text is accepted for
        # interface compatibility but not yet inspected.
        topics: list[str] = ["general"]
        coherence = 0.5
        return {"topics": topics, "coherence": coherence}

130 

131 

class MetadataExtractor:
    """Main metadata extractor that coordinates all extraction components."""

    def __init__(self, settings=None):
        """Initialize the metadata extractor.

        Args:
            settings: Configuration settings containing markdown strategy
                config (reading speed); optional — defaults apply when None.
        """
        self.settings = settings
        self.cross_reference_extractor = CrossReferenceExtractor()
        self.entity_extractor = EntityExtractor()
        self.hierarchy_extractor = HierarchyExtractor()
        self.topic_analyzer = TopicAnalyzer()

    def extract_all_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any]
    ) -> dict[str, Any]:
        """Extract all metadata for a chunk.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata

        Returns:
            Enhanced metadata dictionary (the input dict is not mutated)
        """
        enriched = dict(chunk_meta)
        enriched.update(
            {
                "cross_references": self.cross_reference_extractor.extract_cross_references(
                    chunk_content
                ),
                "entities": self.entity_extractor.extract_entities(chunk_content),
                "hierarchy": self.hierarchy_extractor.map_hierarchical_relationships(
                    chunk_content
                ),
                "topic_analysis": self.topic_analyzer.analyze_topic(chunk_content),
            }
        )
        return enriched

    def extract_hierarchical_metadata(
        self, chunk_content: str, chunk_meta: dict[str, Any], document_context
    ) -> dict[str, Any]:
        """Extract rich hierarchical metadata following JIRA pattern.

        Args:
            chunk_content: The chunk content
            chunk_meta: Existing chunk metadata
            document_context: Original document for context (provides
                ``id``, ``title`` and ``url`` attributes)

        Returns:
            Enhanced metadata with hierarchical relationships
        """
        metadata = self.extract_all_metadata(chunk_content, chunk_meta)

        # Reading speed comes from configuration when available.
        if self.settings:
            words_per_minute = (
                self.settings.global_config.chunking.strategies.markdown.words_per_minute_reading
            )
        else:
            words_per_minute = 200  # Default fallback

        section_path = chunk_meta.get("path", [])
        section_title = chunk_meta.get("title", "")
        section_level = chunk_meta.get("level", 0)
        word_count = len(chunk_content.split())
        paragraphs = [p for p in chunk_content.split("\n\n") if p.strip()]

        # JIRA-style parent/child relationship metadata.
        metadata["parent_document_id"] = document_context.id
        metadata["parent_document_title"] = document_context.title
        metadata["parent_document_url"] = document_context.url

        # Enhanced hierarchical context.
        metadata["section_breadcrumb"] = " > ".join(section_path + [section_title])
        metadata["section_depth"] = len(section_path) + 1
        metadata["section_anchor"] = self._generate_anchor(section_title)

        # Content type analysis derived from markdown markers.
        metadata["content_type_analysis"] = {
            "has_code_blocks": bool(re.search(r"```", chunk_content)),
            "has_tables": bool(re.search(r"\|.*\|", chunk_content)),
            "has_images": bool(re.search(r"!\[.*?\]\(.*?\)", chunk_content)),
            "has_links": bool(re.search(r"\[.*?\]\(.*?\)", chunk_content)),
            "word_count": word_count,
            "char_count": len(chunk_content),
            # Floor division, clamped to at least one minute.
            "estimated_read_time": max(1, word_count // words_per_minute),
            "paragraph_count": len(paragraphs),
        }

        # Document hierarchy for search filtering.
        metadata["document_hierarchy"] = section_path + [section_title]

        # Section type classification: "hN" for headed sections, "content"
        # for body text without a heading level.
        metadata["section_type"] = (
            f"h{section_level}" if section_level > 0 else "content"
        )
        metadata["section_level"] = section_level
        metadata["section_title"] = section_title

        # Excel-specific metadata.
        metadata["is_excel_sheet"] = chunk_meta.get("is_excel_sheet", False)

        # Navigation hints — the caller fills these in with sibling info.
        metadata["has_subsections"] = False
        metadata["total_subsections"] = 0

        return metadata

    def _generate_anchor(self, title: str) -> str:
        """Generate URL anchor from section title.

        Args:
            title: Section title

        Returns:
            URL-safe anchor string
        """
        if not title:
            return ""

        # Lowercase, drop punctuation, then collapse whitespace/hyphen
        # runs into single hyphens.
        cleaned = re.sub(r"[^\w\s-]", "", title.lower())
        hyphenated = re.sub(r"[-\s]+", "-", cleaned)
        return hyphenated.strip("-")

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        Args:
            chunk: The text chunk

        Returns:
            Section title or default title
        """
        # Prefer an ATX header at the very start of the chunk.
        header = re.match(r"^(#{1,6})\s+(.*?)(?:\n|$)", chunk)
        if header is not None:
            return header.group(2).strip()

        # Fall back to the first sentence, truncated if over-long.
        sentence = re.match(r"^([^\.!?]+[\.!?])", chunk)
        if sentence is not None:
            candidate = sentence.group(1).strip()
            return candidate if len(candidate) <= 50 else candidate[:50] + "..."

        return "Untitled Section"