Coverage for src/qdrant_loader/core/chunking/strategy/markdown/document_parser.py: 96%
113 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
1"""Document parsing for markdown chunking strategy."""
3import re
4from dataclasses import dataclass, field
5from enum import Enum
6from typing import Any, Optional
8import structlog
10logger = structlog.get_logger(__name__)
class SectionType(Enum):
    """Kinds of section a markdown document can be split into.

    Every member's value is the lowercase string label for that kind.
    """

    # Structural kinds.
    HEADER = "header"
    CODE_BLOCK = "code_block"
    LIST = "list"
    TABLE = "table"
    QUOTE = "quote"

    # Default kind for a section.
    PARAGRAPH = "paragraph"
@dataclass
class Section:
    """A node in the section tree of a markdown document.

    Attributes:
        content: Raw text of the section.
        level: Level of the section; defaults to 0.
        type: Kind of section; defaults to ``SectionType.PARAGRAPH``.
        parent: The section this one was attached to through ``add_child``,
            or ``None`` when it has not been attached to anything.
        children: Sections attached to this one, in the order they were added.
    """

    content: str
    level: int = 0
    type: SectionType = SectionType.PARAGRAPH
    # The back-reference is excluded from the generated __eq__ and __repr__.
    # If it took part in comparison, comparing two distinct but equal-looking
    # trees would bounce parent -> children -> parent until RecursionError;
    # in the repr it would drag the whole parent subtree into every child.
    parent: Optional["Section"] = field(default=None, compare=False, repr=False)
    children: list["Section"] = field(default_factory=list)

    def add_child(self, child: "Section") -> None:
        """Attach *child* beneath this section.

        The child is appended to ``children`` and its ``parent`` is pointed
        back at this section.

        Args:
            child: The section to attach.
        """
        self.children.append(child)
        child.parent = self
class SectionIdentifier:
    """Identifies section types based on how the content begins."""

    # Tried in order against the start of the content; the first hit wins.
    _LEADING_PATTERNS = (
        (re.compile(r"#{1,6}\s+"), SectionType.HEADER),
        (re.compile(r"```"), SectionType.CODE_BLOCK),
        # Bullet markers are "*", "-" or "+"; ordered items are digits
        # followed by "." or ")". Either must be followed by whitespace.
        (re.compile(r"(?:[*+-]|\d+[.)])\s+"), SectionType.LIST),
        (re.compile(r"\|"), SectionType.TABLE),
        (re.compile(r">"), SectionType.QUOTE),
    )

    @staticmethod
    def identify_section_type(content: str) -> SectionType:
        """Identify the type of a section from the start of its content.

        Only the very beginning of ``content`` is inspected; leading
        whitespace is not skipped.

        Args:
            content: The section content to analyze

        Returns:
            The ``SectionType`` of the first matching pattern, or
            ``SectionType.PARAGRAPH`` when none matches.
        """
        for pattern, section_type in SectionIdentifier._LEADING_PATTERNS:
            if pattern.match(content):
                return section_type
        return SectionType.PARAGRAPH
class HierarchyBuilder:
    """Builds hierarchical section relationships."""

    # A header at the very start of the content: "#" markers, whitespace,
    # then the title running to the end of that line.
    _HEADER_RE = re.compile(r"^(#+)\s+(.*?)(?:\n|$)")

    @staticmethod
    def _header_title(content: str) -> Optional[str]:
        """Return the stripped title if *content* starts with a header, else None."""
        match = HierarchyBuilder._HEADER_RE.match(content)
        return match.group(2).strip() if match else None

    @staticmethod
    def build_section_breadcrumb(section: "Section") -> str:
        """Build a breadcrumb path of section titles to capture hierarchy.

        The section and each of its ancestors contribute a title only when
        their content starts with a header; others are skipped.

        Args:
            section: The section to build breadcrumb for

        Returns:
            Titles from the outermost ancestor down to the section itself,
            joined with " > ". Empty string when no title was found.
        """
        titles = []
        node = section
        # Collected nearest-first (section, parent, grandparent, ...) and
        # flipped afterwards so the outermost ancestor comes first.
        while node is not None:
            title = HierarchyBuilder._header_title(node.content)
            if title is not None:
                titles.append(title)
            node = node.parent
        titles.reverse()
        return " > ".join(titles)

    @staticmethod
    def get_section_path(
        header_item: dict[str, Any], structure: list[dict[str, Any]]
    ) -> list[str]:
        """Get the path of parent headers for a section.

        Args:
            header_item: The header item; must provide a "level" key
            structure: The document structure containing ``header_item``

        Returns:
            List of parent section titles, outermost first

        Raises:
            ValueError: If ``header_item`` is not found in ``structure``.
        """
        # Locate the item by identity first. Dict equality cannot tell two
        # headers with the same text and level apart, so an equality search
        # would resolve a repeated heading to its first occurrence and
        # report the wrong parents.
        position = next(
            (i for i, item in enumerate(structure) if item is header_item),
            None,
        )
        if position is None:
            # An equal copy is still accepted; raises ValueError if absent.
            position = structure.index(header_item)

        path = []
        current_level = header_item["level"]
        # Walk backwards; each header with a strictly lower level than the
        # last one accepted is the next ancestor up.
        for item in reversed(structure[:position]):
            if item["type"] == "header" and item["level"] < current_level:
                path.append(item["title"])
                current_level = item["level"]

        path.reverse()
        return path
class DocumentParser:
    """Parses markdown documents into structured representations."""

    # A single line that is a header: 1-6 "#" markers, whitespace, title.
    _HEADER_LINE_RE = re.compile(r"^(#{1,6})\s+(.*?)$")
    # A header at the very start of a multi-line chunk.
    _LEADING_HEADER_RE = re.compile(r"^(#{1,6})\s+(.*?)(?:\n|$)")
    # Header at the start of a parent section's content (any number of "#").
    _PARENT_HEADER_RE = re.compile(r"^(#+)\s+(.*?)(?:\n|$)")
    # Everything up to and including the first ".", "!" or "?".
    _FIRST_SENTENCE_RE = re.compile(r"^([^\.!?]+[\.!?])")
    _IMAGE_RE = re.compile(r"!\[.*?\]\(.*?\)")
    # The lookbehind rejects a "[" that directly follows "!", so image
    # syntax alone does not count as a link.
    _LINK_RE = re.compile(r"(?<!!)\[.*?\]\(.*?\)")
    # Sentence-derived titles longer than this are cut and suffixed "...".
    _MAX_TITLE_LENGTH = 50

    def __init__(self):
        """Initialize the document parser with its helper objects."""
        self.section_identifier = SectionIdentifier()
        self.hierarchy_builder = HierarchyBuilder()

    def parse_document_structure(self, text: str) -> list[dict[str, Any]]:
        """Parse document into a structured representation.

        The text is split on newlines. Header lines outside fenced code
        become "header" elements; every run of other lines between headers
        becomes one "content" element. Fence lines (starting with three
        backticks) and everything between them stay in the content run.

        Args:
            text: The document text

        Returns:
            List of dictionaries in document order. Content elements carry
            "type", "text" and "level" (always 0); header elements carry
            "type", "text", "level" (number of "#" markers) and "title".
        """
        elements: list[dict[str, Any]] = []
        pending: list[str] = []
        in_code_block = False

        def flush_pending() -> None:
            # Emit the accumulated non-header lines as one content element.
            if pending:
                elements.append(
                    {"type": "content", "text": "\n".join(pending), "level": 0}
                )
                pending.clear()

        for line in text.split("\n"):
            if line.startswith("```"):
                # A fence line toggles code mode and belongs to the content.
                in_code_block = not in_code_block
                pending.append(line)
                continue

            # Inside a fence nothing is treated as a header.
            header_match = (
                None if in_code_block else self._HEADER_LINE_RE.match(line)
            )
            if header_match is None:
                pending.append(line)
                continue

            flush_pending()
            elements.append(
                {
                    "type": "header",
                    "text": line,
                    "level": len(header_match.group(1)),
                    "title": header_match.group(2).strip(),
                }
            )

        flush_pending()
        return elements

    def extract_section_metadata(self, section: "Section") -> dict[str, Any]:
        """Extract metadata from a section.

        Args:
            section: The section to analyze

        Returns:
            Dictionary with "type", "level", "word_count", "char_count",
            "has_code", "has_links", "has_images" and "is_top_level"
            (level <= 2). For a section with a parent it may also hold
            "parent_title" and "parent_level" (when the parent's content
            starts with a header) and "breadcrumb" (when non-empty).
        """
        content = section.content
        metadata: dict[str, Any] = {
            "type": section.type.value,
            "level": section.level,
            "word_count": len(content.split()),
            "char_count": len(content),
            "has_code": "```" in content,
            "has_links": self._LINK_RE.search(content) is not None,
            "has_images": self._IMAGE_RE.search(content) is not None,
            "is_top_level": section.level <= 2,  # Mark top-level sections
        }

        parent = section.parent
        if parent is not None:
            parent_match = self._PARENT_HEADER_RE.match(parent.content)
            if parent_match:
                metadata["parent_title"] = parent_match.group(2).strip()
                metadata["parent_level"] = parent.level

            # NOTE(review): breadcrumb is only attached to sections that
            # have a parent - confirm this nesting is the intended one.
            breadcrumb = self.hierarchy_builder.build_section_breadcrumb(section)
            if breadcrumb:
                metadata["breadcrumb"] = breadcrumb

        return metadata

    def extract_section_title(self, chunk: str) -> str:
        """Extract section title from a chunk.

        A header at the very start of the chunk wins. Otherwise the text up
        to the first ".", "!" or "?" is used, truncated to 50 characters
        plus "..." when longer.

        Args:
            chunk: The text chunk

        Returns:
            Section title, or "Untitled Section" when neither a leading
            header nor a terminated first sentence is found
        """
        header_match = self._LEADING_HEADER_RE.match(chunk)
        if header_match:
            return header_match.group(2).strip()

        sentence_match = self._FIRST_SENTENCE_RE.match(chunk)
        if sentence_match is None:
            return "Untitled Section"

        title = sentence_match.group(1).strip()
        if len(title) > self._MAX_TITLE_LENGTH:
            title = title[: self._MAX_TITLE_LENGTH] + "..."
        return title