Coverage for src/qdrant_loader/core/chunking/strategy/markdown/splitters/standard.py: 96%
56 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Standard splitter implementation extracted from `section_splitter`."""
3import re
5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter
class StandardSplitter(BaseSplitter):
    """Standard markdown text splitter that preserves structure."""

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split a large section into smaller chunks while preserving markdown structure.

        The content is first broken into text units: paragraphs (separated by
        blank lines), with any paragraph longer than ``max_size`` further
        broken at sentence-ending punctuation. Units are then packed greedily
        into chunks, joined by a blank line, so that a chunk only exceeds
        ``max_size`` when a single unit is itself longer than ``max_size``.

        When ``self.chunk_overlap`` is non-zero, trailing units of a chunk are
        repeated at the start of the next chunk, limited to
        ``min(chunk_overlap, len(chunk) * max_overlap_percentage)`` characters.
        A chunk is only emitted if it contains at least one unit that no
        earlier chunk contained, so overlap never produces a chunk made purely
        of repeated content.

        Args:
            content: Section content to split
            max_size: Maximum chunk size

        Returns:
            List of content chunks, capped at the configured per-section limit.
            A warning is logged when content had to be dropped because of that
            limit.
        """
        chunking_config = self.settings.global_config.chunking
        markdown_config = chunking_config.strategies.markdown
        max_chunks_per_section = min(
            markdown_config.max_chunks_per_section,
            chunking_config.max_chunks_per_document // 2,
        )

        # Build the list of indivisible text units (all stripped, none empty).
        text_units: list[str] = []
        for para in re.split(r"\n\s*\n", content):
            para = para.strip()
            if not para:
                continue
            if len(para) > max_size:
                # Oversized paragraph: fall back to sentence granularity.
                sentences = re.split(r"(?<=[.!?])\s+", para)
                text_units.extend(s.strip() for s in sentences if s.strip())
            else:
                text_units.append(para)

        chunks: list[str] = []
        total_units = len(text_units)
        start = 0
        # Number of leading units that already appear in some emitted chunk.
        # Invariant: start <= emitted, so start is a valid index in the loop.
        emitted = 0

        while emitted < total_units and len(chunks) < max_chunks_per_section:
            # The first unit is always accepted, even if it alone exceeds
            # max_size; this guarantees forward progress.
            end = start + 1
            chunk_len = len(text_units[start])
            while end < total_units:
                grown_len = chunk_len + 2 + len(text_units[end])  # 2 == len("\n\n")
                if grown_len > max_size:
                    break
                chunk_len = grown_len
                end += 1

            # Skip windows that only repeat overlap units already emitted;
            # they would duplicate content and waste the per-section budget.
            if end > emitted:
                chunks.append("\n\n".join(text_units[start:end]))
                emitted = end

            # Decide how many trailing units to repeat in the next chunk.
            overlap_units = 0
            if self.chunk_overlap != 0:
                max_overlap_chars = int(
                    chunk_len * markdown_config.max_overlap_percentage
                )
                overlap_chars = min(self.chunk_overlap, max_overlap_chars)
                if 0 < overlap_chars < chunk_len:
                    overlap_size = 0
                    for unit in reversed(text_units[start:end]):
                        if overlap_size + len(unit) > overlap_chars:
                            break
                        overlap_size += len(unit)
                        overlap_units += 1

            # Always advance by at least one unit so the loop terminates.
            start += max(1, (end - start) - overlap_units)

        # The loop only stops early when the chunk limit was reached, so any
        # unit not yet emitted here has genuinely been truncated.
        if emitted < total_units:
            # Imported at call time rather than module import time; presumably
            # this avoids a circular import with section_splitter (confirm).
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Section reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": total_units - emitted,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks
# Public API of this module: only the splitter class is exported.
__all__ = ["StandardSplitter"]