Coverage for src/qdrant_loader/core/chunking/strategy/markdown/splitters/standard.py: 96%

56 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Standard splitter implementation extracted from `section_splitter`.""" 

2 

3import re 

4 

5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter 

6 

7 

class StandardSplitter(BaseSplitter):
    """Standard markdown text splitter that preserves structure.

    Content is first broken into "text units" (whole paragraphs, or the
    sentences of a paragraph that is longer than the size limit). Units are
    then packed greedily into chunks, and the trailing units of one chunk may
    be repeated at the start of the next one as overlap.

    The splitter reads ``self.settings`` and ``self.chunk_overlap``; both are
    presumably provided by ``BaseSplitter`` (not visible here - confirm).
    """

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split a large section into smaller chunks while preserving markdown structure.

        Args:
            content: Section content to split
            max_size: Maximum chunk size in characters. Units are never cut,
                so a single unit that is itself longer than ``max_size``
                (for example one very long sentence) becomes its own
                oversized chunk.

        Returns:
            List of content chunks. At most ``max_chunks_per_section`` chunks
            are produced; any content beyond that limit is dropped and a
            warning is logged.
        """
        chunking_config = self.settings.global_config.chunking
        markdown_config = chunking_config.strategies.markdown
        # One section may consume at most half of the per-document chunk budget.
        max_chunks_per_section = min(
            markdown_config.max_chunks_per_section,
            chunking_config.max_chunks_per_document // 2,
        )

        text_units = self._split_into_text_units(content, max_size)
        total_units = len(text_units)
        chunks: list[str] = []

        start = 0
        while start < total_units and len(chunks) < max_chunks_per_section:
            # Greedy packing: the first unit is always accepted (even when it
            # is oversized); further units are added while the chunk,
            # including the two-character "\n\n" separator, fits in max_size.
            end = start + 1
            chunk_length = len(text_units[start])
            while end < total_units:
                projected_length = chunk_length + 2 + len(text_units[end])
                if projected_length > max_size:
                    break
                chunk_length = projected_length
                end += 1

            current_chunk = "\n\n".join(text_units[start:end])
            chunks.append(current_chunk)

            if end >= total_units:
                # This chunk already contains the last unit. Stepping back for
                # overlap here would only emit trailing chunks made entirely
                # of text that was just emitted, waste the per-section budget
                # and could trigger a bogus truncation warning below.
                start = total_units
                break

            units_in_chunk = end - start
            overlap_units = 0
            if self.chunk_overlap != 0:
                # The overlap is capped at a configured fraction of the chunk
                # that was just produced.
                max_overlap_chars = int(
                    len(current_chunk) * markdown_config.max_overlap_percentage
                )
                overlap_chars = min(self.chunk_overlap, max_overlap_chars)
                if 0 < overlap_chars < len(current_chunk):
                    overlap_units = self._count_overlap_units(
                        text_units[start:end], overlap_chars
                    )

            # Always move forward by at least one unit so the loop terminates.
            start += max(1, units_in_chunk - overlap_units)

        if start < total_units and len(chunks) >= max_chunks_per_section:
            # The logger is taken from the section_splitter module this class
            # was extracted from; the import is local, presumably to avoid a
            # circular import at module load time (confirm before moving it).
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Section reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": total_units - start,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks

    @staticmethod
    def _split_into_text_units(content: str, max_size: int) -> list[str]:
        """Break content into stripped, non-empty paragraphs or sentences."""
        text_units: list[str] = []
        # Paragraphs are separated by one or more blank lines.
        for paragraph in re.split(r"\n\s*\n", content):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            if len(paragraph) <= max_size:
                text_units.append(paragraph)
                continue
            # Oversized paragraph: fall back to sentence granularity, splitting
            # on whitespace that follows '.', '!' or '?'.
            sentences = re.split(r"(?<=[.!?])\s+", paragraph)
            text_units.extend(filter(None, map(str.strip, sentences)))
        return text_units

    @staticmethod
    def _count_overlap_units(units: list[str], overlap_chars: int) -> int:
        """Count how many trailing units fit within ``overlap_chars`` characters."""
        overlap_units = 0
        overlap_size = 0
        for unit in reversed(units):
            overlap_size += len(unit)
            if overlap_size > overlap_chars:
                break
            overlap_units += 1
        return overlap_units

107 

108 

# Names exported by a star-import of this module.
__all__ = ["StandardSplitter"]