Coverage for src/qdrant_loader/core/chunking/strategy/base/section_splitter.py: 92%

71 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-13 09:19 +0000

1"""Base class for section splitting strategies.""" 

2 

3from abc import ABC, abstractmethod 

4from typing import TYPE_CHECKING, Any, Optional 

5 

6if TYPE_CHECKING: 

7 from qdrant_loader.config import Settings 

8 from qdrant_loader.core.document import Document 

9 

10 

class BaseSectionSplitter(ABC):
    """Base class for section splitting strategies.

    Defines the interface for splitting document content into sections
    (size-based, semantic, hybrid, etc.) and provides size-based helper
    utilities that concrete splitters can reuse or override.
    """

    def __init__(self, settings: "Settings"):
        """Initialize the section splitter.

        Args:
            settings: Configuration settings; the chunking parameters are read
                from ``settings.global_config.chunking``.
        """
        self.settings = settings
        chunking = settings.global_config.chunking
        self.chunk_size = chunking.chunk_size
        self.chunk_overlap = chunking.chunk_overlap
        self.max_chunks_per_document = chunking.max_chunks_per_document

    @abstractmethod
    def split_sections(
        self, content: str, document: Optional["Document"] = None
    ) -> list[dict[str, Any]]:
        """Split content into sections based on strategy-specific rules.

        Args:
            content: The content to split into sections
            document: Optional document object for additional context

        Returns:
            List of dictionaries containing section content and metadata.
            Each dictionary should have at least:
                - "content": The section content
                - "index": Section index
                - Additional strategy-specific metadata

        Raises:
            NotImplementedError: If the splitter doesn't implement this method
        """
        raise NotImplementedError(
            "Section splitter must implement split_sections method"
        )

    def validate_section_size(self, content: str) -> bool:
        """Check that section content is within the acceptable size limit.

        Args:
            content: The section content to validate

        Returns:
            True if the content length is at most twice ``self.chunk_size``,
            False otherwise
        """
        # Sections are allowed to grow up to 2x the configured chunk size.
        return len(content) <= self.chunk_size * 2

    def calculate_split_points(self, content: str, target_size: int) -> list[int]:
        """Calculate split points for content, preferring natural boundaries.

        For each split, a natural boundary (paragraph, sentence, line, word)
        is searched between the midpoint of the target window and 25% past its
        end. When no boundary exists in that range the split falls back to the
        ideal position ``current_pos + target_size``.

        Args:
            content: The content to split
            target_size: Target size for each split (must be positive)

        Returns:
            Strictly increasing list of character positions where content
            should be split; the last entry is ``len(content)``

        Raises:
            ValueError: If ``target_size`` is not positive
        """
        if target_size <= 0:
            raise ValueError("target_size must be a positive integer")

        content_len = len(content)
        if content_len <= target_size:
            return [content_len]

        split_points: list[int] = []
        current_pos = 0

        while current_pos < content_len:
            ideal_end = min(current_pos + target_size, content_len)

            if ideal_end >= content_len:
                # Remaining content fits in one piece.
                split_points.append(content_len)
                break

            # Search from the middle of the window to slightly past its end.
            search_start = max(current_pos + target_size // 2, current_pos + 1)
            search_end = min(ideal_end + target_size // 4, content_len)

            # Degenerate window (tiny target sizes): search the whole piece.
            if search_start >= search_end:
                search_start = current_pos + 1
                search_end = ideal_end

            boundary_pos = self._find_natural_boundary(
                content, search_start, search_end
            )

            # A real boundary is always strictly greater than search_start;
            # a return value equal to search_start means "nothing found".
            if boundary_pos > search_start:
                next_pos = boundary_pos
            else:
                next_pos = ideal_end

            split_points.append(next_pos)
            current_pos = next_pos

        return split_points

    def _find_natural_boundary(self, content: str, start: int, end: int) -> int:
        """Find a natural boundary for splitting within a range.

        Boundaries are tried in priority order: paragraph break (double
        newline), sentence ending followed by a space or newline, line break,
        then space. Within one category the match closest to ``end`` wins.

        Args:
            content: The content to search
            start: Start position to search from (inclusive)
            end: End position to search to (exclusive)

        Returns:
            Position just after the best boundary, or ``start`` if none is
            found. A found boundary is always greater than ``start``; a
            paragraph break that begins at ``end - 1`` yields ``end + 1``.
        """
        content_len = len(content)
        # Clamp so out-of-range arguments cannot wrap around or raise.
        lo = max(start, 0)
        hi = min(end, content_len)

        # Paragraph breaks first; the break may begin at hi - 1, hence hi + 1.
        para_idx = content.rfind("\n\n", lo, hi + 1)
        if para_idx != -1:
            return para_idx + 2

        # Sentence endings that are followed by whitespace.
        sentence_endings = (".", "!", "?")
        for idx in range(hi - 1, lo - 1, -1):
            if (
                content[idx] in sentence_endings
                and idx + 1 < content_len
                and content[idx + 1] in (" ", "\n")
            ):
                return idx + 1

        # Line breaks.
        newline_idx = content.rfind("\n", lo, hi)
        if newline_idx != -1:
            return newline_idx + 1

        # Word boundaries (spaces).
        space_idx = content.rfind(" ", lo, hi)
        if space_idx != -1:
            return space_idx + 1

        return start

    def create_section_metadata(
        self, content: str, index: int, section_type: str = "content"
    ) -> dict[str, Any]:
        """Create basic metadata for a section.

        Args:
            content: The section content
            index: Section index
            section_type: Type of section (content, header, code, etc.)

        Returns:
            Dictionary with the keys "content", "index", "section_type",
            "length", "word_count" and "line_count"
        """
        return {
            "content": content,
            "index": index,
            "section_type": section_type,
            "length": len(content),
            "word_count": len(content.split()),
            "line_count": len(content.split("\n")),
        }

    def split_content_by_size(self, content: str, max_size: int) -> list[str]:
        """Split content into chunks based on size with overlap.

        Each chunk is at most ``max_size`` characters. A chunk end is moved
        back to a natural boundary when one exists in its last quarter.
        The next chunk starts ``self.chunk_overlap`` characters before the end
        of the previous one, so no content is dropped between chunks.

        Args:
            content: Content to split
            max_size: Maximum size for each chunk (must be positive)

        Returns:
            List of content chunks

        Raises:
            ValueError: If ``max_size`` is not positive
        """
        if max_size <= 0:
            raise ValueError("max_size must be a positive integer")

        content_len = len(content)
        if content_len <= max_size:
            return [content]

        # A negative overlap would skip content; treat it as no overlap.
        overlap = max(0, self.chunk_overlap)
        chunks: list[str] = []
        start = 0

        while start < content_len:
            end = min(start + max_size, content_len)

            if end < content_len:
                search_start = end - max_size // 4
                boundary_pos = self._find_natural_boundary(
                    content, search_start, end
                )
                # Equal to search_start means no boundary was found; keep the
                # hard cut. Cap at end because a straddling paragraph break
                # can report end + 1.
                if boundary_pos > search_start:
                    end = min(boundary_pos, end)

            chunks.append(content[start:end])

            if end >= content_len:
                break

            # Step back from the actual chunk end, always advancing by >= 1.
            start = max(start + 1, end - overlap)

        return chunks