Coverage for src/qdrant_loader/core/chunking/strategy/base/section_splitter.py

1"""Base class for section splitting strategies."""

3from abc import ABC, abstractmethod

4from typing import TYPE_CHECKING, Any, Optional

6if TYPE_CHECKING:

7 from qdrant_loader.config import Settings

8 from qdrant_loader.core.document import Document

class BaseSectionSplitter(ABC):
    """Base class for section splitting strategies.

    This class defines the interface for splitting document content into
    sections based on different strategies (size-based, semantic, hybrid,
    etc.). Each strategy implements its own splitting logic while reusing the
    size/boundary helpers provided here.
    """

    def __init__(self, settings: "Settings"):
        """Initialize the section splitter.

        Args:
            settings: Configuration settings containing chunking parameters
        """
        self.settings = settings
        self.chunk_size = settings.global_config.chunking.chunk_size
        self.chunk_overlap = settings.global_config.chunking.chunk_overlap
        self.max_chunks_per_document = (
            settings.global_config.chunking.max_chunks_per_document
        )

    @abstractmethod
    def split_sections(
        self, content: str, document: Optional["Document"] = None
    ) -> list[dict[str, Any]]:
        """Split content into sections based on strategy-specific rules.

        Implementations should split the content into meaningful sections
        while preserving semantic structure and adding relevant metadata to
        each section.

        Args:
            content: The content to split into sections
            document: Optional document object for additional context

        Returns:
            List of dictionaries containing section content and metadata.
            Each dictionary should have at least:
                - "content": The section content
                - "index": Section index
                - Additional strategy-specific metadata

        Raises:
            NotImplementedError: If the splitter doesn't implement this method
        """
        raise NotImplementedError(
            "Section splitter must implement split_sections method"
        )

    def validate_section_size(self, content: str) -> bool:
        """Validate that section content is within acceptable size limits.

        Args:
            content: The section content to validate

        Returns:
            True if the section is at most twice ``self.chunk_size`` characters
            long, False otherwise
        """
        # Allow up to 2x chunk size for sections
        return len(content) <= self.chunk_size * 2

    def calculate_split_points(self, content: str, target_size: int) -> list[int]:
        """Calculate split points for content.

        This default implementation prefers natural boundaries (paragraphs,
        sentences, line breaks, spaces) located between the middle of the
        target window and a quarter window past its end. When no such
        boundary exists the split falls back to exactly ``target_size``
        characters after the previous split. Can be overridden by specific
        splitters for more sophisticated splitting.

        Args:
            content: The content to split
            target_size: Target size for each split; values below 1 are
                treated as 1 so the scan always advances

        Returns:
            Ascending list of character positions where content should be
            split; the last entry is ``len(content)`` for non-empty results
        """
        content_len = len(content)
        if content_len <= target_size:
            return [content_len]

        # A non-positive target would otherwise stall the loop below.
        target_size = max(1, target_size)

        split_points: list[int] = []
        current_pos = 0

        while current_pos < content_len:
            ideal_end = min(current_pos + target_size, content_len)
            if ideal_end >= content_len:
                split_points.append(content_len)
                break

            # Search from mid-window up to 25% past the ideal end.
            search_start = max(current_pos + target_size // 2, current_pos + 1)
            search_end = min(ideal_end + target_size // 4, content_len)

            # Ensure search range is valid
            if search_start >= search_end:
                search_start = current_pos + 1
                search_end = ideal_end

            boundary_pos = self._find_natural_boundary(
                content, search_start, search_end
            )

            # _find_natural_boundary returns its ``start`` argument when nothing
            # was found; a real boundary is always strictly greater than it.
            if boundary_pos > search_start:
                next_pos = boundary_pos
            else:
                # Fallback to ideal position if no good boundary found
                next_pos = ideal_end

            split_points.append(next_pos)
            current_pos = next_pos

        return split_points

    def _find_natural_boundary(self, content: str, start: int, end: int) -> int:
        """Find a natural boundary for splitting within a range.

        Candidates are tried in priority order: paragraph break (double
        newline), sentence ending followed by a space or newline, line break,
        then plain space. Within each category the candidate closest to
        ``end`` wins.

        Args:
            content: The content to search
            start: Start position to search from
            end: End position to search to

        Returns:
            Position just after the best boundary, always in
            ``(start, end]``, or ``start`` itself if none was found
        """
        # Clamp the scan window to the string; ``start`` is still what gets
        # returned as the "not found" marker so callers can compare against it.
        lo = max(start, 0)
        hi = min(end, len(content))

        # Paragraph breaks first. rfind only matches a "\n\n" lying entirely
        # inside [lo, hi), so the returned split never exceeds ``end``.
        paragraph_at = content.rfind("\n\n", lo, hi)
        if paragraph_at != -1:
            return paragraph_at + 2

        # Sentence endings: punctuation immediately followed by whitespace.
        sentence_endings = (".", "!", "?")
        followers = (" ", "\n")
        for i in range(hi - 1, lo - 1, -1):
            if (
                content[i] in sentence_endings
                and i + 1 < len(content)
                and content[i + 1] in followers
            ):
                return i + 1

        # Line breaks, then word boundaries (spaces).
        for separator in ("\n", " "):
            found_at = content.rfind(separator, lo, hi)
            if found_at != -1:
                return found_at + 1

        return start

    def create_section_metadata(
        self, content: str, index: int, section_type: str = "content"
    ) -> dict[str, Any]:
        """Create basic metadata for a section.

        Args:
            content: The section content
            index: Section index
            section_type: Type of section (content, header, code, etc.)

        Returns:
            Dictionary containing the content, its index and type, plus
            character, whitespace-separated word, and newline-separated line
            counts
        """
        return {
            "content": content,
            "index": index,
            "section_type": section_type,
            "length": len(content),
            "word_count": len(content.split()),
            "line_count": len(content.split("\n")),
        }

    def split_content_by_size(self, content: str, max_size: int) -> list[str]:
        """Split content into chunks based on size with overlap.

        This is a utility method that can be used by different splitters
        for fallback or hybrid splitting strategies. Each chunk ends at a
        natural boundary when one exists in the last quarter of the window,
        otherwise at exactly ``max_size`` characters. Consecutive chunks
        overlap by up to ``self.chunk_overlap`` characters and never leave a
        gap between them.

        Args:
            content: Content to split
            max_size: Maximum size for each chunk; values below 1 are treated
                as 1 so the scan always advances

        Returns:
            List of content chunks
        """
        content_len = len(content)
        if content_len <= max_size:
            return [content]

        # A non-positive size would otherwise produce empty chunks.
        max_size = max(1, max_size)
        advance = max(1, max_size - self.chunk_overlap)

        chunks: list[str] = []
        start = 0

        while start < content_len:
            end = min(start + max_size, content_len)

            # Find a good boundary if not at the end
            if end < content_len:
                search_start = end - max_size // 4
                boundary_pos = self._find_natural_boundary(
                    content, search_start, end
                )
                # A result equal to search_start means "no boundary found";
                # keep the full-size window in that case.
                if boundary_pos > search_start:
                    end = boundary_pos

            chunks.append(content[start:end])

            if end >= content_len:
                break

            # Step forward with overlap, but never past the end of the chunk
            # just emitted, otherwise the text in between would be dropped.
            start = min(start + advance, end)

        return chunks

Coverage for src/qdrant_loader/core/chunking/strategy/base/section_splitter.py: 92%

71 statements