Coverage for src/qdrant_loader/core/chunking/strategy/base/section_splitter.py: 92%
71 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-11 07:21 +0000
1"""Base class for section splitting strategies."""
3from abc import ABC, abstractmethod
4from typing import TYPE_CHECKING, Any, Optional
6if TYPE_CHECKING:
7 from qdrant_loader.config import Settings
8 from qdrant_loader.core.document import Document
class BaseSectionSplitter(ABC):
    """Base class for section splitting strategies.

    This class defines the interface for splitting document content into
    sections based on different strategies (size-based, semantic, hybrid,
    etc.). Each strategy implements its own ``split_sections`` while reusing
    the size/boundary helpers defined here.
    """

    # Characters treated as the end of a sentence when they are followed by
    # a space or a newline.
    _SENTENCE_ENDINGS = (".", "!", "?")
    _SENTENCE_FOLLOWERS = (" ", "\n")

    def __init__(self, settings: "Settings"):
        """Initialize the section splitter.

        Args:
            settings: Configuration settings containing chunking parameters
                (read from ``settings.global_config.chunking``).
        """
        self.settings = settings
        chunking = settings.global_config.chunking
        self.chunk_size = chunking.chunk_size
        self.chunk_overlap = chunking.chunk_overlap
        self.max_chunks_per_document = chunking.max_chunks_per_document

    @abstractmethod
    def split_sections(
        self, content: str, document: Optional["Document"] = None
    ) -> list[dict[str, Any]]:
        """Split content into sections based on strategy-specific rules.

        Args:
            content: The content to split into sections
            document: Optional document object for additional context

        Returns:
            List of dictionaries containing section content and metadata.
            Each dictionary should have at least:
                - "content": The section content
                - "index": Section index
                - Additional strategy-specific metadata

        Raises:
            NotImplementedError: If the splitter doesn't implement this method
        """
        raise NotImplementedError(
            "Section splitter must implement split_sections method"
        )

    def validate_section_size(self, content: str) -> bool:
        """Validate that section content is within acceptable size limits.

        Args:
            content: The section content to validate

        Returns:
            True if the section is at most twice ``chunk_size`` characters
            long, False otherwise
        """
        # Sections are allowed to grow up to 2x the configured chunk size.
        return len(content) <= self.chunk_size * 2

    def calculate_split_points(self, content: str, target_size: int) -> list[int]:
        """Calculate split points for content.

        Default implementation that prefers natural boundaries (paragraphs,
        sentences, lines, words) near each multiple of ``target_size`` and
        falls back to a hard cut at the ideal position when the search window
        contains no boundary. Can be overridden by specific splitters.

        Args:
            content: The content to split
            target_size: Target size for each split

        Returns:
            Strictly increasing list of character positions where content
            should be split; the last entry is always ``len(content)``
        """
        content_length = len(content)
        if content_length <= target_size:
            return [content_length]

        split_points: list[int] = []
        current_pos = 0

        while current_pos < content_length:
            ideal_end = min(current_pos + target_size, content_length)
            if ideal_end >= content_length:
                split_points.append(content_length)
                break

            # Search from the middle of the segment up to a little past the
            # ideal end so a nearby boundary can be preferred over a hard cut.
            search_start = max(current_pos + target_size // 2, current_pos + 1)
            search_end = min(ideal_end + target_size // 4, content_length)
            if search_start >= search_end:
                search_start = current_pos + 1
                search_end = ideal_end

            boundary_pos = self._find_natural_boundary(
                content, search_start, search_end
            )

            # _find_natural_boundary returns ``search_start`` itself when it
            # found nothing; any real boundary is strictly greater.
            if boundary_pos > search_start:
                next_pos = boundary_pos
            else:
                # No boundary: cut at the ideal position, always advancing by
                # at least one character so the loop terminates even for a
                # non-positive target_size.
                next_pos = max(ideal_end, current_pos + 1)

            split_points.append(next_pos)
            current_pos = next_pos

        return split_points

    def _find_natural_boundary(self, content: str, start: int, end: int) -> int:
        """Find a natural boundary for splitting within a range.

        Boundaries are tried in order of preference: paragraph break (double
        newline), sentence ending followed by a space/newline, line break,
        then word boundary (space). Within each category the boundary closest
        to ``end`` wins.

        Args:
            content: The content to search
            start: Start position to search from
            end: End position to search to (exclusive)

        Returns:
            Position just after the best boundary, always in
            ``(start, end]``, or ``start`` unchanged if none was found
        """
        # Clamp the window so out-of-range arguments cannot index outside
        # the string or wrap around via negative indices.
        lower = max(start, 0)
        upper = min(end, len(content))

        # Paragraph break: the whole "\n\n" must lie inside the window so
        # the returned position never exceeds ``end``.
        found = content.rfind("\n\n", lower, upper)
        if found != -1:
            return found + 2

        # Sentence ending followed by whitespace.
        for i in range(upper - 1, lower - 1, -1):
            if (
                content[i] in self._SENTENCE_ENDINGS
                and i + 1 < len(content)
                and content[i + 1] in self._SENTENCE_FOLLOWERS
            ):
                return i + 1

        # Line break, then word boundary.
        for separator in ("\n", " "):
            found = content.rfind(separator, lower, upper)
            if found != -1:
                return found + 1

        return start

    def create_section_metadata(
        self, content: str, index: int, section_type: str = "content"
    ) -> dict[str, Any]:
        """Create basic metadata for a section.

        Args:
            content: The section content
            index: Section index
            section_type: Type of section (content, header, code, etc.)

        Returns:
            Dictionary with the content, its index and type, its character
            length, its whitespace-separated word count, and its line count
        """
        return {
            "content": content,
            "index": index,
            "section_type": section_type,
            "length": len(content),
            "word_count": len(content.split()),
            "line_count": len(content.split("\n")),
        }

    def split_content_by_size(self, content: str, max_size: int) -> list[str]:
        """Split content into chunks based on size with overlap.

        Utility for fallback or hybrid splitting strategies. Each chunk is at
        most ``max_size`` characters and, where possible, ends on a natural
        boundary found in its last quarter. Consecutive chunks overlap by
        ``self.chunk_overlap`` characters, and every character of ``content``
        appears in at least one chunk.

        Args:
            content: Content to split
            max_size: Maximum size for each chunk (expected to be positive)

        Returns:
            List of content chunks
        """
        content_length = len(content)
        if content_length <= max_size:
            return [content]

        # A negative overlap would skip text between chunks.
        overlap = max(self.chunk_overlap, 0)
        chunks: list[str] = []
        start = 0

        while start < content_length:
            end = min(start + max_size, content_length)

            if end < content_length:
                search_start = max(start, end - max_size // 4)
                boundary_pos = self._find_natural_boundary(
                    content, search_start, end
                )
                # Equal to search_start means "no boundary found": keep the
                # full-size chunk instead of truncating it.
                if boundary_pos > search_start:
                    end = boundary_pos

            chunks.append(content[start:end])

            if end >= content_length:
                break

            # Step back from the actual chunk end (not the nominal one) so no
            # text is dropped when the chunk was shortened to a boundary, and
            # always move forward by at least one character.
            start = max(start + 1, end - overlap)

        return chunks