Coverage for src / qdrant_loader / core / chunking / strategy / markdown / splitters / excel.py: 95%
85 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
1"""Excel splitter implementation extracted from `section_splitter`."""
3import re
5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter
class ExcelSplitter(BaseSplitter):
    """Excel-specific splitter that preserves table structure.

    Sheet content is first grouped into "logical units" (runs of markdown
    table lines, or runs of non-table text), oversized units are broken up
    on line boundaries, and the units are then packed into chunks no larger
    than ``max_size`` where possible.
    """

    # Compiled once at class creation instead of being re-resolved per line.
    # A row is "|...|"; a rule/separator consists only of "|", "-", ":" and
    # whitespace (e.g. "|---|:--:|").
    _TABLE_ROW_RE = re.compile(r"^\|.*\|$")
    _TABLE_RULE_RE = re.compile(r"^[|\-\s:]+$")

    @staticmethod
    def _is_table_line(line: str) -> bool:
        """Check whether a line is part of a markdown table.

        Args:
            line: A single line of text (callers in this class pass it
                already stripped).

        Returns:
            True if the line looks like a table row or a table separator.
        """
        return bool(
            ExcelSplitter._TABLE_ROW_RE.match(line)
            or ExcelSplitter._TABLE_RULE_RE.match(line)
        )

    @staticmethod
    def _is_table_unit(unit: str) -> bool:
        """Check whether a logical unit is a markdown table block.

        Args:
            unit: A newline-joined logical unit.

        Returns:
            True if the unit has at least one non-blank line and every
            non-blank line is a table line; False otherwise.
        """
        stripped_lines = [
            stripped for stripped in (raw.strip() for raw in unit.split("\n")) if stripped
        ]
        if not stripped_lines:
            return False

        return all(ExcelSplitter._is_table_line(entry) for entry in stripped_lines)

    def _group_logical_units(self, content: str) -> list[str]:
        """Group stripped lines into alternating table / non-table units."""
        units: list[str] = []
        current: list[str] = []
        in_table = False

        for raw_line in content.split("\n"):
            line = raw_line.strip()
            is_table_line = self._is_table_line(line)

            if is_table_line and not in_table:
                # Text -> table boundary: flush pending text, start a table.
                if current:
                    units.append("\n".join(current))
                    current = []
                in_table = True
                current.append(line)
            elif not is_table_line and in_table:
                # Table -> text boundary: flush the table; a blank line right
                # after a table is dropped rather than opening a new unit.
                if current:
                    units.append("\n".join(current))
                    current = []
                in_table = False
                if line:
                    current.append(line)
            elif line or current:
                # Same kind as before. Blank lines are kept only once a unit
                # has started, so units never begin with a blank line.
                current.append(line)

        if current:
            units.append("\n".join(current))

        return units

    @staticmethod
    def _split_oversized_units(units: list[str], max_size: int) -> list[str]:
        """Break units longer than ``max_size`` apart on line boundaries."""
        result: list[str] = []

        for unit in units:
            if len(unit) <= max_size:
                result.append(unit)
                continue

            pending: list[str] = []
            for line in unit.split("\n"):
                candidate = "\n".join(pending + [line])
                # The first line of a piece is always accepted, so a single
                # line longer than max_size is emitted as-is (still oversized).
                if pending and len(candidate) > max_size:
                    result.append("\n".join(pending))
                    pending = [line]
                else:
                    pending.append(line)

            if pending:
                result.append("\n".join(pending))

        return result

    def _units_to_advance(
        self, logical_units: list[str], start: int, units_in_chunk: int
    ) -> int:
        """Return how many units to step forward after emitting a chunk.

        ``units_in_chunk`` is always >= 1 here, so the result is always >= 1
        and the packing loop is guaranteed to make progress.
        """
        if self.chunk_overlap == 0:
            return units_in_chunk

        # Special case: the very first chunk starts with non-table text that
        # is immediately followed by a table. No overlap is applied there.
        is_non_table_then_table_boundary = (
            start == 0
            and units_in_chunk >= 2
            and not self._is_table_unit(logical_units[start])
            and self._is_table_unit(logical_units[start + 1])
        )
        if is_non_table_then_table_boundary:
            overlap_units = 0
        else:
            # Overlap by at most one unit, and only when the chunk holds >= 2.
            overlap_units = min(1, units_in_chunk // 2)

        # NOTE(review): when overlap is enabled and the chunk just emitted
        # already reached the final unit, this steps back onto that last unit,
        # which is then emitted again as its own chunk (and can trigger the
        # truncation warning at the chunk limit). Confirm this is intended.
        return max(1, units_in_chunk - overlap_units)

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split Excel sheet content into chunks, preserving table structure where possible.

        Args:
            content: Excel sheet content to split
            max_size: Maximum chunk size

        Returns:
            List of content chunks. At most
            ``min(markdown.max_chunks_per_section, max_chunks_per_document // 2)``
            chunks are produced; a warning is logged when content remains
            after that limit is reached.
        """
        chunks: list[str] = []

        max_chunks_per_section = min(
            self.settings.global_config.chunking.strategies.markdown.max_chunks_per_section,
            self.settings.global_config.chunking.max_chunks_per_document // 2,
        )

        logical_units = self._split_oversized_units(
            self._group_logical_units(content), max_size
        )
        total_units = len(logical_units)

        i = 0
        while i < total_units and len(chunks) < max_chunks_per_section:
            current_chunk = ""
            units_in_chunk = 0

            j = i
            while j < total_units:
                unit = logical_units[j]

                # "+ 2" accounts for the blank-line separator between units.
                # The check is skipped while the chunk is empty, so the unit
                # at index i is always consumed.
                if current_chunk and len(current_chunk) + len(unit) + 2 > max_size:
                    break

                current_chunk = f"{current_chunk}\n\n{unit}" if current_chunk else unit
                units_in_chunk += 1
                j += 1

            stripped_chunk = current_chunk.strip()
            if stripped_chunk:
                chunks.append(stripped_chunk)

            i += self._units_to_advance(logical_units, i, units_in_chunk)

        if i < total_units and len(chunks) >= max_chunks_per_section:
            # Imported lazily and logged through the section_splitter module's
            # logger, as in the original implementation.
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Excel sheet reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": total_units - i,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks
154__all__ = ["ExcelSplitter"]