Coverage for src/qdrant_loader/core/chunking/strategy/markdown/splitters/excel.py: 93%
73 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
1"""Excel splitter implementation extracted from `section_splitter`."""
3import re
5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter
class ExcelSplitter(BaseSplitter):
    """Excel-specific splitter that preserves table structure.

    Sheet content is first grouped into logical units (runs of table lines
    and runs of non-table lines), oversized units are split on line
    boundaries, and the resulting units are packed into chunks.
    """

    # A table row: the stripped line starts and ends with a pipe.
    _TABLE_ROW_RE = re.compile(r"^\|.*\|$")
    # A rule/separator line made only of pipes, dashes, colons and whitespace.
    _TABLE_RULE_RE = re.compile(r"^[|\-\s:]+$")

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split Excel sheet content into chunks, preserving table structure where possible.

        Logical units are joined with a blank line inside a chunk. A unit is
        only added to a non-empty chunk if the result stays within
        ``max_size``; the first unit of a chunk is always accepted, so a
        single line longer than ``max_size`` still yields an oversized chunk.

        When ``self.chunk_overlap`` is non-zero and a chunk holds at least two
        units, the next chunk starts again at that chunk's last unit.

        Args:
            content: Excel sheet content to split
            max_size: Maximum chunk size

        Returns:
            List of content chunks, capped at the configured per-section
            limit. A warning is logged when units are left unprocessed
            because the cap was reached.
        """
        max_chunks_per_section = min(
            self.settings.global_config.chunking.strategies.markdown.max_chunks_per_section,
            self.settings.global_config.chunking.max_chunks_per_document // 2,
        )

        logical_units = self._split_oversized_units(
            self._group_logical_units(content), max_size
        )
        total_units = len(logical_units)

        chunks: list[str] = []
        start = 0
        while start < total_units and len(chunks) < max_chunks_per_section:
            # The first unit is always accepted, even if it alone exceeds
            # max_size; otherwise the loop could never make progress.
            current_chunk = logical_units[start]
            end = start + 1
            while end < total_units:
                unit = logical_units[end]
                # +2 accounts for the "\n\n" separator between units.
                if current_chunk and len(current_chunk) + len(unit) + 2 > max_size:
                    break
                current_chunk = f"{current_chunk}\n\n{unit}" if current_chunk else unit
                end += 1

            stripped_chunk = current_chunk.strip()
            if stripped_chunk:
                chunks.append(stripped_chunk)

            if end >= total_units:
                # Every remaining unit is already in this chunk. Stepping back
                # for overlap here would only re-emit the last unit as a
                # redundant chunk (or trigger a false truncation warning).
                start = total_units
                break

            units_in_chunk = end - start
            if self.chunk_overlap == 0:
                advance = units_in_chunk
            else:
                # Overlap by at most one unit, and only if the chunk has
                # at least two units, so that we always move forward.
                overlap_units = min(1, units_in_chunk // 2)
                advance = max(1, units_in_chunk - overlap_units)
            start += advance

        if start < total_units and len(chunks) >= max_chunks_per_section:
            # Imported lazily, at the point of use, to reach the logger that
            # lives on the section_splitter module.
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Excel sheet reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": len(logical_units) - start,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks

    def _group_logical_units(self, content: str) -> list[str]:
        """Group stripped lines into alternating table / non-table units.

        Args:
            content: Raw sheet content.

        Returns:
            Units in document order, each a newline-joined run of lines that
            are either all table lines or all non-table lines. Blank lines
            are kept inside a unit but never start one.
        """
        units: list[str] = []
        current: list[str] = []
        in_table = False

        for raw_line in content.split("\n"):
            line = raw_line.strip()
            is_table_line = bool(
                self._TABLE_ROW_RE.match(line) or self._TABLE_RULE_RE.match(line)
            )

            if is_table_line != in_table:
                # Entering or leaving a table: close the unit in progress.
                if current:
                    units.append("\n".join(current))
                    current = []
                in_table = is_table_line
                if line:
                    current.append(line)
            elif line or current:
                # Skip blank lines that would otherwise open a new unit.
                current.append(line)

        if current:
            units.append("\n".join(current))
        return units

    @staticmethod
    def _split_oversized_units(units: list[str], max_size: int) -> list[str]:
        """Split units longer than ``max_size`` on line boundaries.

        Args:
            units: Logical units, each a newline-joined string.
            max_size: Maximum length of a unit.

        Returns:
            Units in the same order, with each oversized unit replaced by
            consecutive line groups. A single line longer than ``max_size``
            is kept whole.
        """
        result: list[str] = []
        for unit in units:
            if len(unit) <= max_size:
                result.append(unit)
                continue

            pending: list[str] = []
            for line in unit.split("\n"):
                candidate = "\n".join(pending + [line])
                if pending and len(candidate) > max_size:
                    result.append("\n".join(pending))
                    pending = [line]
                else:
                    pending.append(line)

            if pending:
                result.append("\n".join(pending))
        return result
# Public API of this module: only the ExcelSplitter class is exported.
131__all__ = ["ExcelSplitter"]