Coverage for src / qdrant_loader / core / chunking / strategy / markdown / splitters / excel.py: 95%

85 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 04:48 +0000

1"""Excel splitter implementation extracted from `section_splitter`.""" 

2 

3import re 

4 

5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter 

6 

7 

class ExcelSplitter(BaseSplitter):
    """Excel-specific splitter that preserves table structure.

    Sheet content is processed in three passes:

    1. lines are grouped into logical units (runs of markdown table lines
       versus runs of free text),
    2. units longer than ``max_size`` are broken up on line boundaries,
    3. units are packed into chunks joined by a blank line.
    """

    # Compiled once instead of handing the raw patterns to ``re.match`` for
    # every line of every sheet.
    _TABLE_ROW_RE = re.compile(r"^\|.*\|$")
    _TABLE_RULE_RE = re.compile(r"^[|\-\s:]+$")

    @staticmethod
    def _is_table_line(line: str) -> bool:
        """Check whether a line is part of a markdown table.

        A line qualifies when it starts and ends with ``|``, or when it is
        made up solely of ``|``, ``-``, ``:`` and whitespace characters (the
        shape of a header separator row). The empty string never qualifies.
        """
        return bool(
            ExcelSplitter._TABLE_ROW_RE.match(line)
            or ExcelSplitter._TABLE_RULE_RE.match(line)
        )

    @staticmethod
    def _is_table_unit(unit: str) -> bool:
        """Check whether a logical unit is a markdown table block.

        Blank lines are ignored; a unit with no non-blank line is not a table.
        """
        non_empty_lines = [line.strip() for line in unit.split("\n") if line.strip()]
        if not non_empty_lines:
            return False

        return all(ExcelSplitter._is_table_line(line) for line in non_empty_lines)

    def _group_logical_units(self, content: str) -> list[str]:
        """Group stripped lines of ``content`` into table and text units.

        A new unit starts whenever the content switches between table lines
        and non-table lines. Blank lines inside a text unit are kept; blank
        lines never start a unit.
        """
        units: list[str] = []
        current: list[str] = []
        in_table = False

        for raw_line in content.split("\n"):
            line = raw_line.strip()
            is_table_line = self._is_table_line(line)

            if is_table_line != in_table:
                # Table <-> text boundary: close the unit collected so far.
                if current:
                    units.append("\n".join(current))
                    current = []
                in_table = is_table_line
                # A blank line ends a table but does not open the next unit.
                if line:
                    current.append(line)
            elif line or current:
                current.append(line)

        if current:
            units.append("\n".join(current))

        return units

    @staticmethod
    def _split_oversized_units(units: list[str], max_size: int) -> list[str]:
        """Break units longer than ``max_size`` into pieces on line boundaries.

        Units that already fit are passed through untouched. A single line
        that is itself longer than ``max_size`` is kept whole, so a piece can
        still exceed ``max_size``.
        """
        result: list[str] = []

        for unit in units:
            if len(unit) <= max_size:
                result.append(unit)
                continue

            pending: list[str] = []
            # Length of "\n".join(pending), maintained incrementally so the
            # pending lines are not re-joined for every candidate line.
            pending_len = 0

            for line in unit.split("\n"):
                if pending and pending_len + 1 + len(line) > max_size:
                    result.append("\n".join(pending))
                    pending = [line]
                    pending_len = len(line)
                else:
                    pending_len += len(line) + (1 if pending else 0)
                    pending.append(line)

            if pending:
                result.append("\n".join(pending))

        return result

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split Excel sheet content into chunks, preserving table structure where possible.

        The number of chunks is capped at the smaller of the markdown
        strategy's ``max_chunks_per_section`` and half of
        ``max_chunks_per_document`` (both read from ``self.settings``). When
        ``self.chunk_overlap`` is non-zero, consecutive chunks share one
        logical unit, except for a leading text unit followed by a table.

        NOTE(review): with overlap enabled, a chunk that already reaches the
        last unit is still followed by a chunk that repeats that last unit.
        This behaviour is kept as-is; confirm whether it is intended.

        Args:
            content: Excel sheet content to split
            max_size: Maximum chunk size

        Returns:
            List of content chunks
        """
        max_chunks_per_section = min(
            self.settings.global_config.chunking.strategies.markdown.max_chunks_per_section,
            self.settings.global_config.chunking.max_chunks_per_document // 2,
        )

        logical_units = self._split_oversized_units(
            self._group_logical_units(content), max_size
        )
        total_units = len(logical_units)

        chunks: list[str] = []
        # Index one past the last unit that has been placed in any chunk.
        # The loop cursor ``i`` may trail behind it because of overlap, so it
        # cannot be used to tell whether content was really dropped.
        consumed = 0

        i = 0
        while i < total_units and len(chunks) < max_chunks_per_section:
            current_chunk = ""
            units_in_chunk = 0

            j = i
            while j < total_units:
                unit = logical_units[j]

                # +2 accounts for the blank-line separator between units.
                if current_chunk and len(current_chunk) + len(unit) + 2 > max_size:
                    break

                if current_chunk:
                    current_chunk += "\n\n" + unit
                else:
                    current_chunk = unit

                units_in_chunk += 1
                j += 1

            consumed = j

            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            # The inner loop always accepts at least one unit, so
            # ``units_in_chunk`` is >= 1 here and ``i`` always advances.
            if self.chunk_overlap == 0:
                advance = units_in_chunk
            else:
                is_non_table_then_table_boundary = (
                    i == 0
                    and units_in_chunk >= 2
                    and not self._is_table_unit(logical_units[i])
                    and self._is_table_unit(logical_units[i + 1])
                )

                if is_non_table_then_table_boundary:
                    overlap_units = 0
                else:
                    overlap_units = min(1, units_in_chunk // 2)

                advance = max(1, units_in_chunk - overlap_units)

            i += advance

        if consumed < total_units and len(chunks) >= max_chunks_per_section:
            # Imported lazily; presumably to avoid a circular import with the
            # module this splitter was extracted from - TODO confirm.
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Excel sheet reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": total_units - consumed,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks

152 

153 

154__all__ = ["ExcelSplitter"]