Coverage for src/qdrant_loader/core/chunking/strategy/markdown/splitters/excel.py: 93%

73 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1"""Excel splitter implementation extracted from `section_splitter`.""" 

2 

3import re 

4 

5from qdrant_loader.core.chunking.strategy.markdown.splitters.base import BaseSplitter 

6 

7 

class ExcelSplitter(BaseSplitter):
    """Excel-specific splitter that preserves table structure.

    Sheet content is first grouped into "logical units" (a run of consecutive
    table lines, or a run of consecutive non-table lines), oversized units are
    broken at line boundaries, and the units are finally packed into chunks
    of at most ``max_size`` characters.
    """

    # Compiled once at class creation instead of being looked up per line.
    # A table line is either a pipe-delimited row ("| a | b |") or a rule made
    # only of pipes, dashes, colons and whitespace ("|---|:--:|").
    _TABLE_ROW_RE = re.compile(r"^\|.*\|$")
    _TABLE_RULE_RE = re.compile(r"^[|\-\s:]+$")

    def split_content(self, content: str, max_size: int) -> list[str]:
        """Split Excel sheet content into chunks, preserving table structure where possible.

        Args:
            content: Excel sheet content to split
            max_size: Maximum chunk size, measured in characters. A single
                line longer than ``max_size`` is never cut, so a chunk can
                exceed the limit in that case.

        Returns:
            List of content chunks (each stripped of surrounding whitespace).
            At most ``max_chunks_per_section`` chunks are returned; content
            beyond that cap is dropped and a warning is logged.
        """
        # `self.settings` and `self.chunk_overlap` are not defined in this
        # class; presumably they are provided by BaseSplitter.
        chunking_config = self.settings.global_config.chunking
        max_chunks_per_section = min(
            chunking_config.strategies.markdown.max_chunks_per_section,
            chunking_config.max_chunks_per_document // 2,
        )

        logical_units = self._split_oversized_units(
            self._collect_logical_units(content), max_size
        )
        total_units = len(logical_units)

        chunks: list[str] = []
        i = 0
        while i < total_units and len(chunks) < max_chunks_per_section:
            current_chunk = ""
            units_in_chunk = 0

            # Greedily add units while the chunk stays within max_size. The
            # first unit is always accepted, so units_in_chunk is >= 1 after
            # this loop and the outer loop always makes progress.
            j = i
            while j < total_units:
                unit = logical_units[j]
                # +2 accounts for the "\n\n" separator between units.
                if current_chunk and len(current_chunk) + len(unit) + 2 > max_size:
                    break
                current_chunk = f"{current_chunk}\n\n{unit}" if current_chunk else unit
                units_in_chunk += 1
                j += 1

            stripped_chunk = current_chunk.strip()
            if stripped_chunk:
                chunks.append(stripped_chunk)

            if j >= total_units:
                # This chunk consumed every remaining unit. Stepping back for
                # overlap here would only emit a trailing chunk duplicating
                # content already present, and could trigger a false
                # truncation warning below.
                i = total_units
                break

            if self.chunk_overlap == 0:
                advance = units_in_chunk
            else:
                # Re-use at most one unit as overlap, and only when the chunk
                # holds at least two units, so we always advance by >= 1.
                overlap_units = min(1, units_in_chunk // 2)
                advance = max(1, units_in_chunk - overlap_units)
            i += advance

        if i < total_units and len(chunks) >= max_chunks_per_section:
            # Imported lazily rather than at module top; presumably to avoid a
            # circular import with the module this splitter was extracted from.
            from qdrant_loader.core.chunking.strategy.markdown import (
                section_splitter as _section_module,
            )

            _section_module.logger.warning(
                f"Excel sheet reached maximum chunks limit ({max_chunks_per_section}), truncating remaining content",
                extra={
                    "remaining_units": total_units - i,
                    "max_chunks_per_section": max_chunks_per_section,
                },
            )

        return chunks

    def _collect_logical_units(self, content: str) -> list[str]:
        """Group stripped lines into alternating table / non-table units."""
        units: list[str] = []
        current_unit: list[str] = []
        in_table = False

        for raw_line in content.split("\n"):
            line = raw_line.strip()
            is_table_line = bool(
                self._TABLE_ROW_RE.match(line) or self._TABLE_RULE_RE.match(line)
            )

            if is_table_line != in_table:
                # Crossing a table boundary: close the unit collected so far.
                if current_unit:
                    units.append("\n".join(current_unit))
                    current_unit = []
                in_table = is_table_line
                # Table lines are never empty; a blank line that ends a table
                # is dropped rather than opening a new unit.
                if line:
                    current_unit.append(line)
            elif line or current_unit:
                # Blank lines are kept inside a unit but never start one.
                current_unit.append(line)

        if current_unit:
            units.append("\n".join(current_unit))
        return units

    @staticmethod
    def _split_oversized_units(units: list[str], max_size: int) -> list[str]:
        """Break units longer than ``max_size`` at line boundaries."""
        result: list[str] = []
        for unit in units:
            if len(unit) <= max_size:
                result.append(unit)
                continue

            sub_unit: list[str] = []
            # Length of "\n".join(sub_unit), maintained incrementally so the
            # joined string is not rebuilt for every line.
            sub_unit_len = 0
            for line in unit.split("\n"):
                if sub_unit and sub_unit_len + 1 + len(line) > max_size:
                    result.append("\n".join(sub_unit))
                    sub_unit = [line]
                    sub_unit_len = len(line)
                else:
                    sub_unit_len = sub_unit_len + 1 + len(line) if sub_unit else len(line)
                    sub_unit.append(line)

            if sub_unit:
                result.append("\n".join(sub_unit))
        return result

129 

130 

# Names exported by `from ... import *`; only the splitter class is public.
__all__ = ["ExcelSplitter"]