Coverage for src / qdrant_loader / core / file_conversion / clean_xlsx_converter.py: 95%

77 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-06-11 09:38 +0000

1"""Custom MarkItDown converters that strip NaN/NaT noise from xlsx and xls output.""" 

2 

3from __future__ import annotations 

4 

5from typing import Any, BinaryIO 

6 

7import pandas as pd 

8from markitdown._base_converter import ( 

9 DocumentConverter, 

10 DocumentConverterResult, 

11) 

12from markitdown._stream_info import StreamInfo 

13from markitdown.converters._html_converter import HtmlConverter 

14 

15from qdrant_loader.core.file_conversion.sub_table_detector import SubTableDetector 

16from qdrant_loader.core.file_conversion.xlsx_markdown_format import ( 

17 format_sheet_heading, 

18) 

19 

20 

21def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame: 

22 """Drop rows and columns that are entirely missing.""" 

23 if df.empty: 

24 return df 

25 cleaned = df.dropna(axis=0, how="all") 

26 cleaned = cleaned.dropna(axis=1, how="all") 

27 return cleaned 

28 

29 

30def _is_blank_dataframe(df: pd.DataFrame) -> bool: 

31 """Return True for a DataFrame that holds no real data after cleaning. 

32 

33 Used on per-subtable frames, not full sheets — the name avoids the older 

34 "_should_skip_sheet" misnomer. 

35 """ 

36 if df.empty: 

37 return True 

38 return bool(df.isna().all().all()) 

39 

40 

41def _cell_to_str(value: Any) -> str: 

42 """Coerce a header cell to a clean string, mapping NaN/None to ''. 

43 

44 `Series.astype(str).tolist()` in current pandas can leak a Python `float` 

45 NaN through instead of the string "nan", which breaks downstream `.replace` 

46 calls on header values. 

47 """ 

48 if value is None: 

49 return "" 

50 if isinstance(value, float) and pd.isna(value): 

51 return "" 

52 return str(value) 

53 

54 

55def _escape_string_cell(value: Any) -> Any: 

56 """Escape `|` in string cells; pass non-strings through unchanged. 

57 

58 Body cells use this rather than `_cell_to_str` so pandas' `to_html` can 

59 apply its smart numeric formatting (e.g. `1` not `1.0`, `0.5` not `0.50`). 

60 NaN survives this pass and is rendered as `""` by `to_html(na_rep="")`. 

61 """ 

62 if isinstance(value, str): 

63 return value.replace("|", r"\|") 

64 return value 

65 

66 

67class _CleanSpreadsheetConverter(DocumentConverter): 

68 """Shared logic for cleaning xlsx/xls output before handing it to MarkItDown.""" 

69 

70 ENGINE: str = "" 

71 EXTENSIONS: tuple[str, ...] = () 

72 MIME_PREFIXES: tuple[str, ...] = () 

73 

74 def __init__(self) -> None: 

75 super().__init__() 

76 self._html_converter = HtmlConverter() 

77 self._detector = SubTableDetector() 

78 

79 def accepts( 

80 self, 

81 file_stream: BinaryIO, 

82 stream_info: StreamInfo, 

83 **kwargs: Any, 

84 ) -> bool: 

85 extension = (stream_info.extension or "").lower() 

86 if extension in self.EXTENSIONS: 

87 return True 

88 mimetype = (stream_info.mimetype or "").lower() 

89 return any(mimetype.startswith(prefix) for prefix in self.MIME_PREFIXES) 

90 

91 def convert( 

92 self, 

93 file_stream: BinaryIO, 

94 stream_info: StreamInfo, 

95 **kwargs: Any, 

96 ) -> DocumentConverterResult: 

97 # Read header-less so SubTableDetector can decide where headers start. 

98 # keep_default_na=False preserves literal "N/A" strings; explicit 

99 # na_values=[""] still treats genuinely empty cells as NaN. 

100 sheets = pd.read_excel( 

101 file_stream, 

102 sheet_name=None, 

103 engine=self.ENGINE, 

104 header=None, 

105 keep_default_na=False, 

106 na_values=[""], 

107 ) 

108 

109 parts: list[str] = [] 

110 for name, sheet_df in sheets.items(): 

111 sub_tables = self._detector.detect(sheet_df) 

112 for idx, raw in enumerate(sub_tables, start=1): 

113 rendered = self._render_subtable(raw, **kwargs) 

114 if rendered is None: 

115 continue 

116 heading = self._heading(name, idx, total=len(sub_tables)) 

117 parts.append(f"{heading}\n{rendered}") 

118 

119 return DocumentConverterResult(markdown="\n\n".join(parts).strip()) 

120 

121 @staticmethod 

122 def _heading(sheet_name: str, idx: int, total: int) -> str: 

123 return format_sheet_heading( 

124 sheet_name, subtable_idx=None if total <= 1 else idx 

125 ) 

126 

127 def _render_subtable(self, raw: pd.DataFrame, **kwargs: Any) -> str | None: 

128 """Promote first row to header, clean, render to markdown. 

129 

130 Headers and body cells take different paths on purpose: headers are 

131 coerced to strings here (they become DataFrame column labels and feed 

132 `.replace`), but body cells stay typed so pandas' `to_html` can apply 

133 its native numeric formatting. Both paths escape `|` to survive the 

134 downstream markdown-table parser; `HtmlConverter` passes cell text 

135 through unchanged. 

136 """ 

137 if raw.empty: 

138 return None 

139 header = [_cell_to_str(c).replace("|", r"\|") for c in raw.iloc[0].tolist()] 

140 body = raw.iloc[1:].reset_index(drop=True) 

141 body.columns = header 

142 cleaned = _clean_dataframe(body) 

143 if _is_blank_dataframe(cleaned): 

144 return None 

145 escaped = cleaned.map(_escape_string_cell) 

146 html = escaped.to_html(index=False, na_rep="") 

147 return self._html_converter.convert_string(html, **kwargs).markdown.strip() 

148 

149 

150class CleanXlsxConverter(_CleanSpreadsheetConverter): 

151 ENGINE = "openpyxl" 

152 EXTENSIONS = (".xlsx",) 

153 MIME_PREFIXES = ( 

154 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 

155 ) 

156 

157 

158class CleanXlsConverter(_CleanSpreadsheetConverter): 

159 ENGINE = "xlrd" 

160 EXTENSIONS = (".xls",) 

161 MIME_PREFIXES = ( 

162 "application/vnd.ms-excel", 

163 "application/excel", 

164 )