Coverage for src/qdrant_loader/core/file_conversion/clean_xlsx

1"""Custom MarkItDown converters that strip NaN/NaT noise from xlsx and xls output."""

3from __future__ import annotations

5from typing import Any, BinaryIO

7import pandas as pd

8from markitdown._base_converter import (

9 DocumentConverter,

10 DocumentConverterResult,

11)

12from markitdown._stream_info import StreamInfo

13from markitdown.converters._html_converter import HtmlConverter

15from qdrant_loader.core.file_conversion.sub_table_detector import SubTableDetector

16from qdrant_loader.core.file_conversion.xlsx_markdown_format import (

17 format_sheet_heading,

18)

21def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:

22 """Drop rows and columns that are entirely missing."""

23 if df.empty:

24 return df

25 cleaned = df.dropna(axis=0, how="all")

26 cleaned = cleaned.dropna(axis=1, how="all")

27 return cleaned

30def _is_blank_dataframe(df: pd.DataFrame) -> bool:

31 """Return True for a DataFrame that holds no real data after cleaning.

33 Used on per-subtable frames, not full sheets — the name avoids the older

34 "_should_skip_sheet" misnomer.

35 """

36 if df.empty:

37 return True

38 return bool(df.isna().all().all())

41def _cell_to_str(value: Any) -> str:

42 """Coerce a header cell to a clean string, mapping NaN/None to ''.

44 `Series.astype(str).tolist()` in current pandas can leak a Python `float`

45 NaN through instead of the string "nan", which breaks downstream `.replace`

46 calls on header values.

47 """

48 if value is None:

49 return ""

50 if isinstance(value, float) and pd.isna(value):

51 return ""

52 return str(value)

55def _escape_string_cell(value: Any) -> Any:

56 """Escape `|` in string cells; pass non-strings through unchanged.

58 Body cells use this rather than `_cell_to_str` so pandas' `to_html` can

59 apply its smart numeric formatting (e.g. `1` not `1.0`, `0.5` not `0.50`).

60 NaN survives this pass and is rendered as `""` by `to_html(na_rep="")`.

61 """

62 if isinstance(value, str):

63 return value.replace("|", r"\|")

64 return value

67class _CleanSpreadsheetConverter(DocumentConverter):

68 """Shared logic for cleaning xlsx/xls output before handing it to MarkItDown."""

70 ENGINE: str = ""

71 EXTENSIONS: tuple[str, ...] = ()

72 MIME_PREFIXES: tuple[str, ...] = ()

74 def __init__(self) -> None:

75 super().__init__()

76 self._html_converter = HtmlConverter()

77 self._detector = SubTableDetector()

79 def accepts(

80 self,

81 file_stream: BinaryIO,

82 stream_info: StreamInfo,

83 **kwargs: Any,

84 ) -> bool:

85 extension = (stream_info.extension or "").lower()

86 if extension in self.EXTENSIONS:

87 return True

88 mimetype = (stream_info.mimetype or "").lower()

89 return any(mimetype.startswith(prefix) for prefix in self.MIME_PREFIXES)

91 def convert(

92 self,

93 file_stream: BinaryIO,

94 stream_info: StreamInfo,

95 **kwargs: Any,

96 ) -> DocumentConverterResult:

97 # Read header-less so SubTableDetector can decide where headers start.

98 # keep_default_na=False preserves literal "N/A" strings; explicit

99 # na_values=[""] still treats genuinely empty cells as NaN.

100 sheets = pd.read_excel(

101 file_stream,

102 sheet_name=None,

103 engine=self.ENGINE,

104 header=None,

105 keep_default_na=False,

106 na_values=[""],

107 )

108

109 parts: list[str] = []

110 for name, sheet_df in sheets.items():

111 sub_tables = self._detector.detect(sheet_df)

112 for idx, raw in enumerate(sub_tables, start=1):

113 rendered = self._render_subtable(raw, **kwargs)

114 if rendered is None:

115 continue

116 heading = self._heading(name, idx, total=len(sub_tables))

117 parts.append(f"{heading}\n{rendered}")

118

119 return DocumentConverterResult(markdown="\n\n".join(parts).strip())

120

121 @staticmethod

122 def _heading(sheet_name: str, idx: int, total: int) -> str:

123 return format_sheet_heading(

124 sheet_name, subtable_idx=None if total <= 1 else idx

125 )

126

127 def _render_subtable(self, raw: pd.DataFrame, **kwargs: Any) -> str | None:

128 """Promote first row to header, clean, render to markdown.

129

130 Headers and body cells take different paths on purpose: headers are

131 coerced to strings here (they become DataFrame column labels and feed

132 `.replace`), but body cells stay typed so pandas' `to_html` can apply

133 its native numeric formatting. Both paths escape `|` to survive the

134 downstream markdown-table parser; `HtmlConverter` passes cell text

135 through unchanged.

136 """

137 if raw.empty:

138 return None

139 header = [_cell_to_str(c).replace("|", r"\|") for c in raw.iloc[0].tolist()]

140 body = raw.iloc[1:].reset_index(drop=True)

141 body.columns = header

142 cleaned = _clean_dataframe(body)

143 if _is_blank_dataframe(cleaned):

144 return None

145 escaped = cleaned.map(_escape_string_cell)

146 html = escaped.to_html(index=False, na_rep="")

147 return self._html_converter.convert_string(html, **kwargs).markdown.strip()

148

149

150class CleanXlsxConverter(_CleanSpreadsheetConverter):

151 ENGINE = "openpyxl"

152 EXTENSIONS = (".xlsx",)

153 MIME_PREFIXES = (

154 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

155 )

156

157

158class CleanXlsConverter(_CleanSpreadsheetConverter):

159 ENGINE = "xlrd"

160 EXTENSIONS = (".xls",)

161 MIME_PREFIXES = (

162 "application/vnd.ms-excel",

163 "application/excel",

164 )

Coverage for src/qdrant_loader/core/file_conversion/clean_xlsx_converter.py: 95%

77 statements