Coverage for src / qdrant_loader / core / file_conversion / clean_xlsx_converter.py: 95%
77 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-11 09:38 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-11 09:38 +0000
1"""Custom MarkItDown converters that strip NaN/NaT noise from xlsx and xls output."""
3from __future__ import annotations
5from typing import Any, BinaryIO
7import pandas as pd
8from markitdown._base_converter import (
9 DocumentConverter,
10 DocumentConverterResult,
11)
12from markitdown._stream_info import StreamInfo
13from markitdown.converters._html_converter import HtmlConverter
15from qdrant_loader.core.file_conversion.sub_table_detector import SubTableDetector
16from qdrant_loader.core.file_conversion.xlsx_markdown_format import (
17 format_sheet_heading,
18)
21def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
22 """Drop rows and columns that are entirely missing."""
23 if df.empty:
24 return df
25 cleaned = df.dropna(axis=0, how="all")
26 cleaned = cleaned.dropna(axis=1, how="all")
27 return cleaned
30def _is_blank_dataframe(df: pd.DataFrame) -> bool:
31 """Return True for a DataFrame that holds no real data after cleaning.
33 Used on per-subtable frames, not full sheets — the name avoids the older
34 "_should_skip_sheet" misnomer.
35 """
36 if df.empty:
37 return True
38 return bool(df.isna().all().all())
41def _cell_to_str(value: Any) -> str:
42 """Coerce a header cell to a clean string, mapping NaN/None to ''.
44 `Series.astype(str).tolist()` in current pandas can leak a Python `float`
45 NaN through instead of the string "nan", which breaks downstream `.replace`
46 calls on header values.
47 """
48 if value is None:
49 return ""
50 if isinstance(value, float) and pd.isna(value):
51 return ""
52 return str(value)
55def _escape_string_cell(value: Any) -> Any:
56 """Escape `|` in string cells; pass non-strings through unchanged.
58 Body cells use this rather than `_cell_to_str` so pandas' `to_html` can
59 apply its smart numeric formatting (e.g. `1` not `1.0`, `0.5` not `0.50`).
60 NaN survives this pass and is rendered as `""` by `to_html(na_rep="")`.
61 """
62 if isinstance(value, str):
63 return value.replace("|", r"\|")
64 return value
67class _CleanSpreadsheetConverter(DocumentConverter):
68 """Shared logic for cleaning xlsx/xls output before handing it to MarkItDown."""
70 ENGINE: str = ""
71 EXTENSIONS: tuple[str, ...] = ()
72 MIME_PREFIXES: tuple[str, ...] = ()
74 def __init__(self) -> None:
75 super().__init__()
76 self._html_converter = HtmlConverter()
77 self._detector = SubTableDetector()
79 def accepts(
80 self,
81 file_stream: BinaryIO,
82 stream_info: StreamInfo,
83 **kwargs: Any,
84 ) -> bool:
85 extension = (stream_info.extension or "").lower()
86 if extension in self.EXTENSIONS:
87 return True
88 mimetype = (stream_info.mimetype or "").lower()
89 return any(mimetype.startswith(prefix) for prefix in self.MIME_PREFIXES)
91 def convert(
92 self,
93 file_stream: BinaryIO,
94 stream_info: StreamInfo,
95 **kwargs: Any,
96 ) -> DocumentConverterResult:
97 # Read header-less so SubTableDetector can decide where headers start.
98 # keep_default_na=False preserves literal "N/A" strings; explicit
99 # na_values=[""] still treats genuinely empty cells as NaN.
100 sheets = pd.read_excel(
101 file_stream,
102 sheet_name=None,
103 engine=self.ENGINE,
104 header=None,
105 keep_default_na=False,
106 na_values=[""],
107 )
109 parts: list[str] = []
110 for name, sheet_df in sheets.items():
111 sub_tables = self._detector.detect(sheet_df)
112 for idx, raw in enumerate(sub_tables, start=1):
113 rendered = self._render_subtable(raw, **kwargs)
114 if rendered is None:
115 continue
116 heading = self._heading(name, idx, total=len(sub_tables))
117 parts.append(f"{heading}\n{rendered}")
119 return DocumentConverterResult(markdown="\n\n".join(parts).strip())
121 @staticmethod
122 def _heading(sheet_name: str, idx: int, total: int) -> str:
123 return format_sheet_heading(
124 sheet_name, subtable_idx=None if total <= 1 else idx
125 )
127 def _render_subtable(self, raw: pd.DataFrame, **kwargs: Any) -> str | None:
128 """Promote first row to header, clean, render to markdown.
130 Headers and body cells take different paths on purpose: headers are
131 coerced to strings here (they become DataFrame column labels and feed
132 `.replace`), but body cells stay typed so pandas' `to_html` can apply
133 its native numeric formatting. Both paths escape `|` to survive the
134 downstream markdown-table parser; `HtmlConverter` passes cell text
135 through unchanged.
136 """
137 if raw.empty:
138 return None
139 header = [_cell_to_str(c).replace("|", r"\|") for c in raw.iloc[0].tolist()]
140 body = raw.iloc[1:].reset_index(drop=True)
141 body.columns = header
142 cleaned = _clean_dataframe(body)
143 if _is_blank_dataframe(cleaned):
144 return None
145 escaped = cleaned.map(_escape_string_cell)
146 html = escaped.to_html(index=False, na_rep="")
147 return self._html_converter.convert_string(html, **kwargs).markdown.strip()
150class CleanXlsxConverter(_CleanSpreadsheetConverter):
151 ENGINE = "openpyxl"
152 EXTENSIONS = (".xlsx",)
153 MIME_PREFIXES = (
154 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
155 )
158class CleanXlsConverter(_CleanSpreadsheetConverter):
159 ENGINE = "xlrd"
160 EXTENSIONS = (".xls",)
161 MIME_PREFIXES = (
162 "application/vnd.ms-excel",
163 "application/excel",
164 )