Coverage for src/qdrant_loader/core/file_conversion/file_converter.py: 100%
117 statements
coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
"""Main file conversion service using MarkItDown."""

import logging
import os
import signal
import tempfile
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Optional

from qdrant_loader.core.document import Document
from qdrant_loader.core.file_conversion.conversion_config import FileConversionConfig
from qdrant_loader.core.file_conversion.exceptions import (
    ConversionTimeoutError,
    FileAccessError,
    FileSizeExceededError,
    MarkItDownError,
    UnsupportedFileTypeError,
)
from qdrant_loader.core.file_conversion.file_detector import FileDetector
from qdrant_loader.utils.logging import LoggingConfig

# Module-level logger obtained from the project's logging configuration.
logger = LoggingConfig.get_logger(__name__)
@contextmanager
def capture_openpyxl_warnings(logger_instance, file_path: str):
    """Route known openpyxl "unsupported feature" warnings through logging.

    While the context is active, ``warnings.showwarning`` is replaced by a
    handler that intercepts ``UserWarning`` (or subclass) warnings issued from
    a source file whose name contains ``"openpyxl"`` and whose message
    mentions the "Data Validation extension" or the "Conditional Formatting
    extension". Each intercepted warning is logged at info level through
    ``logger_instance`` instead of being displayed. Every other warning is
    forwarded to the handler that was installed before the context was
    entered.

    If the wrapped block finishes without raising and at least one warning
    was intercepted, one summary entry is logged. The previous
    ``warnings.showwarning`` is restored on exit in all cases.

    Args:
        logger_instance: Logger whose ``info`` method accepts a message plus
            keyword fields.
        file_path: Path of the file being converted; attached to log entries.

    Yields:
        None.
    """
    # Message fragment -> feature label reported in the logs. Order matters:
    # the first matching fragment wins.
    feature_labels = {
        "Data Validation extension": "Data Validation",
        "Conditional Formatting extension": "Conditional Formatting",
    }
    captured_warnings = []

    # Capture the current handler before defining the replacement so the
    # closure below never refers to a not-yet-bound name.
    original_showwarning = warnings.showwarning

    def warning_handler(message, category, filename, lineno, file=None, line=None):
        warning_type = None
        if (
            isinstance(category, type)
            and issubclass(category, UserWarning)
            and filename
            and "openpyxl" in filename
        ):
            text = str(message)
            warning_type = next(
                (
                    label
                    for fragment, label in feature_labels.items()
                    if fragment in text
                ),
                None,
            )

        if warning_type is None:
            # Not one of ours: keep the behavior that was in effect before.
            original_showwarning(message, category, filename, lineno, file, line)
            return

        captured_warnings.append(warning_type)
        # Log through our system instead of showing the raw warning.
        logger_instance.info(
            "Excel feature not fully supported during conversion",
            file_path=file_path,
            feature_type=warning_type,
            source="openpyxl",
        )

    try:
        warnings.showwarning = warning_handler
        yield

        # Only reached when the wrapped block did not raise.
        if captured_warnings:
            logger_instance.info(
                "Excel conversion completed with unsupported features",
                file_path=file_path,
                total_warnings=len(captured_warnings),
                # Sorted so the logged list has a deterministic order.
                warning_types=sorted(set(captured_warnings)),
                source="openpyxl",
            )
    finally:
        warnings.showwarning = original_showwarning
class TimeoutHandler:
    """Context manager for handling conversion timeouts.

    On entry a ``SIGALRM`` handler is installed and an alarm is scheduled for
    ``timeout_seconds``. If the alarm fires before the block exits, the
    handler raises ``ConversionTimeoutError``. On exit the alarm is cancelled
    and the previously installed handler is put back.

    NOTE(review): this relies on ``signal.SIGALRM`` / ``signal.alarm``, which
    the Python documentation lists as Unix-only, and on ``signal.signal``,
    which may only be called from the main thread.
    """

    def __init__(self, timeout_seconds: int, file_path: str):
        """Store the timeout and the file path used in the timeout error.

        Args:
            timeout_seconds: Seconds to wait before the alarm fires.
            file_path: Path of the file being converted.
        """
        self.timeout_seconds = timeout_seconds
        self.file_path = file_path
        # Handler that was active before ``__enter__``; set on entry.
        self.old_handler = None

    def _timeout_handler(self, signum, frame):
        """Signal handler for timeout."""
        raise ConversionTimeoutError(self.timeout_seconds, self.file_path)

    def _restore_handler(self):
        """Reinstall the handler saved by ``__enter__``, if there is one."""
        # ``signal.signal`` returns None when the previous handler was not
        # installed from Python; such a handler cannot be reinstalled here.
        if self.old_handler is not None:
            signal.signal(signal.SIGALRM, self.old_handler)

    def __enter__(self):
        """Set up timeout signal handler."""
        self.old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
        try:
            signal.alarm(self.timeout_seconds)
        except BaseException:
            # ``__exit__`` is not called when ``__enter__`` raises, so undo
            # the handler installation here instead of leaking it.
            self._restore_handler()
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Clean up timeout signal handler."""
        signal.alarm(0)  # Cancel the alarm
        self._restore_handler()
class FileConverter:
    """Service for converting files to Markdown using MarkItDown."""

    def __init__(self, config: "FileConversionConfig"):
        """Initialize the file converter.

        Args:
            config: Conversion settings read by this class: the conversion
                timeout, the file size limit and the ``markitdown`` options.
        """
        self.config = config
        self.file_detector = FileDetector()
        self.logger = LoggingConfig.get_logger(__name__)
        # MarkItDown is imported and constructed on first use.
        self._markitdown = None

    def _get_markitdown(self):
        """Get MarkItDown instance with lazy loading and LLM configuration.

        The instance is created on the first call and reused afterwards. When
        ``config.markitdown.enable_llm_descriptions`` is set, it is built with
        an LLM client and model name; otherwise with default arguments.

        Returns:
            The cached MarkItDown instance.

        Raises:
            MarkItDownError: If an ``ImportError`` occurs while importing or
                constructing MarkItDown, or if the LLM client cannot be
                created.
        """
        if self._markitdown is None:
            try:
                from markitdown import MarkItDown  # type: ignore

                # Configure MarkItDown with LLM settings if enabled
                if self.config.markitdown.enable_llm_descriptions:
                    self.logger.debug(
                        "Initializing MarkItDown with LLM configuration",
                        llm_model=self.config.markitdown.llm_model,
                        llm_endpoint=self.config.markitdown.llm_endpoint,
                    )

                    llm_client = self._create_llm_client()

                    self._markitdown = MarkItDown(
                        llm_client=llm_client,
                        llm_model=self.config.markitdown.llm_model,
                    )
                    self.logger.debug("MarkItDown initialized with LLM support")
                else:
                    self._markitdown = MarkItDown()
                    self.logger.debug("MarkItDown initialized without LLM support")

            except ImportError as e:
                raise MarkItDownError(
                    Exception("MarkItDown library not available")
                ) from e
        return self._markitdown

    def _create_llm_client(self):
        """Create LLM client based on configuration.

        The API key comes from ``config.markitdown.llm_api_key``. When that is
        empty, a warning is logged and the key falls back to the
        ``OPENAI_API_KEY`` environment variable, then ``LLM_API_KEY``, then
        the placeholder ``"dummy-key"``.

        Returns:
            An ``openai.OpenAI`` client pointed at
            ``config.markitdown.llm_endpoint``.

        Raises:
            MarkItDownError: If the ``openai`` package cannot be imported.
        """
        try:
            api_key = self.config.markitdown.llm_api_key
            if not api_key:
                self.logger.warning(
                    "No LLM API key configured for MarkItDown LLM integration"
                )
                # Fallback to environment variable for backward compatibility
                api_key = os.getenv("OPENAI_API_KEY") or os.getenv(
                    "LLM_API_KEY", "dummy-key"
                )

            # Every endpoint, OpenAI-hosted or not, is addressed through the
            # same OpenAI-compatible client, so no branching on the URL is
            # needed.
            from openai import OpenAI  # type: ignore

            return OpenAI(
                base_url=self.config.markitdown.llm_endpoint,
                api_key=api_key,
            )
        except ImportError as e:
            self.logger.warning(
                "OpenAI library not available for LLM integration", error=str(e)
            )
            raise MarkItDownError(
                Exception("OpenAI library required for LLM integration")
            ) from e

    def convert_file(self, file_path: str) -> str:
        """Convert a file to Markdown format with timeout support.

        Args:
            file_path: Path of the file to convert.

        Returns:
            The Markdown text: ``result.text_content`` when the conversion
            result has that attribute, otherwise ``str(result)``.

        Raises:
            ConversionTimeoutError: If the conversion exceeds
                ``config.conversion_timeout``; re-raised unchanged.
            MarkItDownError: For any other failure. Errors raised by
                ``_validate_file`` are wrapped as well, with the original
                exception attached as the cause.
        """
        self.logger.info("Starting file conversion", file_path=file_path)

        try:
            self._validate_file(file_path)
            markitdown = self._get_markitdown()

            # Apply timeout wrapper and warning capture for conversion
            with TimeoutHandler(self.config.conversion_timeout, file_path):
                with capture_openpyxl_warnings(self.logger, file_path):
                    result = markitdown.convert(file_path)

            if hasattr(result, "text_content"):
                markdown_content = result.text_content
            else:
                markdown_content = str(result)

            self.logger.info(
                "File conversion completed",
                file_path=file_path,
                content_length=len(markdown_content),
                timeout_used=self.config.conversion_timeout,
            )
            return markdown_content

        except ConversionTimeoutError:
            # Re-raise timeout errors as-is
            self.logger.error(
                "File conversion timed out",
                file_path=file_path,
                timeout=self.config.conversion_timeout,
            )
            raise
        except Exception as e:
            self.logger.error(
                "File conversion failed", file_path=file_path, error=str(e)
            )
            raise MarkItDownError(e, file_path) from e

    def _validate_file(self, file_path: str) -> None:
        """Validate file for conversion.

        Args:
            file_path: Path of the file to check.

        Raises:
            FileAccessError: If the path does not exist or is not readable.
            FileSizeExceededError: If the configuration rejects the file size.
            UnsupportedFileTypeError: If the file detector reports the file
                as not supported for conversion.
        """
        if not os.path.exists(file_path):
            raise FileAccessError(f"File does not exist: {file_path}")

        if not os.access(file_path, os.R_OK):
            raise FileAccessError(f"File is not readable: {file_path}")

        file_size = os.path.getsize(file_path)
        if not self.config.is_file_size_allowed(file_size):
            raise FileSizeExceededError(file_size, self.config.max_file_size, file_path)

        if not self.file_detector.is_supported_for_conversion(file_path):
            file_info = self.file_detector.get_file_type_info(file_path)
            raise UnsupportedFileTypeError(
                file_info.get("normalized_type", "unknown"), file_path
            )

    def create_fallback_document(self, file_path: str, error: Exception) -> str:
        """Create a fallback Markdown document when conversion fails.

        Args:
            file_path: Path of the file that could not be converted.
            error: The exception describing the failure.

        Returns:
            A Markdown document with the file name as heading, the detected
            type, size and path, and the error text.
        """
        filename = Path(file_path).name
        file_info = self.file_detector.get_file_type_info(file_path)

        # Use ``or`` rather than a ``get`` default: a key that is present with
        # value None would print "None" for the type and make the ``:,``
        # format spec raise TypeError for the size.
        file_type = file_info.get("normalized_type") or "unknown"
        file_size = file_info.get("file_size") or 0

        return f"""# {filename}

**File Information:**
- **Type**: {file_type}
- **Size**: {file_size:,} bytes
- **Path**: {file_path}

**Conversion Status**: ❌ Failed
**Error**: {str(error)}

*This document was created as a fallback when the original file could not be converted.*
"""