Coverage for src/qdrant_loader/core/file_conversion/file_converter.py: 100%

117 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Main file conversion service using MarkItDown.""" 

2 

3import logging 

4import os 

5import signal 

6import tempfile 

7import warnings 

8from contextlib import contextmanager 

9from pathlib import Path 

10from typing import Optional 

11 

12from qdrant_loader.core.document import Document 

13from qdrant_loader.core.file_conversion.conversion_config import FileConversionConfig 

14from qdrant_loader.core.file_conversion.exceptions import ( 

15 ConversionTimeoutError, 

16 FileAccessError, 

17 FileSizeExceededError, 

18 MarkItDownError, 

19 UnsupportedFileTypeError, 

20) 

21from qdrant_loader.core.file_conversion.file_detector import FileDetector 

22from qdrant_loader.utils.logging import LoggingConfig 

23 

# Module-level logger obtained from the project's LoggingConfig. The code
# visible in this module logs through `self.logger` or a logger passed in as
# an argument rather than through this name.
logger = LoggingConfig.get_logger(__name__)

25 

26 

@contextmanager
def capture_openpyxl_warnings(logger_instance, file_path: str):
    """Route known openpyxl "unsupported extension" warnings through logging.

    While the context is active, ``warnings.showwarning`` is replaced by a
    handler that intercepts ``UserWarning``s whose originating filename
    contains ``"openpyxl"`` and whose message mentions either the
    "Data Validation extension" or the "Conditional Formatting extension".
    Each intercepted warning is logged at info level through
    ``logger_instance`` instead of being displayed; every other warning is
    forwarded unchanged to the handler that was installed before entry.

    If the managed body finishes without raising and at least one warning was
    intercepted, a single summary entry is logged. The previous
    ``warnings.showwarning`` is restored in all cases.

    Args:
        logger_instance: Logger whose ``info`` method accepts a message plus
            keyword fields.
        file_path: Path of the file being converted; attached to log entries.

    Yields:
        None.
    """
    # Message fragment -> short label used in log entries. Order matters only
    # when a message contains both fragments: the first match wins.
    known_features = (
        ("Data Validation extension", "Data Validation"),
        ("Conditional Formatting extension", "Conditional Formatting"),
    )
    captured = []

    # Remember the handler we are about to replace so it can be delegated to
    # and restored afterwards.
    original_showwarning = warnings.showwarning

    def warning_handler(message, category, filename, lineno, file=None, line=None):
        feature = None
        if (
            isinstance(category, type)
            and issubclass(category, UserWarning)
            and filename
            and "openpyxl" in filename
        ):
            text = str(message)
            feature = next(
                (label for fragment, label in known_features if fragment in text),
                None,
            )

        if feature is None:
            # Not one of the openpyxl warnings we handle: default behaviour.
            original_showwarning(message, category, filename, lineno, file, line)
            return

        captured.append(feature)

        # Log through our system instead of showing the raw warning.
        logger_instance.info(
            "Excel feature not fully supported during conversion",
            file_path=file_path,
            feature_type=feature,
            source="openpyxl",
        )

    try:
        warnings.showwarning = warning_handler
        yield

        # Only reached when the managed body did not raise.
        if captured:
            logger_instance.info(
                "Excel conversion completed with unsupported features",
                file_path=file_path,
                total_warnings=len(captured),
                # Sorted so the logged list has a stable, reproducible order.
                warning_types=sorted(set(captured)),
                source="openpyxl",
            )
    finally:
        warnings.showwarning = original_showwarning

86 

87 

class TimeoutHandler:
    """Context manager that enforces a time limit on a conversion.

    When ``signal.SIGALRM`` exists and the handler can be installed (the
    ``signal`` module only allows this from the main thread), the limit is
    enforced preemptively: the alarm interrupts the running code by raising
    ``ConversionTimeoutError``.

    When the alarm cannot be armed (no ``SIGALRM`` on the platform, or entered
    from a non-main thread), entering no longer fails. Instead the elapsed
    time is checked on exit and ``ConversionTimeoutError`` is raised if the
    body completed normally but took longer than the limit. In that mode the
    running code is not interrupted.

    Args:
        timeout_seconds: Time limit in seconds. A value of ``0`` disables the
            limit (``signal.alarm(0)`` schedules nothing).
        file_path: Path of the file being converted; passed to the raised
            ``ConversionTimeoutError``.
    """

    def __init__(self, timeout_seconds: int, file_path: str):
        self.timeout_seconds = timeout_seconds
        self.file_path = file_path
        self.old_handler = None
        # True only while our SIGALRM handler is installed and the alarm set.
        self._alarm_armed = False
        # Monotonic start time, used by the non-preemptive fallback.
        self._started_at = None

    def _timeout_handler(self, signum, frame):
        """Signal handler for timeout."""
        raise ConversionTimeoutError(self.timeout_seconds, self.file_path)

    def __enter__(self):
        """Arm the SIGALRM-based timeout, or fall back to elapsed-time tracking."""
        import time  # local import: keeps this block self-contained

        self._alarm_armed = False
        self._started_at = time.monotonic()

        sigalrm = getattr(signal, "SIGALRM", None)
        if sigalrm is not None:
            try:
                self.old_handler = signal.signal(sigalrm, self._timeout_handler)
            except ValueError:
                # signal.signal() refuses to run outside the main thread;
                # use the elapsed-time check in __exit__ instead.
                pass
            else:
                signal.alarm(self.timeout_seconds)
                self._alarm_armed = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Cancel the alarm and restore the previous handler, or apply the fallback check."""
        if self._alarm_armed:
            signal.alarm(0)  # Cancel the alarm
            # signal.signal() returns None when the previous handler was not
            # installed from Python; there is nothing to restore in that case.
            if self.old_handler is not None:
                signal.signal(signal.SIGALRM, self.old_handler)
            self._alarm_armed = False
            return

        if exc_type is not None or self._started_at is None:
            # Never mask an exception that is already propagating.
            return

        import time

        elapsed = time.monotonic() - self._started_at
        if self.timeout_seconds > 0 and elapsed > self.timeout_seconds:
            raise ConversionTimeoutError(self.timeout_seconds, self.file_path)

111 

112 

class FileConverter:
    """Service for converting files to Markdown using MarkItDown.

    The MarkItDown instance is created lazily on first conversion and reused
    afterwards. When ``config.markitdown.enable_llm_descriptions`` is set, it
    is constructed with an OpenAI client built from the configured endpoint
    and API key.
    """

    def __init__(self, config: FileConversionConfig):
        """Initialize the file converter.

        Args:
            config: Conversion settings. This class reads
                ``conversion_timeout``, ``max_file_size``,
                ``is_file_size_allowed`` and the ``markitdown`` sub-config.
        """
        self.config = config
        self.file_detector = FileDetector()
        self.logger = LoggingConfig.get_logger(__name__)
        # Created on first use by _get_markitdown().
        self._markitdown = None

    def _get_markitdown(self):
        """Return the cached MarkItDown instance, creating it on first call.

        Returns:
            The MarkItDown instance, built with an LLM client and model when
            LLM descriptions are enabled and with no arguments otherwise.

        Raises:
            MarkItDownError: If an ImportError occurs while importing or
                constructing MarkItDown, or if the LLM client cannot be
                created.
        """
        if self._markitdown is not None:
            return self._markitdown

        try:
            from markitdown import MarkItDown  # type: ignore

            markitdown_cfg = self.config.markitdown
            if markitdown_cfg.enable_llm_descriptions:
                self.logger.debug(
                    "Initializing MarkItDown with LLM configuration",
                    llm_model=markitdown_cfg.llm_model,
                    llm_endpoint=markitdown_cfg.llm_endpoint,
                )
                llm_client = self._create_llm_client()
                self._markitdown = MarkItDown(
                    llm_client=llm_client,
                    llm_model=markitdown_cfg.llm_model,
                )
                self.logger.debug("MarkItDown initialized with LLM support")
            else:
                self._markitdown = MarkItDown()
                self.logger.debug("MarkItDown initialized without LLM support")
        except ImportError as e:
            raise MarkItDownError(
                Exception("MarkItDown library not available")
            ) from e

        return self._markitdown

    def _create_llm_client(self):
        """Create the OpenAI client used for MarkItDown's LLM integration.

        The API key comes from ``config.markitdown.llm_api_key``. If that is
        empty, a warning is logged and the key falls back to the
        ``OPENAI_API_KEY`` environment variable, then ``LLM_API_KEY``, then
        the literal ``"dummy-key"``.

        The same client is built for every endpoint: the configured
        ``llm_endpoint`` is passed as ``base_url`` without being inspected, so
        an unset endpoint no longer fails on a string method call.

        Returns:
            An ``openai.OpenAI`` client.

        Raises:
            MarkItDownError: If the ``openai`` package cannot be imported.
        """
        api_key = self.config.markitdown.llm_api_key
        if not api_key:
            self.logger.warning(
                "No LLM API key configured for MarkItDown LLM integration"
            )
            # Fallback to environment variable for backward compatibility
            api_key = os.getenv("OPENAI_API_KEY") or os.getenv(
                "LLM_API_KEY", "dummy-key"
            )

        try:
            from openai import OpenAI  # type: ignore

            return OpenAI(
                base_url=self.config.markitdown.llm_endpoint,
                api_key=api_key,
            )
        except ImportError as e:
            self.logger.warning(
                "OpenAI library not available for LLM integration", error=str(e)
            )
            raise MarkItDownError(
                Exception("OpenAI library required for LLM integration")
            ) from e

    def convert_file(self, file_path: str) -> str:
        """Convert a file to Markdown format with timeout support.

        Args:
            file_path: Path of the file to convert.

        Returns:
            The converted Markdown: the result's ``text_content`` attribute
            when present, otherwise ``str(result)``.

        Raises:
            ConversionTimeoutError: Re-raised unchanged when the conversion
                times out.
            MarkItDownError: For any other exception raised during
                validation, initialization or conversion; the original
                exception is attached as the cause.
        """
        self.logger.info("Starting file conversion", file_path=file_path)

        try:
            self._validate_file(file_path)
            markitdown = self._get_markitdown()

            # Apply timeout wrapper and warning capture for conversion
            with TimeoutHandler(
                self.config.conversion_timeout, file_path
            ), capture_openpyxl_warnings(self.logger, file_path):
                result = markitdown.convert(file_path)

            if hasattr(result, "text_content"):
                markdown_content = result.text_content
            else:
                markdown_content = str(result)

            self.logger.info(
                "File conversion completed",
                file_path=file_path,
                content_length=len(markdown_content),
                timeout_used=self.config.conversion_timeout,
            )
            return markdown_content

        except ConversionTimeoutError:
            # Re-raise timeout errors as-is
            self.logger.error(
                "File conversion timed out",
                file_path=file_path,
                timeout=self.config.conversion_timeout,
            )
            raise
        except Exception as e:
            # NOTE(review): this also wraps the validation errors raised by
            # _validate_file (and a MarkItDownError from _get_markitdown) in
            # a new MarkItDownError. Kept as-is because callers may rely on
            # it - confirm before narrowing.
            self.logger.error(
                "File conversion failed", file_path=file_path, error=str(e)
            )
            raise MarkItDownError(e, file_path) from e

    def _validate_file(self, file_path: str) -> None:
        """Validate that a file exists, is readable, is small enough and is supported.

        Args:
            file_path: Path of the file to check.

        Raises:
            FileAccessError: If the path does not exist or is not readable.
            FileSizeExceededError: If the config rejects the file's size.
            UnsupportedFileTypeError: If the file detector reports the file
                as unsupported for conversion.
        """
        if not os.path.exists(file_path):
            raise FileAccessError(f"File does not exist: {file_path}")

        if not os.access(file_path, os.R_OK):
            raise FileAccessError(f"File is not readable: {file_path}")

        file_size = os.path.getsize(file_path)
        if not self.config.is_file_size_allowed(file_size):
            raise FileSizeExceededError(file_size, self.config.max_file_size, file_path)

        if not self.file_detector.is_supported_for_conversion(file_path):
            file_info = self.file_detector.get_file_type_info(file_path)
            raise UnsupportedFileTypeError(
                file_info.get("normalized_type", "unknown"), file_path
            )

    def create_fallback_document(self, file_path: str, error: Exception) -> str:
        """Create a fallback Markdown document when conversion fails.

        Args:
            file_path: Path of the file that could not be converted.
            error: The exception describing the failure; its string form is
                embedded in the document.

        Returns:
            Markdown text containing the file name, detected type, size, path
            and the error message.
        """
        filename = Path(file_path).name
        file_info = self.file_detector.get_file_type_info(file_path)
        # A size of None cannot be formatted with ",", so treat it as 0
        # rather than letting the fallback itself fail.
        file_size = file_info.get("file_size", 0) or 0

        return f"""# {filename}

**File Information:**
- **Type**: {file_info.get("normalized_type", "unknown")}
- **Size**: {file_size:,} bytes
- **Path**: {file_path}

**Conversion Status**: ❌ Failed
**Error**: {str(error)}

*This document was created as a fallback when the original file could not be converted.*
"""