Coverage for src/qdrant_loader/core/file_conversion/file_detector.py: 92%

79 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-25 11:39 +0000

1"""File type detection service using MIME type and extension-based detection.""" 

2 

3import mimetypes 

4import os 

5from pathlib import Path 

6 

7from qdrant_loader.utils.logging import LoggingConfig 

8 

9from .exceptions import FileAccessError 

10 

# Module-level logger for this module. FileDetector instances obtain their
# own logger through the same LoggingConfig.get_logger(__name__) call in
# __init__.
logger = LoggingConfig.get_logger(__name__)

12 

13 

14class FileDetector: 

15 """Service for detecting file types using MIME type and extension-based detection.""" 

16 

17 # MarkItDown supported file types (based on documentation) 

18 SUPPORTED_MIME_TYPES = { 

19 # PDF files 

20 "application/pdf": "pdf", 

21 # Microsoft Office documents 

22 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx", 

23 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx", 

24 "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx", 

25 "application/msword": "doc", 

26 "application/vnd.ms-excel": "xls", 

27 "application/vnd.ms-powerpoint": "ppt", 

28 # Images 

29 "image/jpeg": "jpg", 

30 "image/png": "png", 

31 "image/gif": "gif", 

32 "image/bmp": "bmp", 

33 "image/tiff": "tiff", 

34 "image/webp": "webp", 

35 # Audio files 

36 "audio/mpeg": "mp3", 

37 "audio/wav": "wav", 

38 "audio/x-wav": "wav", 

39 "audio/wave": "wav", 

40 # EPUB files 

41 "application/epub+zip": "epub", 

42 # ZIP archives 

43 "application/zip": "zip", 

44 "application/x-zip-compressed": "zip", 

45 # Plain text (for completeness) 

46 "text/plain": "txt", 

47 # CSV files 

48 "text/csv": "csv", 

49 "application/csv": "csv", 

50 # XML files 

51 "application/xml": "xml", 

52 "text/xml": "xml", 

53 } 

54 

55 # File extensions that should be excluded (handled by existing strategies) 

56 EXCLUDED_EXTENSIONS = { 

57 ".html", 

58 ".htm", # HTML strategy 

59 ".md", 

60 ".markdown", # Markdown strategy 

61 ".txt", # Base strategy for plain text 

62 ".json", # JSON strategy 

63 } 

64 

65 def __init__(self): 

66 """Initialize the file detector.""" 

67 self.logger = LoggingConfig.get_logger(__name__) 

68 

69 # Initialize mimetypes with additional types 

70 mimetypes.init() 

71 self._add_custom_mime_types() 

72 

73 def _add_custom_mime_types(self): 

74 """Add custom MIME type mappings for better detection.""" 

75 # Add Office document types that might not be in default mimetypes 

76 custom_types = { 

77 ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 

78 ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 

79 ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", 

80 ".doc": "application/msword", 

81 ".xls": "application/vnd.ms-excel", 

82 ".ppt": "application/vnd.ms-powerpoint", 

83 ".epub": "application/epub+zip", 

84 } 

85 

86 for ext, mime_type in custom_types.items(): 

87 mimetypes.add_type(mime_type, ext) 

88 

89 def detect_file_type(self, file_path: str) -> tuple[str | None, str | None]: 

90 """Detect file type using MIME type detection with extension fallback. 

91 

92 Args: 

93 file_path: Path to the file to analyze 

94 

95 Returns: 

96 Tuple of (mime_type, file_extension) or (None, None) if detection fails 

97 """ 

98 try: 

99 # Get file extension 

100 file_extension = Path(file_path).suffix.lower() 

101 

102 # Try MIME type detection using mimetypes 

103 mime_type = self._detect_mime_type(file_path) 

104 

105 self.logger.debug( 

106 "File type detection", 

107 file_path=file_path.replace("\\", "/"), 

108 detected_mime_type=mime_type, 

109 file_extension=file_extension, 

110 ) 

111 

112 return mime_type, file_extension 

113 

114 except Exception as e: 

115 self.logger.warning( 

116 "File type detection failed", 

117 file_path=file_path.replace("\\", "/"), 

118 error=str(e), 

119 ) 

120 return None, None 

121 

122 def _detect_mime_type(self, file_path: str) -> str | None: 

123 """Detect MIME type using mimetypes module. 

124 

125 Args: 

126 file_path: Path to the file 

127 

128 Returns: 

129 MIME type string or None if detection fails 

130 """ 

131 try: 

132 # Check if file exists and is accessible 

133 if not os.path.exists(file_path): 

134 raise FileAccessError(f"File does not exist: {file_path}") 

135 

136 if not os.access(file_path, os.R_OK): 

137 raise FileAccessError(f"File is not readable: {file_path}") 

138 

139 # Use mimetypes module for MIME type detection 

140 mime_type, _ = mimetypes.guess_type(file_path) 

141 

142 return mime_type 

143 

144 except Exception as e: 

145 self.logger.debug( 

146 "MIME type detection failed, will try extension fallback", 

147 file_path=file_path.replace("\\", "/"), 

148 error=str(e), 

149 ) 

150 return None 

151 

152 def is_supported_for_conversion(self, file_path: str) -> bool: 

153 """Check if file is supported for conversion. 

154 

155 Args: 

156 file_path: Path to the file 

157 

158 Returns: 

159 True if file is supported for conversion, False otherwise 

160 """ 

161 mime_type, file_extension = self.detect_file_type(file_path) 

162 

163 # Check if extension should be excluded (handled by existing strategies) 

164 if file_extension in self.EXCLUDED_EXTENSIONS: 

165 self.logger.debug( 

166 "File excluded - handled by existing strategy", 

167 file_path=file_path.replace("\\", "/"), 

168 file_extension=file_extension, 

169 ) 

170 return False 

171 

172 # Check if MIME type is supported 

173 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES: 

174 self.logger.debug( 

175 "File supported via MIME type", 

176 file_path=file_path.replace("\\", "/"), 

177 mime_type=mime_type, 

178 ) 

179 return True 

180 

181 # Check if extension is supported (fallback) 

182 if file_extension: 

183 extension_without_dot = file_extension.lstrip(".") 

184 supported_extensions = set(self.SUPPORTED_MIME_TYPES.values()) 

185 

186 if extension_without_dot in supported_extensions: 

187 self.logger.debug( 

188 "File supported via extension fallback", 

189 file_path=file_path.replace("\\", "/"), 

190 file_extension=file_extension, 

191 ) 

192 return True 

193 

194 self.logger.debug( 

195 "File not supported for conversion", 

196 file_path=file_path.replace("\\", "/"), 

197 mime_type=mime_type, 

198 file_extension=file_extension, 

199 ) 

200 return False 

201 

202 def get_file_type_info(self, file_path: str) -> dict: 

203 """Get comprehensive file type information. 

204 

205 Args: 

206 file_path: Path to the file 

207 

208 Returns: 

209 Dictionary with file type information 

210 """ 

211 mime_type, file_extension = self.detect_file_type(file_path) 

212 

213 # Get file size 

214 file_size = None 

215 try: 

216 file_size = os.path.getsize(file_path) 

217 except OSError: 

218 pass 

219 

220 # Determine if supported 

221 is_supported = self.is_supported_for_conversion(file_path) 

222 

223 # Get normalized file type 

224 normalized_type = None 

225 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES: 

226 normalized_type = self.SUPPORTED_MIME_TYPES[mime_type] 

227 elif file_extension: 

228 extension_without_dot = file_extension.lstrip(".") 

229 if extension_without_dot in self.SUPPORTED_MIME_TYPES.values(): 

230 normalized_type = extension_without_dot 

231 

232 return { 

233 "file_path": file_path, 

234 "mime_type": mime_type, 

235 "file_extension": file_extension, 

236 "file_size": file_size, 

237 "is_supported": is_supported, 

238 "normalized_type": normalized_type, 

239 "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS, 

240 } 

241 

242 @classmethod 

243 def get_supported_extensions(cls) -> set[str]: 

244 """Get set of supported file extensions. 

245 

246 Returns: 

247 Set of supported file extensions (with dots) 

248 """ 

249 extensions = set() 

250 for file_type in cls.SUPPORTED_MIME_TYPES.values(): 

251 extensions.add(f".{file_type}") 

252 

253 # Add some common variations 

254 extensions.update({".jpeg", ".tif", ".wave"}) 

255 

256 return extensions 

257 

258 @classmethod 

259 def get_supported_mime_types(cls) -> set[str]: 

260 """Get set of supported MIME types. 

261 

262 Returns: 

263 Set of supported MIME types 

264 """ 

265 return set(cls.SUPPORTED_MIME_TYPES.keys())