Coverage for src/qdrant_loader/core/file_conversion/file_detector.py: 92%

80 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""File type detection service using MIME type and extension-based detection.""" 

2 

3import mimetypes 

4import os 

5from pathlib import Path 

6from typing import Optional, Tuple 

7 

8from qdrant_loader.utils.logging import LoggingConfig 

9 

10from .exceptions import FileAccessError 

11 

12logger = LoggingConfig.get_logger(__name__) 

13 

14 

class FileDetector:
    """Detect file types and decide whether a file is eligible for conversion.

    Detection is name-based: the MIME type is looked up in the standard
    ``mimetypes`` registry from the file name, and the lower-cased file
    extension is used as a fallback whenever no MIME type is available.
    File contents are never read.
    """

    # MarkItDown supported file types (based on documentation).
    # Maps MIME type -> canonical (dot-less) file type name.
    SUPPORTED_MIME_TYPES = {
        # PDF files
        "application/pdf": "pdf",
        # Microsoft Office documents
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
        "application/msword": "doc",
        "application/vnd.ms-excel": "xls",
        "application/vnd.ms-powerpoint": "ppt",
        # Images
        "image/jpeg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/bmp": "bmp",
        "image/tiff": "tiff",
        "image/webp": "webp",
        # Audio files
        "audio/mpeg": "mp3",
        "audio/wav": "wav",
        "audio/x-wav": "wav",
        "audio/wave": "wav",
        # EPUB files
        "application/epub+zip": "epub",
        # ZIP archives
        "application/zip": "zip",
        "application/x-zip-compressed": "zip",
        # Plain text (for completeness)
        "text/plain": "txt",
        # CSV files
        "text/csv": "csv",
        "application/csv": "csv",
        # XML files
        "application/xml": "xml",
        "text/xml": "xml",
    }

    # Alternative spellings of supported extensions, mapped to the canonical
    # type name used as a value in SUPPORTED_MIME_TYPES. Keeping them here
    # ensures the extension fallback accepts exactly the extensions that
    # get_supported_extensions() advertises.
    EXTENSION_ALIASES = {
        "jpeg": "jpg",
        "tif": "tiff",
        "wave": "wav",
    }

    # File extensions that should be excluded (handled by existing strategies)
    EXCLUDED_EXTENSIONS = {
        ".html",
        ".htm",  # HTML strategy
        ".md",
        ".markdown",  # Markdown strategy
        ".txt",  # Base strategy for plain text
        ".json",  # JSON strategy
    }

    def __init__(self):
        """Initialize the file detector."""
        self.logger = LoggingConfig.get_logger(__name__)

        # Initialize mimetypes with additional types
        mimetypes.init()
        self._add_custom_mime_types()

    def _add_custom_mime_types(self):
        """Register Office/EPUB extensions in the ``mimetypes`` registry.

        These mappings may be missing from the platform's default tables;
        registering them makes ``mimetypes.guess_type`` resolve them
        regardless of the host configuration.
        """
        custom_types = {
            ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".doc": "application/msword",
            ".xls": "application/vnd.ms-excel",
            ".ppt": "application/vnd.ms-powerpoint",
            ".epub": "application/epub+zip",
        }

        for ext, mime_type in custom_types.items():
            mimetypes.add_type(mime_type, ext)

    @classmethod
    def _normalize_extension(cls, file_extension: Optional[str]) -> Optional[str]:
        """Map a file extension to its canonical supported type name.

        Args:
            file_extension: Extension with or without the leading dot
                (e.g. ``".jpeg"``), or None/empty when unknown.

        Returns:
            The canonical type name (e.g. ``"jpg"``) when the extension, or
            one of its known aliases, is a supported type; otherwise None.
        """
        if not file_extension:
            return None

        bare = file_extension.lstrip(".").lower()
        canonical = cls.EXTENSION_ALIASES.get(bare, bare)
        if canonical in cls.SUPPORTED_MIME_TYPES.values():
            return canonical
        return None

    def detect_file_type(self, file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Detect the MIME type and extension of a file.

        Args:
            file_path: Path to the file to analyze

        Returns:
            Tuple of (mime_type, file_extension). ``file_extension`` is the
            lower-cased suffix including the dot ("" when there is none).
            ``mime_type`` is None when it cannot be determined (for example
            the file is missing or unreadable) even though the extension is
            still returned. (None, None) is returned only when detection
            itself raises.
        """
        try:
            # Get file extension
            file_extension = Path(file_path).suffix.lower()

            # Try MIME type detection using mimetypes
            mime_type = self._detect_mime_type(file_path)

            self.logger.debug(
                "File type detection",
                file_path=file_path,
                detected_mime_type=mime_type,
                file_extension=file_extension,
            )

            return mime_type, file_extension

        except Exception as e:
            # Best-effort API: callers get (None, None) instead of an exception.
            self.logger.warning(
                "File type detection failed", file_path=file_path, error=str(e)
            )
            return None, None

    def _detect_mime_type(self, file_path: str) -> Optional[str]:
        """Detect MIME type using mimetypes module.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or None if the file does not exist, is not
            readable, or its type is unknown to ``mimetypes``.
        """
        try:
            # Check if file exists and is accessible
            if not os.path.exists(file_path):
                raise FileAccessError(f"File does not exist: {file_path}")

            if not os.access(file_path, os.R_OK):
                raise FileAccessError(f"File is not readable: {file_path}")

            # Use mimetypes module for MIME type detection
            mime_type, _ = mimetypes.guess_type(file_path)

            return mime_type

        except Exception as e:
            # Any failure (including the access errors raised above) is
            # downgraded to "unknown" so callers can fall back to the extension.
            self.logger.debug(
                "MIME type detection failed, will try extension fallback",
                file_path=file_path,
                error=str(e),
            )
            return None

    def is_supported_for_conversion(self, file_path: str) -> bool:
        """Check if file is supported for conversion.

        Excluded extensions are rejected first; otherwise the file is
        accepted when its MIME type is supported or, failing that, when its
        extension (or a known alias such as ``.jpeg``) is supported.

        Args:
            file_path: Path to the file

        Returns:
            True if file is supported for conversion, False otherwise
        """
        mime_type, file_extension = self.detect_file_type(file_path)

        # Check if extension should be excluded (handled by existing strategies)
        if file_extension in self.EXCLUDED_EXTENSIONS:
            self.logger.debug(
                "File excluded - handled by existing strategy",
                file_path=file_path,
                file_extension=file_extension,
            )
            return False

        # Check if MIME type is supported
        if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
            self.logger.debug(
                "File supported via MIME type", file_path=file_path, mime_type=mime_type
            )
            return True

        # Check if extension is supported (fallback), including aliases
        if self._normalize_extension(file_extension) is not None:
            self.logger.debug(
                "File supported via extension fallback",
                file_path=file_path,
                file_extension=file_extension,
            )
            return True

        self.logger.debug(
            "File not supported for conversion",
            file_path=file_path,
            mime_type=mime_type,
            file_extension=file_extension,
        )
        return False

    def get_file_type_info(self, file_path: str) -> dict:
        """Get comprehensive file type information.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with the keys ``file_path``, ``mime_type``,
            ``file_extension``, ``file_size`` (None when the size cannot be
            read), ``is_supported``, ``normalized_type`` (canonical type name
            or None) and ``is_excluded``.
        """
        mime_type, file_extension = self.detect_file_type(file_path)

        # Get file size; a missing/inaccessible file simply reports None
        file_size = None
        try:
            file_size = os.path.getsize(file_path)
        except OSError:
            pass

        # Determine if supported
        is_supported = self.is_supported_for_conversion(file_path)

        # Get normalized file type: MIME type wins, extension is the fallback
        if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
            normalized_type = self.SUPPORTED_MIME_TYPES[mime_type]
        else:
            normalized_type = self._normalize_extension(file_extension)

        return {
            "file_path": file_path,
            "mime_type": mime_type,
            "file_extension": file_extension,
            "file_size": file_size,
            "is_supported": is_supported,
            "normalized_type": normalized_type,
            "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS,
        }

    @classmethod
    def get_supported_extensions(cls) -> set[str]:
        """Get set of supported file extensions.

        The result is derived from SUPPORTED_MIME_TYPES plus the known
        aliases; EXCLUDED_EXTENSIONS is not taken into account here.

        Returns:
            Set of supported file extensions (with dots)
        """
        extensions = {f".{file_type}" for file_type in cls.SUPPORTED_MIME_TYPES.values()}

        # Add some common variations
        extensions.update(f".{alias}" for alias in cls.EXTENSION_ALIASES)

        return extensions

    @classmethod
    def get_supported_mime_types(cls) -> set[str]:
        """Get set of supported MIME types.

        Returns:
            Set of supported MIME types
        """
        return set(cls.SUPPORTED_MIME_TYPES)