Coverage for src / qdrant_loader / core / file_conversion / file_detector.py: 96%

79 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-18 04:48 +0000

1"""File type detection service using MIME type and extension-based detection.""" 

2 

3import mimetypes 

4import os 

5from pathlib import Path 

6 

7from qdrant_loader.utils.logging import LoggingConfig 

8 

9from .exceptions import FileAccessError 

10 

11logger = LoggingConfig.get_logger(__name__) 

12 

13 

14class FileDetector: 

15 """Service for detecting file types using MIME type and extension-based detection.""" 

16 

17 # MarkItDown supported file types (based on documentation) 

18 SUPPORTED_MIME_TYPES = { 

19 # PDF files 

20 "application/pdf": "pdf", 

21 # Microsoft Office documents 

22 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx", 

23 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx", 

24 "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx", 

25 "application/vnd.ms-excel": "xls", 

26 # Images 

27 "image/jpeg": "jpg", 

28 "image/png": "png", 

29 "image/gif": "gif", 

30 "image/bmp": "bmp", 

31 "image/tiff": "tiff", 

32 "image/webp": "webp", 

33 # Audio files 

34 "audio/mpeg": "mp3", 

35 "audio/wav": "wav", 

36 "audio/x-wav": "wav", 

37 "audio/wave": "wav", 

38 # EPUB files 

39 "application/epub+zip": "epub", 

40 # ZIP archives 

41 "application/zip": "zip", 

42 "application/x-zip-compressed": "zip", 

43 # Plain text (for completeness) 

44 "text/plain": "txt", 

45 # CSV files 

46 "text/csv": "csv", 

47 "application/csv": "csv", 

48 # XML files 

49 "application/xml": "xml", 

50 "text/xml": "xml", 

51 } 

52 

53 # File extensions that should be excluded (handled by existing strategies) 

54 EXCLUDED_EXTENSIONS = { 

55 ".html", 

56 ".htm", # HTML strategy 

57 ".md", 

58 ".markdown", # Markdown strategy 

59 ".txt", # Base strategy for plain text 

60 ".json", # JSON strategy 

61 } 

62 

63 def __init__(self): 

64 """Initialize the file detector.""" 

65 self.logger = LoggingConfig.get_logger(__name__) 

66 

67 # Initialize mimetypes with additional types 

68 mimetypes.init() 

69 self._add_custom_mime_types() 

70 

71 def _add_custom_mime_types(self): 

72 """Add custom MIME type mappings for better detection.""" 

73 # Add Office document types that might not be in default mimetypes 

74 custom_types = { 

75 ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 

76 ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 

77 ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", 

78 ".xls": "application/vnd.ms-excel", 

79 ".epub": "application/epub+zip", 

80 } 

81 

82 for ext, mime_type in custom_types.items(): 

83 mimetypes.add_type(mime_type, ext) 

84 

85 def detect_file_type(self, file_path: str) -> tuple[str | None, str | None]: 

86 """Detect file type using MIME type detection with extension fallback. 

87 

88 Args: 

89 file_path: Path to the file to analyze 

90 

91 Returns: 

92 Tuple of (mime_type, file_extension) or (None, None) if detection fails 

93 """ 

94 try: 

95 # Get file extension 

96 file_extension = Path(file_path).suffix.lower() 

97 

98 # Try MIME type detection using mimetypes 

99 mime_type = self._detect_mime_type(file_path) 

100 

101 self.logger.debug( 

102 "File type detection", 

103 file_path=file_path.replace("\\", "/"), 

104 detected_mime_type=mime_type, 

105 file_extension=file_extension, 

106 ) 

107 

108 return mime_type, file_extension 

109 

110 except Exception as e: 

111 self.logger.warning( 

112 "File type detection failed", 

113 file_path=file_path.replace("\\", "/"), 

114 error=str(e), 

115 ) 

116 return None, None 

117 

118 def _detect_mime_type(self, file_path: str) -> str | None: 

119 """Detect MIME type using mimetypes module. 

120 

121 Args: 

122 file_path: Path to the file 

123 

124 Returns: 

125 MIME type string or None if detection fails 

126 """ 

127 try: 

128 # Check if file exists and is accessible 

129 if not os.path.exists(file_path): 

130 raise FileAccessError(f"File does not exist: {file_path}") 

131 

132 if not os.access(file_path, os.R_OK): 

133 raise FileAccessError(f"File is not readable: {file_path}") 

134 

135 # Use mimetypes module for MIME type detection 

136 mime_type, _ = mimetypes.guess_type(file_path) 

137 

138 return mime_type 

139 

140 except Exception as e: 

141 self.logger.debug( 

142 "MIME type detection failed, will try extension fallback", 

143 file_path=file_path.replace("\\", "/"), 

144 error=str(e), 

145 ) 

146 return None 

147 

148 def is_supported_for_conversion(self, file_path: str) -> bool: 

149 """Check if file is supported for conversion. 

150 

151 Args: 

152 file_path: Path to the file 

153 

154 Returns: 

155 True if file is supported for conversion, False otherwise 

156 """ 

157 mime_type, file_extension = self.detect_file_type(file_path) 

158 

159 # Check if extension should be excluded (handled by existing strategies) 

160 if file_extension in self.EXCLUDED_EXTENSIONS: 

161 self.logger.debug( 

162 "File excluded - handled by existing strategy", 

163 file_path=file_path.replace("\\", "/"), 

164 file_extension=file_extension, 

165 ) 

166 return False 

167 

168 # Check if MIME type is supported 

169 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES: 

170 self.logger.debug( 

171 "File supported via MIME type", 

172 file_path=file_path.replace("\\", "/"), 

173 mime_type=mime_type, 

174 ) 

175 return True 

176 

177 # Check if extension is supported (fallback) 

178 if file_extension: 

179 extension_without_dot = file_extension.lstrip(".") 

180 supported_extensions = set(self.SUPPORTED_MIME_TYPES.values()) 

181 

182 if extension_without_dot in supported_extensions: 

183 self.logger.debug( 

184 "File supported via extension fallback", 

185 file_path=file_path.replace("\\", "/"), 

186 file_extension=file_extension, 

187 ) 

188 return True 

189 

190 self.logger.debug( 

191 "File not supported for conversion", 

192 file_path=file_path.replace("\\", "/"), 

193 mime_type=mime_type, 

194 file_extension=file_extension, 

195 ) 

196 return False 

197 

198 def get_file_type_info(self, file_path: str) -> dict: 

199 """Get comprehensive file type information. 

200 

201 Args: 

202 file_path: Path to the file 

203 

204 Returns: 

205 Dictionary with file type information 

206 """ 

207 mime_type, file_extension = self.detect_file_type(file_path) 

208 

209 # Get file size 

210 file_size = None 

211 try: 

212 file_size = os.path.getsize(file_path) 

213 except OSError: 

214 pass 

215 

216 # Determine if supported 

217 is_supported = self.is_supported_for_conversion(file_path) 

218 

219 # Get normalized file type 

220 normalized_type = None 

221 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES: 

222 normalized_type = self.SUPPORTED_MIME_TYPES[mime_type] 

223 elif file_extension: 

224 extension_without_dot = file_extension.lstrip(".") 

225 if extension_without_dot in self.SUPPORTED_MIME_TYPES.values(): 

226 normalized_type = extension_without_dot 

227 

228 return { 

229 "file_path": file_path, 

230 "mime_type": mime_type, 

231 "file_extension": file_extension, 

232 "file_size": file_size, 

233 "is_supported": is_supported, 

234 "normalized_type": normalized_type, 

235 "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS, 

236 } 

237 

238 @classmethod 

239 def get_supported_extensions(cls) -> set[str]: 

240 """Get set of supported file extensions. 

241 

242 Returns: 

243 Set of supported file extensions (with dots) 

244 """ 

245 extensions = set() 

246 for file_type in cls.SUPPORTED_MIME_TYPES.values(): 

247 extensions.add(f".{file_type}") 

248 

249 # Add some common variations 

250 extensions.update({".jpeg", ".tif", ".wave"}) 

251 

252 return extensions 

253 

254 @classmethod 

255 def get_supported_mime_types(cls) -> set[str]: 

256 """Get set of supported MIME types. 

257 

258 Returns: 

259 Set of supported MIME types 

260 """ 

261 return set(cls.SUPPORTED_MIME_TYPES.keys())