Coverage for src/qdrant_loader/core/file_conversion/file_detector.py

1"""File type detection service using MIME type and extension-based detection."""

3import mimetypes

4import os

5from pathlib import Path

7from qdrant_loader.utils.logging import LoggingConfig

9from .exceptions import FileAccessError

11logger = LoggingConfig.get_logger(__name__)

14class FileDetector:

15 """Service for detecting file types using MIME type and extension-based detection."""

17 # MarkItDown supported file types (based on documentation)

18 SUPPORTED_MIME_TYPES = {

19 # PDF files

20 "application/pdf": "pdf",

21 # Microsoft Office documents

22 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",

23 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",

24 "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",

25 "application/msword": "doc",

26 "application/vnd.ms-excel": "xls",

27 "application/vnd.ms-powerpoint": "ppt",

28 # Images

29 "image/jpeg": "jpg",

30 "image/png": "png",

31 "image/gif": "gif",

32 "image/bmp": "bmp",

33 "image/tiff": "tiff",

34 "image/webp": "webp",

35 # Audio files

36 "audio/mpeg": "mp3",

37 "audio/wav": "wav",

38 "audio/x-wav": "wav",

39 "audio/wave": "wav",

40 # EPUB files

41 "application/epub+zip": "epub",

42 # ZIP archives

43 "application/zip": "zip",

44 "application/x-zip-compressed": "zip",

45 # Plain text (for completeness)

46 "text/plain": "txt",

47 # CSV files

48 "text/csv": "csv",

49 "application/csv": "csv",

50 # XML files

51 "application/xml": "xml",

52 "text/xml": "xml",

53 }

55 # File extensions that should be excluded (handled by existing strategies)

56 EXCLUDED_EXTENSIONS = {

57 ".html",

58 ".htm", # HTML strategy

59 ".md",

60 ".markdown", # Markdown strategy

61 ".txt", # Base strategy for plain text

62 ".json", # JSON strategy

63 }

65 def __init__(self):

66 """Initialize the file detector."""

67 self.logger = LoggingConfig.get_logger(__name__)

69 # Initialize mimetypes with additional types

70 mimetypes.init()

71 self._add_custom_mime_types()

73 def _add_custom_mime_types(self):

74 """Add custom MIME type mappings for better detection."""

75 # Add Office document types that might not be in default mimetypes

76 custom_types = {

77 ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

78 ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

79 ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",

80 ".doc": "application/msword",

81 ".xls": "application/vnd.ms-excel",

82 ".ppt": "application/vnd.ms-powerpoint",

83 ".epub": "application/epub+zip",

84 }

86 for ext, mime_type in custom_types.items():

87 mimetypes.add_type(mime_type, ext)

89 def detect_file_type(self, file_path: str) -> tuple[str | None, str | None]:

90 """Detect file type using MIME type detection with extension fallback.

92 Args:

93 file_path: Path to the file to analyze

95 Returns:

96 Tuple of (mime_type, file_extension) or (None, None) if detection fails

97 """

98 try:

99 # Get file extension

100 file_extension = Path(file_path).suffix.lower()

101

102 # Try MIME type detection using mimetypes

103 mime_type = self._detect_mime_type(file_path)

104

105 self.logger.debug(

106 "File type detection",

107 file_path=file_path.replace("\\", "/"),

108 detected_mime_type=mime_type,

109 file_extension=file_extension,

110 )

111

112 return mime_type, file_extension

113

114 except Exception as e:

115 self.logger.warning(

116 "File type detection failed",

117 file_path=file_path.replace("\\", "/"),

118 error=str(e),

119 )

120 return None, None

121

122 def _detect_mime_type(self, file_path: str) -> str | None:

123 """Detect MIME type using mimetypes module.

124

125 Args:

126 file_path: Path to the file

127

128 Returns:

129 MIME type string or None if detection fails

130 """

131 try:

132 # Check if file exists and is accessible

133 if not os.path.exists(file_path):

134 raise FileAccessError(f"File does not exist: {file_path}")

135

136 if not os.access(file_path, os.R_OK):

137 raise FileAccessError(f"File is not readable: {file_path}")

138

139 # Use mimetypes module for MIME type detection

140 mime_type, _ = mimetypes.guess_type(file_path)

141

142 return mime_type

143

144 except Exception as e:

145 self.logger.debug(

146 "MIME type detection failed, will try extension fallback",

147 file_path=file_path.replace("\\", "/"),

148 error=str(e),

149 )

150 return None

151

152 def is_supported_for_conversion(self, file_path: str) -> bool:

153 """Check if file is supported for conversion.

154

155 Args:

156 file_path: Path to the file

157

158 Returns:

159 True if file is supported for conversion, False otherwise

160 """

161 mime_type, file_extension = self.detect_file_type(file_path)

162

163 # Check if extension should be excluded (handled by existing strategies)

164 if file_extension in self.EXCLUDED_EXTENSIONS:

165 self.logger.debug(

166 "File excluded - handled by existing strategy",

167 file_path=file_path.replace("\\", "/"),

168 file_extension=file_extension,

169 )

170 return False

171

172 # Check if MIME type is supported

173 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:

174 self.logger.debug(

175 "File supported via MIME type",

176 file_path=file_path.replace("\\", "/"),

177 mime_type=mime_type,

178 )

179 return True

180

181 # Check if extension is supported (fallback)

182 if file_extension:

183 extension_without_dot = file_extension.lstrip(".")

184 supported_extensions = set(self.SUPPORTED_MIME_TYPES.values())

185

186 if extension_without_dot in supported_extensions:

187 self.logger.debug(

188 "File supported via extension fallback",

189 file_path=file_path.replace("\\", "/"),

190 file_extension=file_extension,

191 )

192 return True

193

194 self.logger.debug(

195 "File not supported for conversion",

196 file_path=file_path.replace("\\", "/"),

197 mime_type=mime_type,

198 file_extension=file_extension,

199 )

200 return False

201

202 def get_file_type_info(self, file_path: str) -> dict:

203 """Get comprehensive file type information.

204

205 Args:

206 file_path: Path to the file

207

208 Returns:

209 Dictionary with file type information

210 """

211 mime_type, file_extension = self.detect_file_type(file_path)

212

213 # Get file size

214 file_size = None

215 try:

216 file_size = os.path.getsize(file_path)

217 except OSError:

218 pass

219

220 # Determine if supported

221 is_supported = self.is_supported_for_conversion(file_path)

222

223 # Get normalized file type

224 normalized_type = None

225 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:

226 normalized_type = self.SUPPORTED_MIME_TYPES[mime_type]

227 elif file_extension:

228 extension_without_dot = file_extension.lstrip(".")

229 if extension_without_dot in self.SUPPORTED_MIME_TYPES.values():

230 normalized_type = extension_without_dot

231

232 return {

233 "file_path": file_path,

234 "mime_type": mime_type,

235 "file_extension": file_extension,

236 "file_size": file_size,

237 "is_supported": is_supported,

238 "normalized_type": normalized_type,

239 "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS,

240 }

241

242 @classmethod

243 def get_supported_extensions(cls) -> set[str]:

244 """Get set of supported file extensions.

245

246 Returns:

247 Set of supported file extensions (with dots)

248 """

249 extensions = set()

250 for file_type in cls.SUPPORTED_MIME_TYPES.values():

251 extensions.add(f".{file_type}")

252

253 # Add some common variations

254 extensions.update({".jpeg", ".tif", ".wave"})

255

256 return extensions

257

258 @classmethod

259 def get_supported_mime_types(cls) -> set[str]:

260 """Get set of supported MIME types.

261

262 Returns:

263 Set of supported MIME types

264 """

265 return set(cls.SUPPORTED_MIME_TYPES.keys())

Coverage for src / qdrant_loader / core / file_conversion / file_detector.py: 92%

79 statements