Coverage for src/qdrant_loader/core/file_conversion/file_detector.py: 92%
79 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
1"""File type detection service using MIME type and extension-based detection."""
3import mimetypes
4import os
5from pathlib import Path
7from qdrant_loader.utils.logging import LoggingConfig
9from .exceptions import FileAccessError
11logger = LoggingConfig.get_logger(__name__)
14class FileDetector:
15 """Service for detecting file types using MIME type and extension-based detection."""
17 # MarkItDown supported file types (based on documentation)
18 SUPPORTED_MIME_TYPES = {
19 # PDF files
20 "application/pdf": "pdf",
21 # Microsoft Office documents
22 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
23 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
24 "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
25 "application/msword": "doc",
26 "application/vnd.ms-excel": "xls",
27 "application/vnd.ms-powerpoint": "ppt",
28 # Images
29 "image/jpeg": "jpg",
30 "image/png": "png",
31 "image/gif": "gif",
32 "image/bmp": "bmp",
33 "image/tiff": "tiff",
34 "image/webp": "webp",
35 # Audio files
36 "audio/mpeg": "mp3",
37 "audio/wav": "wav",
38 "audio/x-wav": "wav",
39 "audio/wave": "wav",
40 # EPUB files
41 "application/epub+zip": "epub",
42 # ZIP archives
43 "application/zip": "zip",
44 "application/x-zip-compressed": "zip",
45 # Plain text (for completeness)
46 "text/plain": "txt",
47 # CSV files
48 "text/csv": "csv",
49 "application/csv": "csv",
50 # XML files
51 "application/xml": "xml",
52 "text/xml": "xml",
53 }
55 # File extensions that should be excluded (handled by existing strategies)
56 EXCLUDED_EXTENSIONS = {
57 ".html",
58 ".htm", # HTML strategy
59 ".md",
60 ".markdown", # Markdown strategy
61 ".txt", # Base strategy for plain text
62 ".json", # JSON strategy
63 }
65 def __init__(self):
66 """Initialize the file detector."""
67 self.logger = LoggingConfig.get_logger(__name__)
69 # Initialize mimetypes with additional types
70 mimetypes.init()
71 self._add_custom_mime_types()
73 def _add_custom_mime_types(self):
74 """Add custom MIME type mappings for better detection."""
75 # Add Office document types that might not be in default mimetypes
76 custom_types = {
77 ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
78 ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
79 ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
80 ".doc": "application/msword",
81 ".xls": "application/vnd.ms-excel",
82 ".ppt": "application/vnd.ms-powerpoint",
83 ".epub": "application/epub+zip",
84 }
86 for ext, mime_type in custom_types.items():
87 mimetypes.add_type(mime_type, ext)
89 def detect_file_type(self, file_path: str) -> tuple[str | None, str | None]:
90 """Detect file type using MIME type detection with extension fallback.
92 Args:
93 file_path: Path to the file to analyze
95 Returns:
96 Tuple of (mime_type, file_extension) or (None, None) if detection fails
97 """
98 try:
99 # Get file extension
100 file_extension = Path(file_path).suffix.lower()
102 # Try MIME type detection using mimetypes
103 mime_type = self._detect_mime_type(file_path)
105 self.logger.debug(
106 "File type detection",
107 file_path=file_path.replace("\\", "/"),
108 detected_mime_type=mime_type,
109 file_extension=file_extension,
110 )
112 return mime_type, file_extension
114 except Exception as e:
115 self.logger.warning(
116 "File type detection failed",
117 file_path=file_path.replace("\\", "/"),
118 error=str(e),
119 )
120 return None, None
122 def _detect_mime_type(self, file_path: str) -> str | None:
123 """Detect MIME type using mimetypes module.
125 Args:
126 file_path: Path to the file
128 Returns:
129 MIME type string or None if detection fails
130 """
131 try:
132 # Check if file exists and is accessible
133 if not os.path.exists(file_path):
134 raise FileAccessError(f"File does not exist: {file_path}")
136 if not os.access(file_path, os.R_OK):
137 raise FileAccessError(f"File is not readable: {file_path}")
139 # Use mimetypes module for MIME type detection
140 mime_type, _ = mimetypes.guess_type(file_path)
142 return mime_type
144 except Exception as e:
145 self.logger.debug(
146 "MIME type detection failed, will try extension fallback",
147 file_path=file_path.replace("\\", "/"),
148 error=str(e),
149 )
150 return None
152 def is_supported_for_conversion(self, file_path: str) -> bool:
153 """Check if file is supported for conversion.
155 Args:
156 file_path: Path to the file
158 Returns:
159 True if file is supported for conversion, False otherwise
160 """
161 mime_type, file_extension = self.detect_file_type(file_path)
163 # Check if extension should be excluded (handled by existing strategies)
164 if file_extension in self.EXCLUDED_EXTENSIONS:
165 self.logger.debug(
166 "File excluded - handled by existing strategy",
167 file_path=file_path.replace("\\", "/"),
168 file_extension=file_extension,
169 )
170 return False
172 # Check if MIME type is supported
173 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
174 self.logger.debug(
175 "File supported via MIME type",
176 file_path=file_path.replace("\\", "/"),
177 mime_type=mime_type,
178 )
179 return True
181 # Check if extension is supported (fallback)
182 if file_extension:
183 extension_without_dot = file_extension.lstrip(".")
184 supported_extensions = set(self.SUPPORTED_MIME_TYPES.values())
186 if extension_without_dot in supported_extensions:
187 self.logger.debug(
188 "File supported via extension fallback",
189 file_path=file_path.replace("\\", "/"),
190 file_extension=file_extension,
191 )
192 return True
194 self.logger.debug(
195 "File not supported for conversion",
196 file_path=file_path.replace("\\", "/"),
197 mime_type=mime_type,
198 file_extension=file_extension,
199 )
200 return False
202 def get_file_type_info(self, file_path: str) -> dict:
203 """Get comprehensive file type information.
205 Args:
206 file_path: Path to the file
208 Returns:
209 Dictionary with file type information
210 """
211 mime_type, file_extension = self.detect_file_type(file_path)
213 # Get file size
214 file_size = None
215 try:
216 file_size = os.path.getsize(file_path)
217 except OSError:
218 pass
220 # Determine if supported
221 is_supported = self.is_supported_for_conversion(file_path)
223 # Get normalized file type
224 normalized_type = None
225 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
226 normalized_type = self.SUPPORTED_MIME_TYPES[mime_type]
227 elif file_extension:
228 extension_without_dot = file_extension.lstrip(".")
229 if extension_without_dot in self.SUPPORTED_MIME_TYPES.values():
230 normalized_type = extension_without_dot
232 return {
233 "file_path": file_path,
234 "mime_type": mime_type,
235 "file_extension": file_extension,
236 "file_size": file_size,
237 "is_supported": is_supported,
238 "normalized_type": normalized_type,
239 "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS,
240 }
242 @classmethod
243 def get_supported_extensions(cls) -> set[str]:
244 """Get set of supported file extensions.
246 Returns:
247 Set of supported file extensions (with dots)
248 """
249 extensions = set()
250 for file_type in cls.SUPPORTED_MIME_TYPES.values():
251 extensions.add(f".{file_type}")
253 # Add some common variations
254 extensions.update({".jpeg", ".tif", ".wave"})
256 return extensions
258 @classmethod
259 def get_supported_mime_types(cls) -> set[str]:
260 """Get set of supported MIME types.
262 Returns:
263 Set of supported MIME types
264 """
265 return set(cls.SUPPORTED_MIME_TYPES.keys())