Coverage for src / qdrant_loader / core / file_conversion / file_detector.py: 96%
79 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-18 04:48 +0000
1"""File type detection service using MIME type and extension-based detection."""
3import mimetypes
4import os
5from pathlib import Path
7from qdrant_loader.utils.logging import LoggingConfig
9from .exceptions import FileAccessError
# Module-level logger named after this module. FileDetector.__init__ also
# requests a logger with the same name and stores it on the instance.
logger = LoggingConfig.get_logger(__name__)
14class FileDetector:
15 """Service for detecting file types using MIME type and extension-based detection."""
17 # MarkItDown supported file types (based on documentation)
18 SUPPORTED_MIME_TYPES = {
19 # PDF files
20 "application/pdf": "pdf",
21 # Microsoft Office documents
22 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
23 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
24 "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
25 "application/vnd.ms-excel": "xls",
26 # Images
27 "image/jpeg": "jpg",
28 "image/png": "png",
29 "image/gif": "gif",
30 "image/bmp": "bmp",
31 "image/tiff": "tiff",
32 "image/webp": "webp",
33 # Audio files
34 "audio/mpeg": "mp3",
35 "audio/wav": "wav",
36 "audio/x-wav": "wav",
37 "audio/wave": "wav",
38 # EPUB files
39 "application/epub+zip": "epub",
40 # ZIP archives
41 "application/zip": "zip",
42 "application/x-zip-compressed": "zip",
43 # Plain text (for completeness)
44 "text/plain": "txt",
45 # CSV files
46 "text/csv": "csv",
47 "application/csv": "csv",
48 # XML files
49 "application/xml": "xml",
50 "text/xml": "xml",
51 }
53 # File extensions that should be excluded (handled by existing strategies)
54 EXCLUDED_EXTENSIONS = {
55 ".html",
56 ".htm", # HTML strategy
57 ".md",
58 ".markdown", # Markdown strategy
59 ".txt", # Base strategy for plain text
60 ".json", # JSON strategy
61 }
63 def __init__(self):
64 """Initialize the file detector."""
65 self.logger = LoggingConfig.get_logger(__name__)
67 # Initialize mimetypes with additional types
68 mimetypes.init()
69 self._add_custom_mime_types()
71 def _add_custom_mime_types(self):
72 """Add custom MIME type mappings for better detection."""
73 # Add Office document types that might not be in default mimetypes
74 custom_types = {
75 ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
76 ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
77 ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
78 ".xls": "application/vnd.ms-excel",
79 ".epub": "application/epub+zip",
80 }
82 for ext, mime_type in custom_types.items():
83 mimetypes.add_type(mime_type, ext)
85 def detect_file_type(self, file_path: str) -> tuple[str | None, str | None]:
86 """Detect file type using MIME type detection with extension fallback.
88 Args:
89 file_path: Path to the file to analyze
91 Returns:
92 Tuple of (mime_type, file_extension) or (None, None) if detection fails
93 """
94 try:
95 # Get file extension
96 file_extension = Path(file_path).suffix.lower()
98 # Try MIME type detection using mimetypes
99 mime_type = self._detect_mime_type(file_path)
101 self.logger.debug(
102 "File type detection",
103 file_path=file_path.replace("\\", "/"),
104 detected_mime_type=mime_type,
105 file_extension=file_extension,
106 )
108 return mime_type, file_extension
110 except Exception as e:
111 self.logger.warning(
112 "File type detection failed",
113 file_path=file_path.replace("\\", "/"),
114 error=str(e),
115 )
116 return None, None
118 def _detect_mime_type(self, file_path: str) -> str | None:
119 """Detect MIME type using mimetypes module.
121 Args:
122 file_path: Path to the file
124 Returns:
125 MIME type string or None if detection fails
126 """
127 try:
128 # Check if file exists and is accessible
129 if not os.path.exists(file_path):
130 raise FileAccessError(f"File does not exist: {file_path}")
132 if not os.access(file_path, os.R_OK):
133 raise FileAccessError(f"File is not readable: {file_path}")
135 # Use mimetypes module for MIME type detection
136 mime_type, _ = mimetypes.guess_type(file_path)
138 return mime_type
140 except Exception as e:
141 self.logger.debug(
142 "MIME type detection failed, will try extension fallback",
143 file_path=file_path.replace("\\", "/"),
144 error=str(e),
145 )
146 return None
148 def is_supported_for_conversion(self, file_path: str) -> bool:
149 """Check if file is supported for conversion.
151 Args:
152 file_path: Path to the file
154 Returns:
155 True if file is supported for conversion, False otherwise
156 """
157 mime_type, file_extension = self.detect_file_type(file_path)
159 # Check if extension should be excluded (handled by existing strategies)
160 if file_extension in self.EXCLUDED_EXTENSIONS:
161 self.logger.debug(
162 "File excluded - handled by existing strategy",
163 file_path=file_path.replace("\\", "/"),
164 file_extension=file_extension,
165 )
166 return False
168 # Check if MIME type is supported
169 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
170 self.logger.debug(
171 "File supported via MIME type",
172 file_path=file_path.replace("\\", "/"),
173 mime_type=mime_type,
174 )
175 return True
177 # Check if extension is supported (fallback)
178 if file_extension:
179 extension_without_dot = file_extension.lstrip(".")
180 supported_extensions = set(self.SUPPORTED_MIME_TYPES.values())
182 if extension_without_dot in supported_extensions:
183 self.logger.debug(
184 "File supported via extension fallback",
185 file_path=file_path.replace("\\", "/"),
186 file_extension=file_extension,
187 )
188 return True
190 self.logger.debug(
191 "File not supported for conversion",
192 file_path=file_path.replace("\\", "/"),
193 mime_type=mime_type,
194 file_extension=file_extension,
195 )
196 return False
198 def get_file_type_info(self, file_path: str) -> dict:
199 """Get comprehensive file type information.
201 Args:
202 file_path: Path to the file
204 Returns:
205 Dictionary with file type information
206 """
207 mime_type, file_extension = self.detect_file_type(file_path)
209 # Get file size
210 file_size = None
211 try:
212 file_size = os.path.getsize(file_path)
213 except OSError:
214 pass
216 # Determine if supported
217 is_supported = self.is_supported_for_conversion(file_path)
219 # Get normalized file type
220 normalized_type = None
221 if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
222 normalized_type = self.SUPPORTED_MIME_TYPES[mime_type]
223 elif file_extension:
224 extension_without_dot = file_extension.lstrip(".")
225 if extension_without_dot in self.SUPPORTED_MIME_TYPES.values():
226 normalized_type = extension_without_dot
228 return {
229 "file_path": file_path,
230 "mime_type": mime_type,
231 "file_extension": file_extension,
232 "file_size": file_size,
233 "is_supported": is_supported,
234 "normalized_type": normalized_type,
235 "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS,
236 }
238 @classmethod
239 def get_supported_extensions(cls) -> set[str]:
240 """Get set of supported file extensions.
242 Returns:
243 Set of supported file extensions (with dots)
244 """
245 extensions = set()
246 for file_type in cls.SUPPORTED_MIME_TYPES.values():
247 extensions.add(f".{file_type}")
249 # Add some common variations
250 extensions.update({".jpeg", ".tif", ".wave"})
252 return extensions
254 @classmethod
255 def get_supported_mime_types(cls) -> set[str]:
256 """Get set of supported MIME types.
258 Returns:
259 Set of supported MIME types
260 """
261 return set(cls.SUPPORTED_MIME_TYPES.keys())