Coverage for src/qdrant_loader/core/file_conversion/file_detector.py: 92%
80 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""File type detection service using MIME type and extension-based detection."""
3import mimetypes
4import os
5from pathlib import Path
6from typing import Optional, Tuple
8from qdrant_loader.utils.logging import LoggingConfig
10from .exceptions import FileAccessError
12logger = LoggingConfig.get_logger(__name__)
class FileDetector:
    """Service for detecting file types using MIME type and extension-based detection.

    MIME types are looked up with the standard-library ``mimetypes`` module and
    the lower-cased file extension is used as a fallback whenever no supported
    MIME type is found.
    """

    # MarkItDown supported file types (based on documentation).
    # Maps a MIME type to the canonical (dot-less) file type name.
    SUPPORTED_MIME_TYPES = {
        # PDF files
        "application/pdf": "pdf",
        # Microsoft Office documents
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
        "application/msword": "doc",
        "application/vnd.ms-excel": "xls",
        "application/vnd.ms-powerpoint": "ppt",
        # Images
        "image/jpeg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
        "image/bmp": "bmp",
        "image/tiff": "tiff",
        "image/webp": "webp",
        # Audio files
        "audio/mpeg": "mp3",
        "audio/wav": "wav",
        "audio/x-wav": "wav",
        "audio/wave": "wav",
        # EPUB files
        "application/epub+zip": "epub",
        # ZIP archives
        "application/zip": "zip",
        "application/x-zip-compressed": "zip",
        # Plain text (for completeness)
        "text/plain": "txt",
        # CSV files
        "text/csv": "csv",
        "application/csv": "csv",
        # XML files
        "application/xml": "xml",
        "text/xml": "xml",
    }

    # File extensions that should be excluded (handled by existing strategies)
    EXCLUDED_EXTENSIONS = {
        ".html",
        ".htm",  # HTML strategy
        ".md",
        ".markdown",  # Markdown strategy
        ".txt",  # Base strategy for plain text
        ".json",  # JSON strategy
    }

    # Alternate spellings of supported extensions (dot-less), mapped to the
    # canonical type name used as a value in SUPPORTED_MIME_TYPES. Keeping them
    # in one table makes the extension fallback agree with
    # get_supported_extensions().
    _EXTENSION_ALIASES = {
        "jpeg": "jpg",
        "tif": "tiff",
        "wave": "wav",
    }

    def __init__(self):
        """Initialize the file detector and register extra MIME type mappings."""
        self.logger = LoggingConfig.get_logger(__name__)

        # Initialize mimetypes with additional types
        mimetypes.init()
        self._add_custom_mime_types()

    def _add_custom_mime_types(self):
        """Add custom MIME type mappings for better detection."""
        # Add Office document types that might not be in default mimetypes
        custom_types = {
            ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".doc": "application/msword",
            ".xls": "application/vnd.ms-excel",
            ".ppt": "application/vnd.ms-powerpoint",
            ".epub": "application/epub+zip",
        }

        for ext, mime_type in custom_types.items():
            mimetypes.add_type(mime_type, ext)

    @classmethod
    def _normalized_type_for_extension(
        cls, file_extension: Optional[str]
    ) -> Optional[str]:
        """Map a file extension to its canonical supported type name.

        Args:
            file_extension: Extension with or without leading dot(s), or None.

        Returns:
            The canonical type name (e.g. ``"jpg"`` for ``".jpeg"``) when the
            extension is supported, otherwise None.
        """
        if not file_extension:
            return None

        bare_extension = file_extension.lstrip(".")
        # Fold alternate spellings (.jpeg/.tif/.wave) onto the canonical name.
        canonical = cls._EXTENSION_ALIASES.get(bare_extension, bare_extension)
        if canonical in cls.SUPPORTED_MIME_TYPES.values():
            return canonical
        return None

    def detect_file_type(self, file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Detect file type using MIME type detection with extension fallback.

        Args:
            file_path: Path to the file to analyze

        Returns:
            Tuple of (mime_type, file_extension). ``mime_type`` is None when
            the MIME lookup fails (for example the file is missing or not
            readable); ``file_extension`` is the lower-cased suffix including
            the dot. ``(None, None)`` is returned only if an unexpected error
            occurs while detecting.
        """
        try:
            # Get file extension
            file_extension = Path(file_path).suffix.lower()

            # Try MIME type detection using mimetypes
            mime_type = self._detect_mime_type(file_path)

            self.logger.debug(
                "File type detection",
                file_path=file_path,
                detected_mime_type=mime_type,
                file_extension=file_extension,
            )

            return mime_type, file_extension

        except Exception as e:
            self.logger.warning(
                "File type detection failed", file_path=file_path, error=str(e)
            )
            return None, None

    def _detect_mime_type(self, file_path: str) -> Optional[str]:
        """Detect MIME type using mimetypes module.

        Any failure, including the file being missing or unreadable, is logged
        at debug level and reported as None rather than raised.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string or None if detection fails
        """
        try:
            # Check if file exists and is accessible
            if not os.path.exists(file_path):
                raise FileAccessError(f"File does not exist: {file_path}")

            if not os.access(file_path, os.R_OK):
                raise FileAccessError(f"File is not readable: {file_path}")

            # Use mimetypes module for MIME type detection
            mime_type, _ = mimetypes.guess_type(file_path)

            return mime_type

        except Exception as e:
            self.logger.debug(
                "MIME type detection failed, will try extension fallback",
                file_path=file_path,
                error=str(e),
            )
            return None

    def is_supported_for_conversion(self, file_path: str) -> bool:
        """Check if file is supported for conversion.

        Excluded extensions always win; otherwise the file is supported when
        either its MIME type or its extension (including the alternate
        spellings ``.jpeg``, ``.tif`` and ``.wave``) is recognised.

        Args:
            file_path: Path to the file

        Returns:
            True if file is supported for conversion, False otherwise
        """
        mime_type, file_extension = self.detect_file_type(file_path)

        # Check if extension should be excluded (handled by existing strategies)
        if file_extension in self.EXCLUDED_EXTENSIONS:
            self.logger.debug(
                "File excluded - handled by existing strategy",
                file_path=file_path,
                file_extension=file_extension,
            )
            return False

        # Check if MIME type is supported
        if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
            self.logger.debug(
                "File supported via MIME type", file_path=file_path, mime_type=mime_type
            )
            return True

        # Check if extension is supported (fallback). The helper also accepts
        # the alternate spellings advertised by get_supported_extensions().
        if self._normalized_type_for_extension(file_extension) is not None:
            self.logger.debug(
                "File supported via extension fallback",
                file_path=file_path,
                file_extension=file_extension,
            )
            return True

        self.logger.debug(
            "File not supported for conversion",
            file_path=file_path,
            mime_type=mime_type,
            file_extension=file_extension,
        )
        return False

    def get_file_type_info(self, file_path: str) -> dict:
        """Get comprehensive file type information.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with the keys ``file_path``, ``mime_type``,
            ``file_extension``, ``file_size`` (None when the size cannot be
            read), ``is_supported``, ``normalized_type`` and ``is_excluded``.
        """
        mime_type, file_extension = self.detect_file_type(file_path)

        # Get file size; best effort, stays None if the file cannot be stat'ed
        file_size = None
        try:
            file_size = os.path.getsize(file_path)
        except OSError:
            pass

        # Determine if supported
        is_supported = self.is_supported_for_conversion(file_path)

        # Get normalized file type: prefer the MIME mapping, then the extension
        if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
            normalized_type = self.SUPPORTED_MIME_TYPES[mime_type]
        else:
            normalized_type = self._normalized_type_for_extension(file_extension)

        return {
            "file_path": file_path,
            "mime_type": mime_type,
            "file_extension": file_extension,
            "file_size": file_size,
            "is_supported": is_supported,
            "normalized_type": normalized_type,
            "is_excluded": file_extension in self.EXCLUDED_EXTENSIONS,
        }

    @classmethod
    def get_supported_extensions(cls) -> set[str]:
        """Get set of supported file extensions.

        Returns:
            Set of supported file extensions (with dots), including the
            alternate spellings ``.jpeg``, ``.tif`` and ``.wave``.
        """
        extensions = {
            f".{file_type}" for file_type in cls.SUPPORTED_MIME_TYPES.values()
        }

        # Add some common variations
        extensions.update(f".{alias}" for alias in cls._EXTENSION_ALIASES)

        return extensions

    @classmethod
    def get_supported_mime_types(cls) -> set[str]:
        """Get set of supported MIME types.

        Returns:
            Set of supported MIME types
        """
        return set(cls.SUPPORTED_MIME_TYPES.keys())