Coverage for src / qdrant_loader / connectors / localfile / file_processor.py: 94%
99 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 09:46 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 09:46 +0000
1"""File processing and filtering logic for LocalFile connector."""
3import fnmatch
4import os
5from typing import TYPE_CHECKING, Optional
7from qdrant_loader.utils.logging import LoggingConfig
9if TYPE_CHECKING:
10 from qdrant_loader.core.file_conversion import FileDetector
12from .config import LocalFileConfig
class LocalFileFileProcessor:
    """Handles file processing and filtering logic for local files.

    A file is accepted only if it passes, in order: existence/readability
    checks, the hidden-file check, the exclude patterns, the file type (or
    conversion) check, the size limit and finally the include patterns.
    All pattern matching is done on the path relative to ``base_path`` with
    forward slashes as separators.
    """

    def __init__(
        self,
        config: "LocalFileConfig",
        base_path: str,
        file_detector: Optional["FileDetector"] = None,
    ):
        """Initialize the file processor.

        Args:
            config: Local file configuration
            base_path: Base directory path
            file_detector: Optional file detector for conversion support
        """
        self.config = config
        self.base_path = base_path
        self.file_detector = file_detector
        self.logger = LoggingConfig.get_logger(__name__)

    def should_process_file(self, file_path: str) -> bool:
        """Decide whether ``file_path`` passes every configured filter.

        Args:
            file_path: Path of the candidate file.

        Returns:
            True if the file exists, is readable, is not hidden, is not
            excluded, matches a configured file type (or is convertible),
            does not exceed ``max_file_size`` and matches the include
            patterns (when any are configured). False otherwise, including
            when an unexpected error occurs (the error is logged).
        """
        try:
            self.logger.debug(
                "Checking if file should be processed",
                file_path=file_path.replace("\\", "/"),
            )
            self.logger.debug(
                "Current configuration",
                file_types=self.config.file_types,
                include_paths=self.config.include_paths,
                exclude_paths=self.config.exclude_paths,
                max_file_size=self.config.max_file_size,
            )

            if not os.path.isfile(file_path):
                self.logger.debug(f"Skipping {file_path}: file does not exist")
                return False
            if not os.access(file_path, os.R_OK):
                self.logger.debug(f"Skipping {file_path}: file is not readable")
                return False

            # Handle cross-drive paths on Windows
            try:
                rel_path = os.path.relpath(file_path, self.base_path)
            except ValueError:
                # Cannot calculate relative path (e.g., cross-drive on Windows)
                # Skip this file as we cannot reliably apply include/exclude patterns
                self.logger.warning(
                    "Skipping file on different drive - cannot apply patterns",
                    file_path=file_path,
                    base_path=self.base_path,
                )
                return False

            # Normalize path separators to forward slashes for consistent matching
            rel_path = rel_path.replace("\\", "/")

            file_basename = os.path.basename(rel_path)
            if file_basename.startswith("."):
                self.logger.debug(
                    f"Skipping {rel_path}: invalid filename (starts with dot)"
                )
                return False

            if self._is_excluded(rel_path):
                return False

            if not self._matches_file_type(file_path, rel_path, file_basename):
                self.logger.debug(
                    f"Skipping {rel_path}: does not match any file type patterns and not supported for conversion"
                )
                return False

            file_size = os.path.getsize(file_path)
            if file_size > self.config.max_file_size:
                self.logger.debug(f"Skipping {rel_path}: exceeds max file size")
                return False

            # No include patterns configured means everything is included.
            if not self.config.include_paths:
                return True
            return self._is_included(rel_path)
        except Exception as e:
            # Deliberate best-effort: any failure means "do not process".
            self.logger.error(f"Error checking if file should be processed: {e}")
            return False

    @staticmethod
    def _is_within_dir(rel_dir: str, dir_pattern: str) -> bool:
        """Return True if ``rel_dir`` is ``dir_pattern`` or one of its subdirectories."""
        return rel_dir == dir_pattern or rel_dir.startswith(dir_pattern + "/")

    def _is_excluded(self, rel_path: str) -> bool:
        """Return True if ``rel_path`` matches any configured exclude pattern.

        Patterns ending in ``/**`` or ``/`` exclude a whole directory tree;
        every other pattern is matched against the full relative path with
        ``fnmatch``. Leading slashes on patterns are ignored.
        """
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.exclude_paths:
            pattern = raw_pattern.lstrip("/")
            if pattern.endswith("/**"):
                dir_pattern = pattern[:-3]
            elif pattern.endswith("/"):
                dir_pattern = pattern[:-1]
            else:
                dir_pattern = None

            if dir_pattern is not None:
                if self._is_within_dir(rel_dir, dir_pattern):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Skipping {rel_path}: matches exclude pattern {pattern}"
                )
                return True
        return False

    def _matches_file_type(
        self, file_path: str, rel_path: str, file_basename: str
    ) -> bool:
        """Return True if the file's extension is accepted.

        With no ``file_types`` configured every file is accepted. Otherwise
        the extension must equal (case-insensitively) the extension of one of
        the configured patterns; failing that, the file is accepted when file
        conversion is enabled and the detector reports it as convertible.
        """
        file_ext = os.path.splitext(file_basename)[1].lower()
        self.logger.debug(f"Checking file extension: {file_ext}")

        # If no file types are configured, process all files (default behavior)
        if not self.config.file_types:
            self.logger.debug(
                "No file types configured, processing all readable files"
            )
            return True

        for pattern in self.config.file_types:
            self.logger.debug(f"Checking file type pattern: {pattern}")
            # Patterns may be a bare extension (".txt") or a glob ("*.txt").
            if pattern.startswith("."):
                pattern_ext = pattern.lower()
            else:
                pattern_ext = os.path.splitext(pattern)[1].lower()

            if pattern_ext and file_ext == pattern_ext:
                self.logger.debug(
                    f"File {rel_path} matches file type pattern {pattern}"
                )
                return True

        # Not a configured type: fall back to conversion support, if enabled.
        if self.config.enable_file_conversion and self.file_detector:
            if self.file_detector.is_supported_for_conversion(file_path):
                self.logger.debug(f"File {rel_path} supported for conversion")
                return True
        return False

    def _is_included(self, rel_path: str) -> bool:
        """Return True if ``rel_path`` matches any configured include pattern.

        Leading slashes on patterns are ignored. An empty pattern (``""`` or
        ``"/"``) selects files directly in the base directory, ``**/*``
        selects every file, ``dir/**/*`` and ``dir/`` select everything below
        ``dir``, and any other pattern is matched with ``fnmatch``.
        """
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.include_paths:
            pattern = raw_pattern.lstrip("/")

            if pattern == "":
                # Root-only pattern: accept files that sit directly in base_path.
                if rel_dir == "":
                    return True
                continue

            if pattern == "**/*":
                # Match-all. Handled explicitly because fnmatch would demand a
                # literal "/" in the path and so reject files in the base
                # directory itself.
                return True

            if pattern.endswith("/**/*"):
                if self._is_within_dir(rel_dir, pattern[:-5]):
                    return True
            elif pattern.endswith("/"):
                if self._is_within_dir(rel_dir, pattern[:-1]):
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                return True
        return False