Coverage for src / qdrant_loader / connectors / git / file_processor.py: 79%
112 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 09:46 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 09:46 +0000
1"""File processing and filtering logic for Git connector."""
3import fnmatch
4import os
5from typing import TYPE_CHECKING, Optional
7from qdrant_loader.utils.logging import LoggingConfig
9if TYPE_CHECKING:
10 from qdrant_loader.connectors.git.config import GitRepoConfig
11 from qdrant_loader.core.file_conversion import FileDetector
13logger = LoggingConfig.get_logger(__name__)
class FileProcessor:
    """Decide which files of a checked-out Git repository get processed.

    The decision is driven by the repository configuration (file types,
    include/exclude path patterns, maximum file size) and, optionally, by a
    file detector that can mark additional files as convertible.
    """

    def __init__(
        self,
        config: "GitRepoConfig",
        temp_dir: str,
        file_detector: Optional["FileDetector"] = None,
    ):
        """Initialize the file processor.

        Args:
            config: Git repository configuration
            temp_dir: Temporary directory path; relative paths used for
                pattern matching are computed against it
            file_detector: Optional file detector for conversion support
        """
        self.config = config
        self.temp_dir = temp_dir
        self.file_detector = file_detector
        self.logger = LoggingConfig.get_logger(__name__)

    @staticmethod
    def _in_directory(rel_dir: str, dir_pattern: str) -> bool:
        """Return True if ``rel_dir`` is ``dir_pattern`` or nested below it."""
        return rel_dir == dir_pattern or rel_dir.startswith(dir_pattern + "/")

    def _is_excluded(self, rel_path: str, rel_dir: str) -> bool:
        """Return True if ``rel_path`` matches any configured exclude pattern."""
        for raw_pattern in self.config.exclude_paths:
            # Patterns are matched against repo-relative paths, so a leading
            # slash carries no meaning and is dropped.
            pattern = raw_pattern.lstrip("/")
            self.logger.debug(f"Checking exclude pattern: {pattern}")
            if pattern.endswith("/**"):
                if self._in_directory(rel_dir, pattern[:-3]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif pattern.endswith("/"):
                if self._in_directory(rel_dir, pattern[:-1]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Skipping {rel_path}: matches exclude pattern {pattern}"
                )
                return True
        return False

    def _matches_file_type(
        self, file_path: str, rel_path: str, file_basename: str
    ) -> bool:
        """Return True if the file has a configured extension or is convertible."""
        file_ext = os.path.splitext(file_basename)[1].lower()  # extension with dot
        self.logger.debug(f"Checking file extension: {file_ext}")

        # If no file types are configured, process all files (default behavior)
        if not self.config.file_types:
            self.logger.debug(
                "No file types configured, processing all readable files"
            )
            return True

        for pattern in self.config.file_types:
            self.logger.debug(f"Checking file type pattern: {pattern}")
            # Accept both bare extensions (".txt") and globs ("*.md" -> ".md").
            if pattern.startswith("."):
                pattern_ext = pattern.lower()
            else:
                pattern_ext = os.path.splitext(pattern)[1].lower()

            if pattern_ext and file_ext == pattern_ext:
                self.logger.debug(
                    f"File {rel_path} matches file type pattern {pattern}"
                )
                return True

        # Fall back to conversion support only when no configured type matched.
        if (
            self.config.enable_file_conversion
            and self.file_detector
            and self.file_detector.is_supported_for_conversion(file_path)
        ):
            self.logger.debug(f"File {rel_path} supported for conversion")
            return True

        return False

    def _is_included(self, rel_path: str, rel_dir: str) -> bool:
        """Return True if ``rel_path`` satisfies the configured include patterns."""
        if not self.config.include_paths:
            # If no include paths specified, include everything
            self.logger.debug("No include paths specified, including all files")
            return True

        self.logger.debug(f"Checking include patterns for directory: {rel_dir}")

        for raw_pattern in self.config.include_paths:
            pattern = raw_pattern.lstrip("/")
            self.logger.debug(f"Checking include pattern: {pattern}")

            if pattern == "":
                # "" or "/" selects only files directly in the repository root.
                if rel_dir == "":
                    self.logger.debug(f"Including {rel_path}: matches root pattern")
                    return True
                continue

            if pattern == "**/*":
                # "/**/*" loses its leading slash above; without this check it
                # would fall through to fnmatch, which requires a "/" in the
                # path and therefore rejects files in the repository root.
                self.logger.debug(
                    f"Including {rel_path}: matches root /**/* pattern"
                )
                return True

            if pattern.endswith("/**/*"):
                if self._in_directory(rel_dir, pattern[:-5]):
                    self.logger.debug(
                        f"Including {rel_path}: matches directory pattern {pattern}"
                    )
                    return True
            elif pattern.endswith("/"):
                if self._in_directory(rel_dir, pattern[:-1]):
                    self.logger.debug(
                        f"Including {rel_path}: matches directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Including {rel_path}: matches exact pattern {pattern}"
                )
                return True

        # If we have include patterns but none matched, exclude the file
        self.logger.debug(f"Skipping {rel_path}: not in include paths")
        return False

    def should_process_file(self, file_path: str) -> bool:
        """Check if a file should be processed based on configuration.

        Checks run in this order: the file exists and is readable, its name
        does not start with a dot, no exclude pattern matches, the file type
        matches (or the file is convertible), the size is within the limit,
        and finally an include pattern matches (when any are configured).

        Args:
            file_path: Path to the file

        Returns:
            True if the file should be processed, False otherwise. Any
            unexpected exception is logged and reported as False.
        """
        try:
            self.logger.debug(
                "Checking if file should be processed",
                file_path=file_path.replace("\\", "/"),
            )
            self.logger.debug(
                "Current configuration",
                file_types=self.config.file_types,
                include_paths=self.config.include_paths,
                exclude_paths=self.config.exclude_paths,
                max_file_size=self.config.max_file_size,
            )

            # Check if file exists and is readable
            if not os.path.isfile(file_path):
                self.logger.debug(f"Skipping {file_path}: file does not exist")
                return False
            if not os.access(file_path, os.R_OK):
                self.logger.debug(f"Skipping {file_path}: file is not readable")
                return False

            # relpath raises ValueError when the two paths are on different
            # drives (Windows); patterns cannot be applied reliably then.
            try:
                rel_path = os.path.relpath(file_path, self.temp_dir)
            except ValueError:
                self.logger.warning(
                    "Skipping file on different drive - cannot apply patterns",
                    file_path=file_path,
                    base_path=self.temp_dir,
                )
                return False

            # Normalize path separators to forward slashes for consistent matching
            rel_path = rel_path.replace("\\", "/")
            self.logger.debug(f"Relative path: {rel_path}")

            # Skip files whose name starts with a dot (e.g. ".md")
            file_basename = os.path.basename(rel_path)
            if file_basename.startswith("."):
                self.logger.debug(
                    f"Skipping {rel_path}: invalid filename (starts with dot)"
                )
                return False

            # Directory of the file relative to the repo root, computed once
            # and shared by the exclude and include checks.
            rel_dir = os.path.dirname(rel_path)

            if self._is_excluded(rel_path, rel_dir):
                return False

            if not self._matches_file_type(file_path, rel_path, file_basename):
                self.logger.debug(
                    f"Skipping {rel_path}: does not match any file type patterns and not supported for conversion"
                )
                return False

            # Check file size
            file_size = os.path.getsize(file_path)
            self.logger.debug(
                f"File size: {file_size} bytes (max: {self.config.max_file_size})"
            )
            if file_size > self.config.max_file_size:
                self.logger.debug(f"Skipping {rel_path}: exceeds max file size")
                return False

            return self._is_included(rel_path, rel_dir)

        except Exception as e:
            # Best-effort filter: a failure to evaluate a file means "skip it".
            self.logger.error(f"Error checking if file should be processed: {e}")
            return False