Coverage for src / qdrant_loader / connectors / git / file_processor.py: 79%

112 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 09:46 +0000

1"""File processing and filtering logic for Git connector.""" 

2 

3import fnmatch 

4import os 

5from typing import TYPE_CHECKING, Optional 

6 

7from qdrant_loader.utils.logging import LoggingConfig 

8 

9if TYPE_CHECKING: 

10 from qdrant_loader.connectors.git.config import GitRepoConfig 

11 from qdrant_loader.core.file_conversion import FileDetector 

12 

13logger = LoggingConfig.get_logger(__name__) 

14 

15 

class FileProcessor:
    """Handles file processing and filtering logic.

    A file is accepted only if it passes, in order: existence/readability
    checks, the dot-file filter, the exclude patterns, the file-type (or
    conversion-support) check, the size limit and finally the include
    patterns. All pattern matching is done on the path relative to
    ``temp_dir`` with forward slashes as separators.
    """

    def __init__(
        self,
        config: "GitRepoConfig",
        temp_dir: str,
        file_detector: Optional["FileDetector"] = None,
    ):
        """Initialize the file processor.

        Args:
            config: Git repository configuration
            temp_dir: Temporary directory path; relative paths used for
                pattern matching are computed against this directory
            file_detector: Optional file detector for conversion support
        """
        self.config = config
        self.temp_dir = temp_dir
        self.file_detector = file_detector
        self.logger = LoggingConfig.get_logger(__name__)

    def should_process_file(self, file_path: str) -> bool:
        """Check if a file should be processed based on configuration.

        Args:
            file_path: Path to the file

        Returns:
            True if the file should be processed, False otherwise. Any
            unexpected error while checking is logged and reported as False.
        """
        try:
            self.logger.debug(
                "Checking if file should be processed",
                file_path=file_path.replace("\\", "/"),
            )
            self.logger.debug(
                "Current configuration",
                file_types=self.config.file_types,
                include_paths=self.config.include_paths,
                exclude_paths=self.config.exclude_paths,
                max_file_size=self.config.max_file_size,
            )

            # Check if file exists and is readable
            if not os.path.isfile(file_path):
                self.logger.debug(f"Skipping {file_path}: file does not exist")
                return False
            if not os.access(file_path, os.R_OK):
                self.logger.debug(f"Skipping {file_path}: file is not readable")
                return False

            # Get relative path from repository root.
            # os.path.relpath raises ValueError on Windows when the two paths
            # live on different drives; patterns cannot be applied reliably
            # in that case, so the file is skipped.
            try:
                rel_path = os.path.relpath(file_path, self.temp_dir)
            except ValueError:
                self.logger.warning(
                    "Skipping file on different drive - cannot apply patterns",
                    file_path=file_path,
                    base_path=self.temp_dir,
                )
                return False

            # Normalize path separators to forward slashes for consistent matching
            rel_path = rel_path.replace("\\", "/")
            self.logger.debug(f"Relative path: {rel_path}")

            # Skip files whose name starts with a dot (e.g. ".md")
            file_basename = os.path.basename(rel_path)
            if file_basename.startswith("."):
                self.logger.debug(
                    f"Skipping {rel_path}: invalid filename (starts with dot)"
                )
                return False

            # Exclude patterns take precedence over everything that follows
            if self._is_excluded(rel_path):
                return False

            if not self._matches_file_type(file_path, rel_path, file_basename):
                self.logger.debug(
                    f"Skipping {rel_path}: does not match any file type patterns and not supported for conversion"
                )
                return False

            # Check file size
            file_size = os.path.getsize(file_path)
            self.logger.debug(
                f"File size: {file_size} bytes (max: {self.config.max_file_size})"
            )
            if file_size > self.config.max_file_size:
                self.logger.debug(f"Skipping {rel_path}: exceeds max file size")
                return False

            return self._is_included(rel_path)

        except Exception as e:
            # Deliberate best-effort boundary: a failure to evaluate a file
            # must not abort the whole run, so it is logged and the file skipped.
            self.logger.error(f"Error checking if file should be processed: {e}")
            return False

    @staticmethod
    def _is_within_dir(rel_dir: str, dir_pattern: str) -> bool:
        """Return True if rel_dir is dir_pattern itself or nested below it."""
        return rel_dir == dir_pattern or rel_dir.startswith(dir_pattern + "/")

    def _is_excluded(self, rel_path: str) -> bool:
        """Return True if rel_path matches any configured exclude pattern."""
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.exclude_paths:
            pattern = raw_pattern.lstrip("/")
            self.logger.debug(f"Checking exclude pattern: {pattern}")
            if pattern.endswith("/**"):
                # "dir/**" excludes everything inside dir, at any depth
                if self._is_within_dir(rel_dir, pattern[:-3]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif pattern.endswith("/"):
                # "dir/" behaves the same as "dir/**"
                if self._is_within_dir(rel_dir, pattern[:-1]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Skipping {rel_path}: matches exclude pattern {pattern}"
                )
                return True
        return False

    def _matches_file_type(
        self, file_path: str, rel_path: str, file_basename: str
    ) -> bool:
        """Return True if the file's extension is configured or it is convertible."""
        # Extension including the leading dot, compared case-insensitively
        file_ext = os.path.splitext(file_basename)[1].lower()
        self.logger.debug(f"Checking file extension: {file_ext}")

        # If no file types are configured, process all files (default behavior)
        if not self.config.file_types:
            self.logger.debug(
                "No file types configured, processing all readable files"
            )
            return True

        for pattern in self.config.file_types:
            self.logger.debug(f"Checking file type pattern: {pattern}")
            # Accept both ".txt" and glob-style "*.md" (-> ".md") spellings
            if pattern.startswith("."):
                pattern_ext = pattern.lower()
            else:
                pattern_ext = os.path.splitext(pattern)[1].lower()

            if pattern_ext and file_ext == pattern_ext:
                self.logger.debug(
                    f"File {rel_path} matches file type pattern {pattern}"
                )
                return True

        # Fall back to conversion support only when no configured type
        # matched, so the detector is not consulted unnecessarily.
        if (
            self.config.enable_file_conversion
            and self.file_detector
            and self.file_detector.is_supported_for_conversion(file_path)
        ):
            self.logger.debug(f"File {rel_path} supported for conversion")
            return True

        return False

    def _is_included(self, rel_path: str) -> bool:
        """Return True if rel_path satisfies the configured include patterns."""
        if not self.config.include_paths:
            # If no include paths specified, include everything
            self.logger.debug("No include paths specified, including all files")
            return True

        # Get the file's directory relative to repo root
        rel_dir = os.path.dirname(rel_path)
        self.logger.debug(f"Checking include patterns for directory: {rel_dir}")

        for raw_pattern in self.config.include_paths:
            pattern = raw_pattern.lstrip("/")
            self.logger.debug(f"Checking include pattern: {pattern}")

            if pattern == "":
                # "" or "/" means include only files in the root directory
                if rel_dir == "":
                    self.logger.debug(f"Including {rel_path}: matches root pattern")
                    return True
                continue

            if pattern == "**/*":
                # Root "/**/*" loses its leading slash during normalisation.
                # It must be recognised here: fnmatch would only match paths
                # containing a "/" and so would drop files in the root.
                self.logger.debug(
                    f"Including {rel_path}: matches root /**/* pattern"
                )
                return True

            if pattern.endswith("/**/*"):
                # "dir/**/*" includes everything inside dir, at any depth
                if self._is_within_dir(rel_dir, pattern[:-5]):
                    self.logger.debug(
                        f"Including {rel_path}: matches directory pattern {pattern}"
                    )
                    return True
            elif pattern.endswith("/"):
                # "dir/" behaves the same as "dir/**/*"
                if self._is_within_dir(rel_dir, pattern[:-1]):
                    self.logger.debug(
                        f"Including {rel_path}: matches directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Including {rel_path}: matches exact pattern {pattern}"
                )
                return True

        # If we have include patterns but none matched, exclude the file
        self.logger.debug(f"Skipping {rel_path}: not in include paths")
        return False