Coverage for src / qdrant_loader / connectors / localfile / file_processor.py: 94%

99 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 09:46 +0000

1"""File processing and filtering logic for LocalFile connector.""" 

2 

3import fnmatch 

4import os 

5from typing import TYPE_CHECKING, Optional 

6 

7from qdrant_loader.utils.logging import LoggingConfig 

8 

9if TYPE_CHECKING: 

10 from qdrant_loader.core.file_conversion import FileDetector 

11 

12from .config import LocalFileConfig 

13 

14 

class LocalFileFileProcessor:
    """Handles file processing and filtering logic for local files."""

    def __init__(
        self,
        config: "LocalFileConfig",
        base_path: str,
        file_detector: Optional["FileDetector"] = None,
    ):
        """Initialize the file processor.

        Args:
            config: Local file configuration
            base_path: Base directory path
            file_detector: Optional file detector for conversion support
        """
        self.config = config
        self.base_path = base_path
        self.file_detector = file_detector
        self.logger = LoggingConfig.get_logger(__name__)

    def should_process_file(self, file_path: str) -> bool:
        """Decide whether a file passes every configured filter.

        The checks run in this order, and the first failing one rejects the
        file:

        1. The path is an existing, readable regular file.
        2. The path can be expressed relative to ``base_path``.
        3. The file name does not start with a dot.
        4. The relative path matches none of ``config.exclude_paths``.
        5. The extension matches ``config.file_types`` (an empty list accepts
           everything), or the file is reported as convertible by the file
           detector when conversion is enabled.
        6. The file is not larger than ``config.max_file_size``.
        7. The relative path matches one of ``config.include_paths`` (an
           empty list accepts everything).

        Args:
            file_path: Path of the candidate file.

        Returns:
            True if the file should be processed. False if any filter rejects
            it, or if an unexpected exception occurs while checking (the
            exception is logged, not propagated).
        """
        try:
            self.logger.debug(
                "Checking if file should be processed",
                file_path=file_path.replace("\\", "/"),
            )
            self.logger.debug(
                "Current configuration",
                file_types=self.config.file_types,
                include_paths=self.config.include_paths,
                exclude_paths=self.config.exclude_paths,
                max_file_size=self.config.max_file_size,
            )

            if not os.path.isfile(file_path):
                self.logger.debug(f"Skipping {file_path}: file does not exist")
                return False
            if not os.access(file_path, os.R_OK):
                self.logger.debug(f"Skipping {file_path}: file is not readable")
                return False

            # Handle cross-drive paths on Windows
            try:
                rel_path = os.path.relpath(file_path, self.base_path)
            except ValueError:
                # Cannot calculate relative path (e.g., cross-drive on Windows)
                # Skip this file as we cannot reliably apply include/exclude patterns
                self.logger.warning(
                    "Skipping file on different drive - cannot apply patterns",
                    file_path=file_path,
                    base_path=self.base_path,
                )
                return False

            # Normalize path separators to forward slashes for consistent matching
            rel_path = rel_path.replace("\\", "/")

            file_basename = os.path.basename(rel_path)
            if file_basename.startswith("."):
                self.logger.debug(
                    f"Skipping {rel_path}: invalid filename (starts with dot)"
                )
                return False

            if self._is_excluded(rel_path):
                return False

            if not self._has_accepted_type(file_path, rel_path, file_basename):
                self.logger.debug(
                    f"Skipping {rel_path}: does not match any file type patterns and not supported for conversion"
                )
                return False

            if os.path.getsize(file_path) > self.config.max_file_size:
                self.logger.debug(f"Skipping {rel_path}: exceeds max file size")
                return False

            return self._is_included(rel_path)
        except Exception as e:
            # Deliberate best-effort: a file that cannot be evaluated is skipped
            # rather than aborting the caller.
            self.logger.error(f"Error checking if file should be processed: {e}")
            return False

    @staticmethod
    def _is_within_dir(rel_dir: str, dir_pattern: str) -> bool:
        """Return True if ``rel_dir`` is ``dir_pattern`` or lies beneath it."""
        return rel_dir == dir_pattern or rel_dir.startswith(dir_pattern + "/")

    def _is_excluded(self, rel_path: str) -> bool:
        """Return True if ``rel_path`` matches any configured exclude pattern."""
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.exclude_paths:
            pattern = raw_pattern.lstrip("/")
            # "dir/**" and "dir/" both exclude a directory and everything below it.
            if pattern.endswith("/**"):
                dir_pattern = pattern[:-3]
            elif pattern.endswith("/"):
                dir_pattern = pattern[:-1]
            else:
                dir_pattern = None

            if dir_pattern is not None:
                if self._is_within_dir(rel_dir, dir_pattern):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Skipping {rel_path}: matches exclude pattern {pattern}"
                )
                return True
        return False

    def _has_accepted_type(
        self, file_path: str, rel_path: str, file_basename: str
    ) -> bool:
        """Return True if the file's type is configured or convertible."""
        file_ext = os.path.splitext(file_basename)[1].lower()
        self.logger.debug(f"Checking file extension: {file_ext}")

        # If no file types are configured, process all files (default behavior)
        if not self.config.file_types:
            self.logger.debug(
                "No file types configured, processing all readable files"
            )
            return True

        for pattern in self.config.file_types:
            self.logger.debug(f"Checking file type pattern: {pattern}")
            # A pattern is either a bare extension (".txt") or a glob whose
            # extension is taken from its suffix ("*.txt").
            if pattern.startswith("."):
                pattern_ext = pattern.lower()
            else:
                pattern_ext = os.path.splitext(pattern)[1].lower()

            if pattern_ext and file_ext == pattern_ext:
                self.logger.debug(
                    f"File {rel_path} matches file type pattern {pattern}"
                )
                return True

        # If file conversion is enabled and the file doesn't match configured
        # types, check if it can be converted
        if (
            self.config.enable_file_conversion
            and self.file_detector
            and self.file_detector.is_supported_for_conversion(file_path)
        ):
            self.logger.debug(f"File {rel_path} supported for conversion")
            return True
        return False

    def _is_included(self, rel_path: str) -> bool:
        """Return True if ``rel_path`` satisfies the configured include patterns."""
        if not self.config.include_paths:
            return True

        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.include_paths:
            pattern = raw_pattern.lstrip("/")
            if pattern == "":
                # "/" (or "") selects only files directly in the base directory.
                if rel_dir == "":
                    return True
                continue
            if pattern == "**/*":
                # "/**/*" loses its leading slash above; without this check it
                # would fall through to fnmatch, which demands a "/" in the
                # path and therefore rejects files directly in the base
                # directory.
                return True
            if pattern.endswith("/**/*"):
                if self._is_within_dir(rel_dir, pattern[:-5]):
                    return True
            elif pattern.endswith("/"):
                if self._is_within_dir(rel_dir, pattern[:-1]):
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                return True
        return False