Coverage for src/qdrant_loader/connectors/localfile/file_processor.py: 51%
96 statements
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
« prev ^ index » next coverage.py v7.10.0, created at 2025-07-25 11:39 +0000
1"""File processing and filtering logic for LocalFile connector."""
3import fnmatch
4import os
5from typing import TYPE_CHECKING, Optional
7from qdrant_loader.utils.logging import LoggingConfig
9if TYPE_CHECKING:
10 from qdrant_loader.core.file_conversion import FileDetector
12from .config import LocalFileConfig
class LocalFileFileProcessor:
    """Handles file processing and filtering logic for local files.

    A file is accepted only when it passes, in order: existence/readability
    checks, the hidden-file check, the exclude patterns, the file-type (or
    conversion) check, the size limit and finally the include patterns.
    """

    def __init__(
        self,
        config: LocalFileConfig,
        base_path: str,
        file_detector: Optional["FileDetector"] = None,
    ):
        """Initialize the file processor.

        Args:
            config: Local file configuration
            base_path: Base directory path
            file_detector: Optional file detector for conversion support
        """
        self.config = config
        self.base_path = base_path
        self.file_detector = file_detector
        self.logger = LoggingConfig.get_logger(__name__)

    def should_process_file(self, file_path: str) -> bool:
        """Decide whether ``file_path`` should be ingested.

        Args:
            file_path: Path of the candidate file on disk.

        Returns:
            True when the file exists, is readable, is not hidden, is not
            excluded, matches a configured file type (or is convertible),
            does not exceed ``max_file_size`` and matches the include
            patterns (if any are configured). False otherwise, including
            when any unexpected error occurs while checking.
        """
        # Filtering is best-effort: any failure (e.g. relpath across drives,
        # a file vanishing mid-check) is logged and treated as "skip".
        try:
            self.logger.debug(
                "Checking if file should be processed",
                file_path=file_path.replace("\\", "/"),
            )
            self.logger.debug(
                "Current configuration",
                file_types=self.config.file_types,
                include_paths=self.config.include_paths,
                exclude_paths=self.config.exclude_paths,
                max_file_size=self.config.max_file_size,
            )

            if not os.path.isfile(file_path):
                self.logger.debug(f"Skipping {file_path}: file does not exist")
                return False
            if not os.access(file_path, os.R_OK):
                self.logger.debug(f"Skipping {file_path}: file is not readable")
                return False

            rel_path = self._relative_path(file_path)
            file_basename = os.path.basename(rel_path)
            if file_basename.startswith("."):
                self.logger.debug(
                    f"Skipping {rel_path}: invalid filename (starts with dot)"
                )
                return False

            if self._is_excluded(rel_path):
                return False

            if not self._matches_file_type(file_path, rel_path, file_basename):
                self.logger.debug(
                    f"Skipping {rel_path}: does not match any file type patterns and not supported for conversion"
                )
                return False

            file_size = os.path.getsize(file_path)
            if file_size > self.config.max_file_size:
                self.logger.debug(f"Skipping {rel_path}: exceeds max file size")
                return False

            # No include patterns means "include everything not excluded".
            if not self.config.include_paths:
                return True
            return self._is_included(rel_path)
        except Exception as e:
            self.logger.error(f"Error checking if file should be processed: {e}")
            return False

    def _relative_path(self, file_path: str) -> str:
        """Return ``file_path`` relative to the base path, using ``/`` separators."""
        rel_path = os.path.relpath(file_path, self.base_path)
        # Patterns are written with "/" separators; on platforms whose native
        # separator differs (Windows) normalise so directory matching works.
        # On POSIX this is a no-op, so file names containing "\" are untouched.
        if os.sep != "/":
            rel_path = rel_path.replace(os.sep, "/")
        return rel_path

    @staticmethod
    def _in_directory(rel_dir: str, dir_pattern: str) -> bool:
        """Return True if ``rel_dir`` is ``dir_pattern`` or lies beneath it."""
        return rel_dir == dir_pattern or rel_dir.startswith(dir_pattern + "/")

    def _is_excluded(self, rel_path: str) -> bool:
        """Return True (and log why) if ``rel_path`` matches an exclude pattern."""
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.exclude_paths:
            pattern = raw_pattern.lstrip("/")
            if pattern.endswith("/**"):
                # "dir/**" excludes everything under dir, at any depth.
                if self._in_directory(rel_dir, pattern[:-3]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif pattern.endswith("/"):
                # "dir/" is treated the same way as "dir/**".
                if self._in_directory(rel_dir, pattern[:-1]):
                    self.logger.debug(
                        f"Skipping {rel_path}: matches exclude directory pattern {pattern}"
                    )
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                self.logger.debug(
                    f"Skipping {rel_path}: matches exclude pattern {pattern}"
                )
                return True
        return False

    def _matches_file_type(
        self, file_path: str, rel_path: str, file_basename: str
    ) -> bool:
        """Return True if the file's extension is configured or it is convertible."""
        file_ext = os.path.splitext(file_basename)[1].lower()
        self.logger.debug(f"Checking file extension: {file_ext}")

        # If no file types are configured, process all files (default behavior)
        if not self.config.file_types:
            self.logger.debug(
                "No file types configured, processing all readable files"
            )
            return True

        for pattern in self.config.file_types:
            self.logger.debug(f"Checking file type pattern: {pattern}")
            # A pattern is either a bare extension (".txt") or a glob
            # ("*.txt"); in both cases only the extension is compared.
            if pattern.startswith("."):
                pattern_ext = pattern.lower()
            else:
                pattern_ext = os.path.splitext(pattern)[1].lower()

            if pattern_ext and file_ext == pattern_ext:
                self.logger.debug(
                    f"File {rel_path} matches file type pattern {pattern}"
                )
                return True

        # The file doesn't match a configured type: accept it anyway when
        # conversion is enabled and the detector says it can be converted.
        if (
            self.config.enable_file_conversion
            and self.file_detector
            and self.file_detector.is_supported_for_conversion(file_path)
        ):
            self.logger.debug(f"File {rel_path} supported for conversion")
            return True
        return False

    def _is_included(self, rel_path: str) -> bool:
        """Return True if ``rel_path`` matches at least one include pattern."""
        rel_dir = os.path.dirname(rel_path)
        for raw_pattern in self.config.include_paths:
            pattern = raw_pattern.lstrip("/")
            if pattern == "":
                # "/" (or "") selects files directly in the base directory.
                if rel_dir == "":
                    return True
                continue
            if pattern == "**/*":
                # "/**/*" selects everything. fnmatch alone would reject
                # root-level files because the pattern requires a "/".
                return True
            if pattern.endswith("/**/*"):
                if self._in_directory(rel_dir, pattern[:-5]):
                    return True
            elif pattern.endswith("/"):
                if self._in_directory(rel_dir, pattern[:-1]):
                    return True
            elif fnmatch.fnmatch(rel_path, pattern):
                return True
        return False