Coverage for src / qdrant_loader / connectors / git / operations.py: 98%
141 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:40 +0000
1"""Git operations wrapper."""
3import os
4import shutil
5import time
6from datetime import datetime
8import git
9from git.exc import GitCommandError
11from qdrant_loader.utils.logging import LoggingConfig
13logger = LoggingConfig.get_logger(__name__)
16class GitOperations:
17 """Git operations wrapper."""
19 def __init__(self):
20 """Initialize Git operations."""
21 self.repo = None
22 self.logger = LoggingConfig.get_logger(__name__)
23 self.logger.info("Initializing GitOperations")
25 @staticmethod
26 def _to_git_path(path: str) -> str:
27 """Normalize a filesystem path to git pathspec format.
29 Git commands expect POSIX-style separators, even on Windows.
30 """
31 return path.replace("\\", "/")
33 def clone(
34 self,
35 url: str,
36 to_path: str,
37 branch: str,
38 depth: int,
39 max_retries: int = 3,
40 retry_delay: int = 2,
41 auth_token: str | None = None,
42 ) -> None:
43 """Clone a Git repository.
45 Args:
46 url (str): Repository URL or local path
47 to_path (str): Local path to clone to
48 branch (str): Branch to clone
49 depth (int): Clone depth (use 0 for full history)
50 max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
51 retry_delay (int, optional): Delay between retries in seconds. Defaults to 2.
52 auth_token (Optional[str], optional): Authentication token. Defaults to None.
53 """
54 # Resolve the URL to an absolute path if it's a local path
55 if os.path.exists(url):
56 url = os.path.abspath(url)
57 self.logger.info("Using local repository", url=url)
59 # Ensure the source is a valid Git repository
60 if not os.path.exists(os.path.join(url, ".git")):
61 self.logger.error("Invalid Git repository", path=url)
62 raise ValueError(f"Path {url} is not a valid Git repository")
64 # Copy the repository
65 shutil.copytree(url, to_path, dirs_exist_ok=True)
66 self.repo = git.Repo(to_path)
67 return
69 for attempt in range(max_retries):
70 try:
71 clone_args = ["--branch", branch]
72 if depth > 0:
73 clone_args.extend(["--depth", str(depth)])
75 # Store original value and disable credential prompts
76 original_prompt = os.environ.get("GIT_TERMINAL_PROMPT")
77 os.environ["GIT_TERMINAL_PROMPT"] = "0"
78 self.logger.info(
79 f"Cloning repository : {url} | branch: {branch} | depth: {depth}",
80 )
81 try:
82 # If auth token is provided, modify the URL to include it
83 clone_url = url
84 if auth_token and url.startswith("https://"):
85 # Insert token into URL: https://token@github.com/...
86 clone_url = url.replace("https://", f"https://{auth_token}@")
87 self.logger.debug("Using authenticated URL", url=clone_url)
89 self.logger.debug(
90 "Attempting to clone repository",
91 attempt=attempt + 1,
92 max_attempts=max_retries,
93 url=clone_url,
94 branch=branch,
95 depth=depth,
96 to_path=to_path,
97 )
99 # Verify target directory is empty
100 if os.path.exists(to_path) and os.listdir(to_path):
101 self.logger.warning(
102 "Target directory is not empty",
103 to_path=to_path,
104 contents=os.listdir(to_path),
105 )
106 shutil.rmtree(to_path)
107 os.makedirs(to_path)
109 self.repo = git.Repo.clone_from(
110 clone_url, to_path, multi_options=clone_args
111 )
113 # Verify repository was cloned successfully
114 if not self.repo or not os.path.exists(
115 os.path.join(to_path, ".git")
116 ):
117 raise RuntimeError("Repository was not cloned successfully")
119 self.logger.info("Successfully cloned repository")
120 finally:
121 # Restore original value
122 if original_prompt is not None:
123 os.environ["GIT_TERMINAL_PROMPT"] = original_prompt
124 else:
125 del os.environ["GIT_TERMINAL_PROMPT"]
126 return
127 except GitCommandError as e:
128 self.logger.error(
129 "Git clone attempt failed",
130 attempt=attempt + 1,
131 max_attempts=max_retries,
132 error=str(e),
133 error_type=type(e).__name__,
134 stderr=getattr(e, "stderr", None),
135 )
136 if attempt < max_retries - 1:
137 self.logger.warning(
138 f"Retrying in {retry_delay} seconds...",
139 next_attempt=attempt + 2,
140 )
141 time.sleep(retry_delay)
142 else:
143 self.logger.error("All clone attempts failed", error=str(e))
144 raise
146 def get_file_content(self, file_path: str) -> str:
147 """Get file content.
149 Args:
150 file_path (str): Path to the file
152 Returns:
153 str: File content
155 Raises:
156 ValueError: If repository is not initialized
157 FileNotFoundError: If file does not exist in the repository
158 Exception: For other errors
159 """
160 try:
161 if not self.repo:
162 raise ValueError("Repository not initialized")
164 # Get the relative path from the repository root
165 rel_path = os.path.relpath(file_path, self.repo.working_dir)
166 rel_path = self._to_git_path(rel_path)
168 # Check if file exists in the repository
169 try:
170 # First try to get the file content using git show
171 content = self.repo.git.show(f"HEAD:{rel_path}")
172 return content
173 except GitCommandError as e:
174 if "exists on disk, but not in" in str(e):
175 # File exists on disk but not in the repository
176 raise FileNotFoundError(
177 f"File {rel_path} exists on disk but not in the repository"
178 ) from e
179 elif "does not exist" in str(e):
180 # File does not exist in the repository
181 raise FileNotFoundError(
182 f"File {rel_path} does not exist in the repository"
183 ) from e
184 else:
185 # Other git command errors
186 raise
187 except Exception as e:
188 self.logger.error(f"Failed to read file {file_path}: {e}")
189 raise
191 def get_last_commit_date(self, file_path: str) -> datetime | None:
192 """Get the last commit date for a file.
194 Args:
195 file_path: Path to the file
197 Returns:
198 Last commit date or None if not found
199 """
200 try:
201 if not self.repo:
202 raise ValueError("Repository not initialized")
204 # Get the relative path from the repository root
205 rel_path = os.path.relpath(file_path, self.repo.working_dir)
206 rel_path = self._to_git_path(rel_path)
207 self.logger.debug("Getting last commit date", file_path=rel_path)
209 # Get the last commit for the file
210 try:
211 commits = list(self.repo.iter_commits(paths=rel_path, max_count=1))
212 if commits:
213 last_commit = commits[0]
214 self.logger.debug(
215 "Found last commit",
216 file_path=rel_path,
217 commit_date=last_commit.committed_datetime,
218 commit_hash=last_commit.hexsha,
219 )
220 return last_commit.committed_datetime
221 self.logger.debug("No commits found for file", file_path=rel_path)
222 return None
223 except GitCommandError as e:
224 self.logger.warning(
225 "Failed to get commits for file",
226 file_path=rel_path,
227 error=str(e),
228 error_type=type(e).__name__,
229 )
230 return None
231 except BrokenPipeError as e:
232 self.logger.warning(
233 "Git process terminated unexpectedly",
234 file_path=rel_path,
235 error=str(e),
236 error_type=type(e).__name__,
237 )
238 return None
239 except Exception as e:
240 self.logger.warning(
241 "Unexpected error getting commits",
242 file_path=rel_path,
243 error=str(e),
244 error_type=type(e).__name__,
245 )
246 return None
248 except Exception as e:
249 self.logger.error(
250 "Failed to get last commit date",
251 file_path=file_path,
252 error=str(e),
253 error_type=type(e).__name__,
254 )
255 return None
257 def get_first_commit_date(self, file_path: str) -> datetime | None:
258 """Get the creation date for a file.
260 Args:
261 file_path: Path to the file
263 Returns:
264 Creation date or None if not found
265 """
266 try:
267 if not self.repo:
268 raise ValueError("Repository not initialized")
270 # Get the relative path from the repository root
271 rel_path = os.path.relpath(file_path, self.repo.working_dir)
272 rel_path = self._to_git_path(rel_path)
273 self.logger.debug("Getting creation date", file_path=rel_path)
275 # Get the first commit for the file
276 try:
277 # Use git log with --reverse to get commits in chronological order
278 commits = list(
279 self.repo.iter_commits(paths=rel_path, reverse=True, max_count=1)
280 )
281 if commits:
282 first_commit = commits[0]
283 self.logger.debug(
284 "Found first commit",
285 file_path=rel_path,
286 commit_date=first_commit.committed_datetime,
287 commit_hash=first_commit.hexsha,
288 )
289 return first_commit.committed_datetime
290 self.logger.debug("No commits found for file", file_path=rel_path)
291 return None
292 except GitCommandError as e:
293 self.logger.warning(
294 "Failed to get commits for file",
295 file_path=rel_path,
296 error=str(e),
297 error_type=type(e).__name__,
298 )
299 return None
300 except BrokenPipeError as e:
301 self.logger.warning(
302 "Git process terminated unexpectedly",
303 file_path=rel_path,
304 error=str(e),
305 error_type=type(e).__name__,
306 )
307 return None
308 except Exception as e:
309 self.logger.warning(
310 "Unexpected error getting commits",
311 file_path=rel_path,
312 error=str(e),
313 error_type=type(e).__name__,
314 )
315 return None
317 except Exception as e:
318 self.logger.error(
319 "Failed to get creation date",
320 file_path=file_path,
321 error=str(e),
322 error_type=type(e).__name__,
323 )
324 return None
326 def list_files(self) -> list[str]:
327 """List all files in the repository.
329 Returns:
330 List of file paths
331 """
332 try:
333 if not self.repo:
334 raise ValueError("Repository not initialized")
336 # Use git ls-tree to list all files
337 output = self.repo.git.ls_tree("-r", "--name-only", "HEAD")
338 files = output.splitlines() if output else []
340 # Convert relative paths to absolute paths
341 return [os.path.join(self.repo.working_dir, f) for f in files]
342 except Exception as e:
343 self.logger.error("Failed to list files", error=str(e))
344 raise