Coverage for src/qdrant_loader/connectors/git/operations.py: 88%
135 statements
coverage.py v7.8.2, created at 2025-06-04 05:50 +0000
1"""Git operations wrapper."""
3import os
4import shutil
5import time
6from datetime import datetime
8import git
9from git.exc import GitCommandError
11from qdrant_loader.utils.logging import LoggingConfig
# Module-level logger for this module; GitOperations.__init__ requests its own
# logger through the same LoggingConfig.get_logger(__name__) call.
logger = LoggingConfig.get_logger(__name__)
16class GitOperations:
17 """Git operations wrapper."""
19 def __init__(self):
20 """Initialize Git operations."""
21 self.repo = None
22 self.logger = LoggingConfig.get_logger(__name__)
23 self.logger.info("Initializing GitOperations")
25 def clone(
26 self,
27 url: str,
28 to_path: str,
29 branch: str,
30 depth: int,
31 max_retries: int = 3,
32 retry_delay: int = 2,
33 auth_token: str | None = None,
34 ) -> None:
35 """Clone a Git repository.
37 Args:
38 url (str): Repository URL or local path
39 to_path (str): Local path to clone to
40 branch (str): Branch to clone
41 depth (int): Clone depth (use 0 for full history)
42 max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
43 retry_delay (int, optional): Delay between retries in seconds. Defaults to 2.
44 auth_token (Optional[str], optional): Authentication token. Defaults to None.
45 """
46 # Resolve the URL to an absolute path if it's a local path
47 if os.path.exists(url):
48 url = os.path.abspath(url)
49 self.logger.info("Using local repository", url=url)
51 # Ensure the source is a valid Git repository
52 if not os.path.exists(os.path.join(url, ".git")):
53 self.logger.error("Invalid Git repository", path=url)
54 raise ValueError(f"Path {url} is not a valid Git repository")
56 # Copy the repository
57 shutil.copytree(url, to_path, dirs_exist_ok=True)
58 self.repo = git.Repo(to_path)
59 return
61 for attempt in range(max_retries):
62 try:
63 clone_args = ["--branch", branch]
64 if depth > 0:
65 clone_args.extend(["--depth", str(depth)])
67 # Store original value and disable credential prompts
68 original_prompt = os.environ.get("GIT_TERMINAL_PROMPT")
69 os.environ["GIT_TERMINAL_PROMPT"] = "0"
70 self.logger.info(
71 f"Cloning repository : {url} | branch: {branch} | depth: {depth}",
72 )
73 try:
74 # If auth token is provided, modify the URL to include it
75 clone_url = url
76 if auth_token and url.startswith("https://"):
77 # Insert token into URL: https://token@github.com/...
78 clone_url = url.replace("https://", f"https://{auth_token}@")
79 self.logger.debug("Using authenticated URL", url=clone_url)
81 self.logger.debug(
82 "Attempting to clone repository",
83 attempt=attempt + 1,
84 max_attempts=max_retries,
85 url=clone_url,
86 branch=branch,
87 depth=depth,
88 to_path=to_path,
89 )
91 # Verify target directory is empty
92 if os.path.exists(to_path) and os.listdir(to_path):
93 self.logger.warning(
94 "Target directory is not empty",
95 to_path=to_path,
96 contents=os.listdir(to_path),
97 )
98 shutil.rmtree(to_path)
99 os.makedirs(to_path)
101 self.repo = git.Repo.clone_from(
102 clone_url, to_path, multi_options=clone_args
103 )
105 # Verify repository was cloned successfully
106 if not self.repo or not os.path.exists(
107 os.path.join(to_path, ".git")
108 ):
109 raise RuntimeError("Repository was not cloned successfully")
111 self.logger.info("Successfully cloned repository")
112 finally:
113 # Restore original value
114 if original_prompt is not None:
115 os.environ["GIT_TERMINAL_PROMPT"] = original_prompt
116 else:
117 del os.environ["GIT_TERMINAL_PROMPT"]
118 return
119 except GitCommandError as e:
120 self.logger.error(
121 "Git clone attempt failed",
122 attempt=attempt + 1,
123 max_attempts=max_retries,
124 error=str(e),
125 error_type=type(e).__name__,
126 stderr=getattr(e, "stderr", None),
127 )
128 if attempt < max_retries - 1:
129 self.logger.warning(
130 f"Retrying in {retry_delay} seconds...",
131 next_attempt=attempt + 2,
132 )
133 time.sleep(retry_delay)
134 else:
135 self.logger.error("All clone attempts failed", error=str(e))
136 raise
138 def get_file_content(self, file_path: str) -> str:
139 """Get file content.
141 Args:
142 file_path (str): Path to the file
144 Returns:
145 str: File content
147 Raises:
148 ValueError: If repository is not initialized
149 FileNotFoundError: If file does not exist in the repository
150 Exception: For other errors
151 """
152 try:
153 if not self.repo:
154 raise ValueError("Repository not initialized")
156 # Get the relative path from the repository root
157 rel_path = os.path.relpath(file_path, self.repo.working_dir)
159 # Check if file exists in the repository
160 try:
161 # First try to get the file content using git show
162 content = self.repo.git.show(f"HEAD:{rel_path}")
163 return content
164 except GitCommandError as e:
165 if "exists on disk, but not in" in str(e):
166 # File exists on disk but not in the repository
167 raise FileNotFoundError(
168 f"File {rel_path} exists on disk but not in the repository"
169 ) from e
170 elif "does not exist" in str(e):
171 # File does not exist in the repository
172 raise FileNotFoundError(
173 f"File {rel_path} does not exist in the repository"
174 ) from e
175 else:
176 # Other git command errors
177 raise
178 except Exception as e:
179 self.logger.error(f"Failed to read file {file_path}: {e}")
180 raise
182 def get_last_commit_date(self, file_path: str) -> datetime | None:
183 """Get the last commit date for a file.
185 Args:
186 file_path: Path to the file
188 Returns:
189 Last commit date or None if not found
190 """
191 try:
192 if not self.repo:
193 raise ValueError("Repository not initialized")
195 # Get the relative path from the repository root
196 rel_path = os.path.relpath(file_path, self.repo.working_dir)
197 self.logger.debug("Getting last commit date", file_path=rel_path)
199 # Get the last commit for the file
200 try:
201 commits = list(self.repo.iter_commits(paths=rel_path, max_count=1))
202 if commits:
203 last_commit = commits[0]
204 self.logger.debug(
205 "Found last commit",
206 file_path=rel_path,
207 commit_date=last_commit.committed_datetime,
208 commit_hash=last_commit.hexsha,
209 )
210 return last_commit.committed_datetime
211 self.logger.debug("No commits found for file", file_path=rel_path)
212 return None
213 except GitCommandError as e:
214 self.logger.warning(
215 "Failed to get commits for file",
216 file_path=rel_path,
217 error=str(e),
218 error_type=type(e).__name__,
219 )
220 return None
221 except BrokenPipeError as e:
222 self.logger.warning(
223 "Git process terminated unexpectedly",
224 file_path=rel_path,
225 error=str(e),
226 error_type=type(e).__name__,
227 )
228 return None
229 except Exception as e:
230 self.logger.warning(
231 "Unexpected error getting commits",
232 file_path=rel_path,
233 error=str(e),
234 error_type=type(e).__name__,
235 )
236 return None
238 except Exception as e:
239 self.logger.error(
240 "Failed to get last commit date",
241 file_path=file_path,
242 error=str(e),
243 error_type=type(e).__name__,
244 )
245 return None
247 def get_first_commit_date(self, file_path: str) -> datetime | None:
248 """Get the creation date for a file.
250 Args:
251 file_path: Path to the file
253 Returns:
254 Creation date or None if not found
255 """
256 try:
257 if not self.repo:
258 raise ValueError("Repository not initialized")
260 # Get the relative path from the repository root
261 rel_path = os.path.relpath(file_path, self.repo.working_dir)
262 self.logger.debug("Getting creation date", file_path=rel_path)
264 # Get the first commit for the file
265 try:
266 # Use git log with --reverse to get commits in chronological order
267 commits = list(
268 self.repo.iter_commits(paths=rel_path, reverse=True, max_count=1)
269 )
270 if commits:
271 first_commit = commits[0]
272 self.logger.debug(
273 "Found first commit",
274 file_path=rel_path,
275 commit_date=first_commit.committed_datetime,
276 commit_hash=first_commit.hexsha,
277 )
278 return first_commit.committed_datetime
279 self.logger.debug("No commits found for file", file_path=rel_path)
280 return None
281 except GitCommandError as e:
282 self.logger.warning(
283 "Failed to get commits for file",
284 file_path=rel_path,
285 error=str(e),
286 error_type=type(e).__name__,
287 )
288 return None
289 except BrokenPipeError as e:
290 self.logger.warning(
291 "Git process terminated unexpectedly",
292 file_path=rel_path,
293 error=str(e),
294 error_type=type(e).__name__,
295 )
296 return None
297 except Exception as e:
298 self.logger.warning(
299 "Unexpected error getting commits",
300 file_path=rel_path,
301 error=str(e),
302 error_type=type(e).__name__,
303 )
304 return None
306 except Exception as e:
307 self.logger.error(
308 "Failed to get creation date",
309 file_path=file_path,
310 error=str(e),
311 error_type=type(e).__name__,
312 )
313 return None
315 def list_files(self) -> list[str]:
316 """List all files in the repository.
318 Returns:
319 List of file paths
320 """
321 try:
322 if not self.repo:
323 raise ValueError("Repository not initialized")
325 # Use git ls-tree to list all files
326 output = self.repo.git.ls_tree("-r", "--name-only", "HEAD")
327 files = output.splitlines() if output else []
329 # Convert relative paths to absolute paths
330 return [os.path.join(self.repo.working_dir, f) for f in files]
331 except Exception as e:
332 self.logger.error("Failed to list files", error=str(e))
333 raise