Coverage for src / qdrant_loader / connectors / git / operations.py: 98%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:40 +0000

1"""Git operations wrapper.""" 

2 

3import os 

4import shutil 

5import time 

6from datetime import datetime 

7 

8import git 

9from git.exc import GitCommandError 

10 

11from qdrant_loader.utils.logging import LoggingConfig 

12 

13logger = LoggingConfig.get_logger(__name__) 

14 

15 

16class GitOperations: 

17 """Git operations wrapper.""" 

18 

19 def __init__(self): 

20 """Initialize Git operations.""" 

21 self.repo = None 

22 self.logger = LoggingConfig.get_logger(__name__) 

23 self.logger.info("Initializing GitOperations") 

24 

25 @staticmethod 

26 def _to_git_path(path: str) -> str: 

27 """Normalize a filesystem path to git pathspec format. 

28 

29 Git commands expect POSIX-style separators, even on Windows. 

30 """ 

31 return path.replace("\\", "/") 

32 

33 def clone( 

34 self, 

35 url: str, 

36 to_path: str, 

37 branch: str, 

38 depth: int, 

39 max_retries: int = 3, 

40 retry_delay: int = 2, 

41 auth_token: str | None = None, 

42 ) -> None: 

43 """Clone a Git repository. 

44 

45 Args: 

46 url (str): Repository URL or local path 

47 to_path (str): Local path to clone to 

48 branch (str): Branch to clone 

49 depth (int): Clone depth (use 0 for full history) 

50 max_retries (int, optional): Maximum number of retry attempts. Defaults to 3. 

51 retry_delay (int, optional): Delay between retries in seconds. Defaults to 2. 

52 auth_token (Optional[str], optional): Authentication token. Defaults to None. 

53 """ 

54 # Resolve the URL to an absolute path if it's a local path 

55 if os.path.exists(url): 

56 url = os.path.abspath(url) 

57 self.logger.info("Using local repository", url=url) 

58 

59 # Ensure the source is a valid Git repository 

60 if not os.path.exists(os.path.join(url, ".git")): 

61 self.logger.error("Invalid Git repository", path=url) 

62 raise ValueError(f"Path {url} is not a valid Git repository") 

63 

64 # Copy the repository 

65 shutil.copytree(url, to_path, dirs_exist_ok=True) 

66 self.repo = git.Repo(to_path) 

67 return 

68 

69 for attempt in range(max_retries): 

70 try: 

71 clone_args = ["--branch", branch] 

72 if depth > 0: 

73 clone_args.extend(["--depth", str(depth)]) 

74 

75 # Store original value and disable credential prompts 

76 original_prompt = os.environ.get("GIT_TERMINAL_PROMPT") 

77 os.environ["GIT_TERMINAL_PROMPT"] = "0" 

78 self.logger.info( 

79 f"Cloning repository : {url} | branch: {branch} | depth: {depth}", 

80 ) 

81 try: 

82 # If auth token is provided, modify the URL to include it 

83 clone_url = url 

84 if auth_token and url.startswith("https://"): 

85 # Insert token into URL: https://token@github.com/... 

86 clone_url = url.replace("https://", f"https://{auth_token}@") 

87 self.logger.debug("Using authenticated URL", url=clone_url) 

88 

89 self.logger.debug( 

90 "Attempting to clone repository", 

91 attempt=attempt + 1, 

92 max_attempts=max_retries, 

93 url=clone_url, 

94 branch=branch, 

95 depth=depth, 

96 to_path=to_path, 

97 ) 

98 

99 # Verify target directory is empty 

100 if os.path.exists(to_path) and os.listdir(to_path): 

101 self.logger.warning( 

102 "Target directory is not empty", 

103 to_path=to_path, 

104 contents=os.listdir(to_path), 

105 ) 

106 shutil.rmtree(to_path) 

107 os.makedirs(to_path) 

108 

109 self.repo = git.Repo.clone_from( 

110 clone_url, to_path, multi_options=clone_args 

111 ) 

112 

113 # Verify repository was cloned successfully 

114 if not self.repo or not os.path.exists( 

115 os.path.join(to_path, ".git") 

116 ): 

117 raise RuntimeError("Repository was not cloned successfully") 

118 

119 self.logger.info("Successfully cloned repository") 

120 finally: 

121 # Restore original value 

122 if original_prompt is not None: 

123 os.environ["GIT_TERMINAL_PROMPT"] = original_prompt 

124 else: 

125 del os.environ["GIT_TERMINAL_PROMPT"] 

126 return 

127 except GitCommandError as e: 

128 self.logger.error( 

129 "Git clone attempt failed", 

130 attempt=attempt + 1, 

131 max_attempts=max_retries, 

132 error=str(e), 

133 error_type=type(e).__name__, 

134 stderr=getattr(e, "stderr", None), 

135 ) 

136 if attempt < max_retries - 1: 

137 self.logger.warning( 

138 f"Retrying in {retry_delay} seconds...", 

139 next_attempt=attempt + 2, 

140 ) 

141 time.sleep(retry_delay) 

142 else: 

143 self.logger.error("All clone attempts failed", error=str(e)) 

144 raise 

145 

146 def get_file_content(self, file_path: str) -> str: 

147 """Get file content. 

148 

149 Args: 

150 file_path (str): Path to the file 

151 

152 Returns: 

153 str: File content 

154 

155 Raises: 

156 ValueError: If repository is not initialized 

157 FileNotFoundError: If file does not exist in the repository 

158 Exception: For other errors 

159 """ 

160 try: 

161 if not self.repo: 

162 raise ValueError("Repository not initialized") 

163 

164 # Get the relative path from the repository root 

165 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

166 rel_path = self._to_git_path(rel_path) 

167 

168 # Check if file exists in the repository 

169 try: 

170 # First try to get the file content using git show 

171 content = self.repo.git.show(f"HEAD:{rel_path}") 

172 return content 

173 except GitCommandError as e: 

174 if "exists on disk, but not in" in str(e): 

175 # File exists on disk but not in the repository 

176 raise FileNotFoundError( 

177 f"File {rel_path} exists on disk but not in the repository" 

178 ) from e 

179 elif "does not exist" in str(e): 

180 # File does not exist in the repository 

181 raise FileNotFoundError( 

182 f"File {rel_path} does not exist in the repository" 

183 ) from e 

184 else: 

185 # Other git command errors 

186 raise 

187 except Exception as e: 

188 self.logger.error(f"Failed to read file {file_path}: {e}") 

189 raise 

190 

191 def get_last_commit_date(self, file_path: str) -> datetime | None: 

192 """Get the last commit date for a file. 

193 

194 Args: 

195 file_path: Path to the file 

196 

197 Returns: 

198 Last commit date or None if not found 

199 """ 

200 try: 

201 if not self.repo: 

202 raise ValueError("Repository not initialized") 

203 

204 # Get the relative path from the repository root 

205 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

206 rel_path = self._to_git_path(rel_path) 

207 self.logger.debug("Getting last commit date", file_path=rel_path) 

208 

209 # Get the last commit for the file 

210 try: 

211 commits = list(self.repo.iter_commits(paths=rel_path, max_count=1)) 

212 if commits: 

213 last_commit = commits[0] 

214 self.logger.debug( 

215 "Found last commit", 

216 file_path=rel_path, 

217 commit_date=last_commit.committed_datetime, 

218 commit_hash=last_commit.hexsha, 

219 ) 

220 return last_commit.committed_datetime 

221 self.logger.debug("No commits found for file", file_path=rel_path) 

222 return None 

223 except GitCommandError as e: 

224 self.logger.warning( 

225 "Failed to get commits for file", 

226 file_path=rel_path, 

227 error=str(e), 

228 error_type=type(e).__name__, 

229 ) 

230 return None 

231 except BrokenPipeError as e: 

232 self.logger.warning( 

233 "Git process terminated unexpectedly", 

234 file_path=rel_path, 

235 error=str(e), 

236 error_type=type(e).__name__, 

237 ) 

238 return None 

239 except Exception as e: 

240 self.logger.warning( 

241 "Unexpected error getting commits", 

242 file_path=rel_path, 

243 error=str(e), 

244 error_type=type(e).__name__, 

245 ) 

246 return None 

247 

248 except Exception as e: 

249 self.logger.error( 

250 "Failed to get last commit date", 

251 file_path=file_path, 

252 error=str(e), 

253 error_type=type(e).__name__, 

254 ) 

255 return None 

256 

257 def get_first_commit_date(self, file_path: str) -> datetime | None: 

258 """Get the creation date for a file. 

259 

260 Args: 

261 file_path: Path to the file 

262 

263 Returns: 

264 Creation date or None if not found 

265 """ 

266 try: 

267 if not self.repo: 

268 raise ValueError("Repository not initialized") 

269 

270 # Get the relative path from the repository root 

271 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

272 rel_path = self._to_git_path(rel_path) 

273 self.logger.debug("Getting creation date", file_path=rel_path) 

274 

275 # Get the first commit for the file 

276 try: 

277 # Use git log with --reverse to get commits in chronological order 

278 commits = list( 

279 self.repo.iter_commits(paths=rel_path, reverse=True, max_count=1) 

280 ) 

281 if commits: 

282 first_commit = commits[0] 

283 self.logger.debug( 

284 "Found first commit", 

285 file_path=rel_path, 

286 commit_date=first_commit.committed_datetime, 

287 commit_hash=first_commit.hexsha, 

288 ) 

289 return first_commit.committed_datetime 

290 self.logger.debug("No commits found for file", file_path=rel_path) 

291 return None 

292 except GitCommandError as e: 

293 self.logger.warning( 

294 "Failed to get commits for file", 

295 file_path=rel_path, 

296 error=str(e), 

297 error_type=type(e).__name__, 

298 ) 

299 return None 

300 except BrokenPipeError as e: 

301 self.logger.warning( 

302 "Git process terminated unexpectedly", 

303 file_path=rel_path, 

304 error=str(e), 

305 error_type=type(e).__name__, 

306 ) 

307 return None 

308 except Exception as e: 

309 self.logger.warning( 

310 "Unexpected error getting commits", 

311 file_path=rel_path, 

312 error=str(e), 

313 error_type=type(e).__name__, 

314 ) 

315 return None 

316 

317 except Exception as e: 

318 self.logger.error( 

319 "Failed to get creation date", 

320 file_path=file_path, 

321 error=str(e), 

322 error_type=type(e).__name__, 

323 ) 

324 return None 

325 

326 def list_files(self) -> list[str]: 

327 """List all files in the repository. 

328 

329 Returns: 

330 List of file paths 

331 """ 

332 try: 

333 if not self.repo: 

334 raise ValueError("Repository not initialized") 

335 

336 # Use git ls-tree to list all files 

337 output = self.repo.git.ls_tree("-r", "--name-only", "HEAD") 

338 files = output.splitlines() if output else [] 

339 

340 # Convert relative paths to absolute paths 

341 return [os.path.join(self.repo.working_dir, f) for f in files] 

342 except Exception as e: 

343 self.logger.error("Failed to list files", error=str(e)) 

344 raise