Coverage for src/qdrant_loader/connectors/git/operations.py: 88%

135 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1"""Git operations wrapper.""" 

2 

3import os 

4import shutil 

5import time 

6from datetime import datetime 

7 

8import git 

9from git.exc import GitCommandError 

10 

11from qdrant_loader.utils.logging import LoggingConfig 

12 

13logger = LoggingConfig.get_logger(__name__) 

14 

15 

16class GitOperations: 

17 """Git operations wrapper.""" 

18 

19 def __init__(self): 

20 """Initialize Git operations.""" 

21 self.repo = None 

22 self.logger = LoggingConfig.get_logger(__name__) 

23 self.logger.info("Initializing GitOperations") 

24 

25 def clone( 

26 self, 

27 url: str, 

28 to_path: str, 

29 branch: str, 

30 depth: int, 

31 max_retries: int = 3, 

32 retry_delay: int = 2, 

33 auth_token: str | None = None, 

34 ) -> None: 

35 """Clone a Git repository. 

36 

37 Args: 

38 url (str): Repository URL or local path 

39 to_path (str): Local path to clone to 

40 branch (str): Branch to clone 

41 depth (int): Clone depth (use 0 for full history) 

42 max_retries (int, optional): Maximum number of retry attempts. Defaults to 3. 

43 retry_delay (int, optional): Delay between retries in seconds. Defaults to 2. 

44 auth_token (Optional[str], optional): Authentication token. Defaults to None. 

45 """ 

46 # Resolve the URL to an absolute path if it's a local path 

47 if os.path.exists(url): 

48 url = os.path.abspath(url) 

49 self.logger.info("Using local repository", url=url) 

50 

51 # Ensure the source is a valid Git repository 

52 if not os.path.exists(os.path.join(url, ".git")): 

53 self.logger.error("Invalid Git repository", path=url) 

54 raise ValueError(f"Path {url} is not a valid Git repository") 

55 

56 # Copy the repository 

57 shutil.copytree(url, to_path, dirs_exist_ok=True) 

58 self.repo = git.Repo(to_path) 

59 return 

60 

61 for attempt in range(max_retries): 

62 try: 

63 clone_args = ["--branch", branch] 

64 if depth > 0: 

65 clone_args.extend(["--depth", str(depth)]) 

66 

67 # Store original value and disable credential prompts 

68 original_prompt = os.environ.get("GIT_TERMINAL_PROMPT") 

69 os.environ["GIT_TERMINAL_PROMPT"] = "0" 

70 self.logger.info( 

71 f"Cloning repository : {url} | branch: {branch} | depth: {depth}", 

72 ) 

73 try: 

74 # If auth token is provided, modify the URL to include it 

75 clone_url = url 

76 if auth_token and url.startswith("https://"): 

77 # Insert token into URL: https://token@github.com/... 

78 clone_url = url.replace("https://", f"https://{auth_token}@") 

79 self.logger.debug("Using authenticated URL", url=clone_url) 

80 

81 self.logger.debug( 

82 "Attempting to clone repository", 

83 attempt=attempt + 1, 

84 max_attempts=max_retries, 

85 url=clone_url, 

86 branch=branch, 

87 depth=depth, 

88 to_path=to_path, 

89 ) 

90 

91 # Verify target directory is empty 

92 if os.path.exists(to_path) and os.listdir(to_path): 

93 self.logger.warning( 

94 "Target directory is not empty", 

95 to_path=to_path, 

96 contents=os.listdir(to_path), 

97 ) 

98 shutil.rmtree(to_path) 

99 os.makedirs(to_path) 

100 

101 self.repo = git.Repo.clone_from( 

102 clone_url, to_path, multi_options=clone_args 

103 ) 

104 

105 # Verify repository was cloned successfully 

106 if not self.repo or not os.path.exists( 

107 os.path.join(to_path, ".git") 

108 ): 

109 raise RuntimeError("Repository was not cloned successfully") 

110 

111 self.logger.info("Successfully cloned repository") 

112 finally: 

113 # Restore original value 

114 if original_prompt is not None: 

115 os.environ["GIT_TERMINAL_PROMPT"] = original_prompt 

116 else: 

117 del os.environ["GIT_TERMINAL_PROMPT"] 

118 return 

119 except GitCommandError as e: 

120 self.logger.error( 

121 "Git clone attempt failed", 

122 attempt=attempt + 1, 

123 max_attempts=max_retries, 

124 error=str(e), 

125 error_type=type(e).__name__, 

126 stderr=getattr(e, "stderr", None), 

127 ) 

128 if attempt < max_retries - 1: 

129 self.logger.warning( 

130 f"Retrying in {retry_delay} seconds...", 

131 next_attempt=attempt + 2, 

132 ) 

133 time.sleep(retry_delay) 

134 else: 

135 self.logger.error("All clone attempts failed", error=str(e)) 

136 raise 

137 

138 def get_file_content(self, file_path: str) -> str: 

139 """Get file content. 

140 

141 Args: 

142 file_path (str): Path to the file 

143 

144 Returns: 

145 str: File content 

146 

147 Raises: 

148 ValueError: If repository is not initialized 

149 FileNotFoundError: If file does not exist in the repository 

150 Exception: For other errors 

151 """ 

152 try: 

153 if not self.repo: 

154 raise ValueError("Repository not initialized") 

155 

156 # Get the relative path from the repository root 

157 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

158 

159 # Check if file exists in the repository 

160 try: 

161 # First try to get the file content using git show 

162 content = self.repo.git.show(f"HEAD:{rel_path}") 

163 return content 

164 except GitCommandError as e: 

165 if "exists on disk, but not in" in str(e): 

166 # File exists on disk but not in the repository 

167 raise FileNotFoundError( 

168 f"File {rel_path} exists on disk but not in the repository" 

169 ) from e 

170 elif "does not exist" in str(e): 

171 # File does not exist in the repository 

172 raise FileNotFoundError( 

173 f"File {rel_path} does not exist in the repository" 

174 ) from e 

175 else: 

176 # Other git command errors 

177 raise 

178 except Exception as e: 

179 self.logger.error(f"Failed to read file {file_path}: {e}") 

180 raise 

181 

182 def get_last_commit_date(self, file_path: str) -> datetime | None: 

183 """Get the last commit date for a file. 

184 

185 Args: 

186 file_path: Path to the file 

187 

188 Returns: 

189 Last commit date or None if not found 

190 """ 

191 try: 

192 if not self.repo: 

193 raise ValueError("Repository not initialized") 

194 

195 # Get the relative path from the repository root 

196 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

197 self.logger.debug("Getting last commit date", file_path=rel_path) 

198 

199 # Get the last commit for the file 

200 try: 

201 commits = list(self.repo.iter_commits(paths=rel_path, max_count=1)) 

202 if commits: 

203 last_commit = commits[0] 

204 self.logger.debug( 

205 "Found last commit", 

206 file_path=rel_path, 

207 commit_date=last_commit.committed_datetime, 

208 commit_hash=last_commit.hexsha, 

209 ) 

210 return last_commit.committed_datetime 

211 self.logger.debug("No commits found for file", file_path=rel_path) 

212 return None 

213 except GitCommandError as e: 

214 self.logger.warning( 

215 "Failed to get commits for file", 

216 file_path=rel_path, 

217 error=str(e), 

218 error_type=type(e).__name__, 

219 ) 

220 return None 

221 except BrokenPipeError as e: 

222 self.logger.warning( 

223 "Git process terminated unexpectedly", 

224 file_path=rel_path, 

225 error=str(e), 

226 error_type=type(e).__name__, 

227 ) 

228 return None 

229 except Exception as e: 

230 self.logger.warning( 

231 "Unexpected error getting commits", 

232 file_path=rel_path, 

233 error=str(e), 

234 error_type=type(e).__name__, 

235 ) 

236 return None 

237 

238 except Exception as e: 

239 self.logger.error( 

240 "Failed to get last commit date", 

241 file_path=file_path, 

242 error=str(e), 

243 error_type=type(e).__name__, 

244 ) 

245 return None 

246 

247 def get_first_commit_date(self, file_path: str) -> datetime | None: 

248 """Get the creation date for a file. 

249 

250 Args: 

251 file_path: Path to the file 

252 

253 Returns: 

254 Creation date or None if not found 

255 """ 

256 try: 

257 if not self.repo: 

258 raise ValueError("Repository not initialized") 

259 

260 # Get the relative path from the repository root 

261 rel_path = os.path.relpath(file_path, self.repo.working_dir) 

262 self.logger.debug("Getting creation date", file_path=rel_path) 

263 

264 # Get the first commit for the file 

265 try: 

266 # Use git log with --reverse to get commits in chronological order 

267 commits = list( 

268 self.repo.iter_commits(paths=rel_path, reverse=True, max_count=1) 

269 ) 

270 if commits: 

271 first_commit = commits[0] 

272 self.logger.debug( 

273 "Found first commit", 

274 file_path=rel_path, 

275 commit_date=first_commit.committed_datetime, 

276 commit_hash=first_commit.hexsha, 

277 ) 

278 return first_commit.committed_datetime 

279 self.logger.debug("No commits found for file", file_path=rel_path) 

280 return None 

281 except GitCommandError as e: 

282 self.logger.warning( 

283 "Failed to get commits for file", 

284 file_path=rel_path, 

285 error=str(e), 

286 error_type=type(e).__name__, 

287 ) 

288 return None 

289 except BrokenPipeError as e: 

290 self.logger.warning( 

291 "Git process terminated unexpectedly", 

292 file_path=rel_path, 

293 error=str(e), 

294 error_type=type(e).__name__, 

295 ) 

296 return None 

297 except Exception as e: 

298 self.logger.warning( 

299 "Unexpected error getting commits", 

300 file_path=rel_path, 

301 error=str(e), 

302 error_type=type(e).__name__, 

303 ) 

304 return None 

305 

306 except Exception as e: 

307 self.logger.error( 

308 "Failed to get creation date", 

309 file_path=file_path, 

310 error=str(e), 

311 error_type=type(e).__name__, 

312 ) 

313 return None 

314 

315 def list_files(self) -> list[str]: 

316 """List all files in the repository. 

317 

318 Returns: 

319 List of file paths 

320 """ 

321 try: 

322 if not self.repo: 

323 raise ValueError("Repository not initialized") 

324 

325 # Use git ls-tree to list all files 

326 output = self.repo.git.ls_tree("-r", "--name-only", "HEAD") 

327 files = output.splitlines() if output else [] 

328 

329 # Convert relative paths to absolute paths 

330 return [os.path.join(self.repo.working_dir, f) for f in files] 

331 except Exception as e: 

332 self.logger.error("Failed to list files", error=str(e)) 

333 raise