Fix GitHub clone to use the persistent gitclone folder

This commit is contained in:
Suherdy Yacob 2025-08-22 21:16:21 +07:00
parent 8fc65ddae8
commit 1f1c183884
3 changed files with 32 additions and 26 deletions

1
gitclone/enterprise Submodule

@ -0,0 +1 @@
Subproject commit 38276c9a237a1779ce7a3e6f31102ff615691690

1
gitclone/odoo Submodule

@ -0,0 +1 @@
Subproject commit 8678e1c777b1faec23d4e7fd311c8ec96041f5b7

View File

@ -133,8 +133,11 @@ class DatasetProcessor:
Returns: Returns:
List of code samples with metadata List of code samples with metadata
""" """
temp_dir = tempfile.mkdtemp() # Create a persistent directory for cloned repositories
self.temp_dirs.append(temp_dir) gitclone_dir = Path("./gitclone")
gitclone_dir.mkdir(exist_ok=True)
temp_dir = str(gitclone_dir)
# Note: We don't add this to temp_dirs since we want to keep it
depth = 1 depth = 1
branch = "18.0" branch = "18.0"
@ -143,31 +146,31 @@ class DatasetProcessor:
# Clone repository # Clone repository
repo_name = repo_url.split('/')[-1].replace('.git', '') repo_name = repo_url.split('/')[-1].replace('.git', '')
repo_path = os.path.join(temp_dir, repo_name) repo_path = os.path.join(temp_dir, repo_name)
if not os.path.exists(repo_path):
self.logger.info(f"Cloning {repo_url} to {repo_path}")
self.logger.info(f"Cloning {repo_url} to {repo_path}") # Use token for private repositories if provided
clone_url = repo_url
# Use token for private repositories if provided if github_token and "github.com" in repo_url:
clone_url = repo_url # Handle SSH URLs
if github_token and "github.com" in repo_url: if repo_url.startswith("git@"):
# Handle SSH URLs # SSH URL doesn't need token modification
if repo_url.startswith("git@"): pass
# SSH URL doesn't need token modification
pass
else:
# Add token to HTTPS URL
if repo_url.startswith("https://"):
clone_url = repo_url.replace("https://", f"https://{github_token}@")
elif repo_url.startswith("http://"):
clone_url = repo_url.replace("http://", f"http://{github_token}@")
else: else:
# For URLs like "github.com/user/repo" or "user/repo" # Add token to HTTPS URL
if repo_url.startswith("github.com/"): if repo_url.startswith("https://"):
clone_url = f"https://{github_token}@{repo_url}" clone_url = repo_url.replace("https://", f"https://{github_token}@")
elif repo_url.startswith("http://"):
clone_url = repo_url.replace("http://", f"http://{github_token}@")
else: else:
# Assume it's a GitHub path like "user/repo" # For URLs like "github.com/user/repo" or "user/repo"
clone_url = f"https://{github_token}@github.com/{repo_url}" if repo_url.startswith("github.com/"):
clone_url = f"https://{github_token}@{repo_url}"
repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch) else:
# Assume it's a GitHub path like "user/repo"
clone_url = f"https://{github_token}@github.com/{repo_url}"
repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch)
# Extract code samples # Extract code samples
code_samples = self._extract_code_samples(repo_path, config) code_samples = self._extract_code_samples(repo_path, config)
@ -175,8 +178,9 @@ class DatasetProcessor:
return code_samples return code_samples
finally: finally:
# Cleanup # Cleanup temporary directories, but keep gitclone folder
shutil.rmtree(temp_dir, ignore_errors=True) if temp_dir != "./gitclone":
shutil.rmtree(temp_dir, ignore_errors=True)
def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]: def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
""" """