diff --git a/gitclone/enterprise b/gitclone/enterprise new file mode 160000 index 0000000..38276c9 --- /dev/null +++ b/gitclone/enterprise @@ -0,0 +1 @@ +Subproject commit 38276c9a237a1779ce7a3e6f31102ff615691690 diff --git a/gitclone/odoo b/gitclone/odoo new file mode 160000 index 0000000..8678e1c --- /dev/null +++ b/gitclone/odoo @@ -0,0 +1 @@ +Subproject commit 8678e1c777b1faec23d4e7fd311c8ec96041f5b7 diff --git a/src/dataset_processor.py b/src/dataset_processor.py index c5d3695..38b866c 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -133,8 +133,11 @@ class DatasetProcessor: Returns: List of code samples with metadata """ - temp_dir = tempfile.mkdtemp() - self.temp_dirs.append(temp_dir) + # Create a persistent directory for cloned repositories + gitclone_dir = Path("./gitclone") + gitclone_dir.mkdir(exist_ok=True) + temp_dir = str(gitclone_dir) + # Note: We don't add this to temp_dirs since we want to keep it depth = 1 branch = "18.0" @@ -143,31 +146,31 @@ class DatasetProcessor: # Clone repository repo_name = repo_url.split('/')[-1].replace('.git', '') repo_path = os.path.join(temp_dir, repo_name) + if not os.path.exists(repo_path): + self.logger.info(f"Cloning {repo_url} to {repo_path}") - self.logger.info(f"Cloning {repo_url} to {repo_path}") - - # Use token for private repositories if provided - clone_url = repo_url - if github_token and "github.com" in repo_url: - # Handle SSH URLs - if repo_url.startswith("git@"): - # SSH URL doesn't need token modification - pass - else: - # Add token to HTTPS URL - if repo_url.startswith("https://"): - clone_url = repo_url.replace("https://", f"https://{github_token}@") - elif repo_url.startswith("http://"): - clone_url = repo_url.replace("http://", f"http://{github_token}@") + # Use token for private repositories if provided + clone_url = repo_url + if github_token and "github.com" in repo_url: + # Handle SSH URLs + if repo_url.startswith("git@"): + # SSH URL doesn't need token modification + pass else: - # For URLs like "github.com/user/repo" or "user/repo" - if repo_url.startswith("github.com/"): - clone_url = f"https://{github_token}@{repo_url}" + # Add token to HTTPS URL + if repo_url.startswith("https://"): + clone_url = repo_url.replace("https://", f"https://{github_token}@") + elif repo_url.startswith("http://"): + clone_url = repo_url.replace("http://", f"http://{github_token}@") else: - # Assume it's a GitHub path like "user/repo" - clone_url = f"https://{github_token}@github.com/{repo_url}" - - repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch) + # For URLs like "github.com/user/repo" or "user/repo" + if repo_url.startswith("github.com/"): + clone_url = f"https://{github_token}@{repo_url}" + else: + # Assume it's a GitHub path like "user/repo" + clone_url = f"https://{github_token}@github.com/{repo_url}" + + repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch) # Extract code samples code_samples = self._extract_code_samples(repo_path, config) @@ -175,8 +178,9 @@ class DatasetProcessor: return code_samples finally: - # Cleanup - shutil.rmtree(temp_dir, ignore_errors=True) + # Cleanup temporary directories, but keep gitclone folder + if temp_dir != "./gitclone": + shutil.rmtree(temp_dir, ignore_errors=True) def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]: """