From c7a84c520c12df1065c0e3d2ac398634ccd35774 Mon Sep 17 00:00:00 2001 From: Suherdy Yacob Date: Fri, 22 Aug 2025 19:33:17 +0700 Subject: [PATCH] fix some bugs --- .gitignore | 141 +++++++++++++++++++++++++++++++++++++++ README.md | 31 +++++++++ requirements.txt | 8 +-- src/dataset_processor.py | 41 ++++++++++-- src/main.py | 19 +++++- 5 files changed, 226 insertions(+), 14 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2d5066f --- /dev/null +++ b/.gitignore @@ -0,0 +1,141 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +/models +/unsloth_compiled_cache + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +build/ +temp/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +Pipfile.lock + +# poetry +poetry.lock + +# PEP 582; used by pythonloc +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# VS Code settings +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index 84c3f81..c559d96 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,27 @@ A Python application for training various unsloth models using data from GitHub - Git - Dependencies listed in `requirements.txt` +## Private Repository Support + +The application now supports processing private GitHub repositories by using a GitHub token for authentication. +To use this feature: + +1. Generate a GitHub personal access token with appropriate permissions +2. Pass the token using the `--github_token` command line argument +3. 
Use private repository URLs in the same format as public repositories + +Supported URL formats for private repositories: +- `https://github.com/user/private-repo.git` +- `github.com/user/private-repo` +- `user/private-repo` + ## Installation 1. Clone this repository +2. If you have a CUDA GPU, install PyTorch: + ```bash + pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu129 + ``` 2. Install dependencies: ```bash pip install -r requirements.txt @@ -56,6 +74,12 @@ python src/main.py \ python run_training.py \ --repo1 https://github.com/user/repo1 \ --repo2 https://github.com/user/repo2 + +# Using private repositories with a GitHub token +python run_training.py \ + --repo1 https://github.com/user/private-repo1 \ + --repo2 https://github.com/user/private-repo2 \ + --github_token YOUR_GITHUB_TOKEN ``` ### Training Qwen3-8B @@ -72,6 +96,12 @@ python src/main.py \ python run_training_qwen3.py \ --repo1 https://github.com/user/repo1 \ --repo2 https://github.com/user/repo2 + +# Using private repositories with a GitHub token +python run_training_qwen3.py \ + --repo1 https://github.com/user/private-repo1 \ + --repo2 https://github.com/user/private-repo2 \ + --github_token YOUR_GITHUB_TOKEN ``` ### Command Line Arguments @@ -81,6 +111,7 @@ python run_training_qwen3.py \ - `--config`: Path to training configuration file (default: configs/training_config.yaml) - `--output_dir`: Directory to save trained model (default: ./models) - `--log_level`: Logging level (DEBUG, INFO, WARNING, ERROR) +- `--github_token`: GitHub token for accessing private repositories (optional) ## Project Structure diff --git a/requirements.txt b/requirements.txt index dd72d9e..f46d69f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ # Core ML libraries -torch>=2.1.0 -torchvision>=0.16.0 -torchaudio>=2.1.0 +# torch>=2.1.0 +# torchvision>=0.16.0 +# torchaudio>=2.1.0 # Unsloth for efficient model training -unsloth[cu121]>=2024.5 +unsloth[cu129]>=2024.5 
unsloth_zoo>=2024.5 # Transformers and tokenizers diff --git a/src/dataset_processor.py b/src/dataset_processor.py index 8f0f776..c5d3695 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -16,7 +16,7 @@ import git from datasets import Dataset from tqdm import tqdm -from config import TrainingConfig +from config import AppConfig class DatasetProcessor: @@ -81,13 +81,14 @@ class DatasetProcessor: self.logger = logging.getLogger(__name__) self.temp_dirs = [] - def process_github_repos(self, repo_urls: List[str], config: TrainingConfig) -> Dataset: + def process_github_repos(self, repo_urls: List[str], config: AppConfig, github_token: Optional[str] = None) -> Dataset: """ Process multiple GitHub repositories into a training dataset Args: repo_urls: List of GitHub repository URLs config: Training configuration + github_token: Optional GitHub token for accessing private repositories Returns: Dataset ready for training @@ -97,7 +98,7 @@ class DatasetProcessor: for repo_url in repo_urls: try: self.logger.info(f"Processing repository: {repo_url}") - repo_samples = self._process_single_repo(repo_url, config) + repo_samples = self._process_single_repo(repo_url, config, github_token) all_code_samples.extend(repo_samples) self.logger.info(f"Extracted {len(repo_samples)} samples from {repo_url}") except Exception as e: @@ -120,13 +121,14 @@ class DatasetProcessor: self.logger.info(f"Dataset size after filtering: {len(dataset)}") return dataset - def _process_single_repo(self, repo_url: str, config: TrainingConfig) -> List[Dict]: + def _process_single_repo(self, repo_url: str, config: AppConfig, github_token: Optional[str] = None) -> List[Dict]: """ Process a single GitHub repository Args: repo_url: GitHub repository URL config: Training configuration + github_token: Optional GitHub token for accessing private repositories Returns: List of code samples with metadata @@ -134,13 +136,38 @@ class DatasetProcessor: temp_dir = tempfile.mkdtemp() 
self.temp_dirs.append(temp_dir) + depth = 1 + branch = "18.0" + try: # Clone repository repo_name = repo_url.split('/')[-1].replace('.git', '') repo_path = os.path.join(temp_dir, repo_name) self.logger.info(f"Cloning {repo_url} to {repo_path}") - repo = git.Repo.clone_from(repo_url, repo_path) + + # Use token for private repositories if provided + clone_url = repo_url + if github_token and "github.com" in repo_url: + # Handle SSH URLs + if repo_url.startswith("git@"): + # SSH URL doesn't need token modification + pass + else: + # Add token to HTTPS URL + if repo_url.startswith("https://"): + clone_url = repo_url.replace("https://", f"https://{github_token}@") + elif repo_url.startswith("http://"): + clone_url = repo_url.replace("http://", f"http://{github_token}@") + else: + # For URLs like "github.com/user/repo" or "user/repo" + if repo_url.startswith("github.com/"): + clone_url = f"https://{github_token}@{repo_url}" + else: + # Assume it's a GitHub path like "user/repo" + clone_url = f"https://{github_token}@github.com/{repo_url}" + + repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch) # Extract code samples code_samples = self._extract_code_samples(repo_path, config) @@ -151,7 +178,7 @@ class DatasetProcessor: # Cleanup shutil.rmtree(temp_dir, ignore_errors=True) - def _extract_code_samples(self, repo_path: str, config: TrainingConfig) -> List[Dict]: + def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]: """ Extract code samples from a repository @@ -194,7 +221,7 @@ class DatasetProcessor: return True return False - def _process_code_file(self, file_path: Path, repo_path: Path, config: TrainingConfig) -> Optional[Dict]: + def _process_code_file(self, file_path: Path, repo_path: Path, config: AppConfig) -> Optional[Dict]: """ Process a single code file into a training sample diff --git a/src/main.py b/src/main.py index 2e45019..239a415 100644 --- a/src/main.py +++ b/src/main.py @@ -15,7 +15,7 @@ 
sys.path.append(str(Path(__file__).parent)) from trainer import ModelTrainer from dataset_processor import DatasetProcessor -from config import TrainingConfig +from config import AppConfig from utils import setup_logging, check_gpu_memory @@ -59,6 +59,13 @@ def parse_arguments(): help="Logging level" ) + parser.add_argument( + "--github_token", + type=str, + default=None, + help="GitHub token for accessing private repositories" + ) + return parser.parse_args() @@ -80,7 +87,12 @@ def main(): logger.info(f"GPU Memory Info: {gpu_info}") # Load configuration - config = TrainingConfig.from_yaml(args.config) + logger.debug(f"Attempting to load config from: {args.config}") + logger.debug(f"AppConfig methods: {[m for m in dir(AppConfig) if not m.startswith('_')]}") + + # Load configuration using AppConfig + config = AppConfig.from_yaml(args.config) + logger.info("Configuration loaded successfully") # Process datasets from GitHub repositories @@ -89,7 +101,8 @@ def main(): train_dataset = dataset_processor.process_github_repos( repo_urls=[args.repo1, args.repo2], - config=config + config=config, + github_token=args.github_token ) logger.info(f"Dataset processed successfully. Size: {len(train_dataset)}")