fix some bugs

2025-08-22 19:33:17 +07:00 · 2025-08-22 19:33:17 +07:00 · c7a84c520c
commit c7a84c520c
parent c73b0d247a
5 changed files with 226 additions and 14 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,141 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+/models
+/unsloth_compiled_cache
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+build/
+temp/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+Pipfile.lock
+
+# poetry
+poetry.lock
+
+# PEP 582; used by pythonloc
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# VS Code settings
+.vscode/
--- a/README.md
+++ b/README.md
@ -32,9 +32,27 @@ A Python application for training various unsloth models using data from GitHub
 - Git
 - Dependencies listed in `requirements.txt`

+## Private Repository Support
+
+The application now supports processing private GitHub repositories by using a GitHub token for authentication.
+To use this feature:
+
+1. Generate a GitHub personal access token with appropriate permissions
+2. Pass the token using the `--github_token` command line argument
+3. Use private repository URLs in the same format as public repositories
+
+Supported URL formats for private repositories:
+- `https://github.com/user/private-repo.git`
+- `github.com/user/private-repo`
+- `user/private-repo`
+
 ## Installation

 1. Clone this repository
+2. if have CUDA GPU install PyTorch: 
+   ```bash
+   pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu129
+   ```
 2. Install dependencies:
   ```bash
   pip install -r requirements.txt
@ -56,6 +74,12 @@ python src/main.py \
 python run_training.py \
    --repo1 https://github.com/user/repo1 \
    --repo2 https://github.com/user/repo2
+
+# Using private repositories with a GitHub token
+python run_training.py \
+    --repo1 https://github.com/user/private-repo1 \
+    --repo2 https://github.com/user/private-repo2 \
+    --github_token YOUR_GITHUB_TOKEN
 ```

 ### Training Qwen3-8B
@ -72,6 +96,12 @@ python src/main.py \
 python run_training_qwen3.py \
    --repo1 https://github.com/user/repo1 \
    --repo2 https://github.com/user/repo2
+
+# Using private repositories with a GitHub token
+python run_training_qwen3.py \
+    --repo1 https://github.com/user/private-repo1 \
+    --repo2 https://github.com/user/private-repo2 \
+    --github_token YOUR_GITHUB_TOKEN
 ```

 ### Command Line Arguments
@ -81,6 +111,7 @@ python run_training_qwen3.py \
 - `--config`: Path to training configuration file (default: configs/training_config.yaml)
 - `--output_dir`: Directory to save trained model (default: ./models)
 - `--log_level`: Logging level (DEBUG, INFO, WARNING, ERROR)
+- `--github_token`: GitHub token for accessing private repositories (optional)

 ## Project Structure

--- a/requirements.txt
+++ b/requirements.txt
@ -1,10 +1,10 @@
 # Core ML libraries
-torch>=2.1.0
-torchvision>=0.16.0
-torchaudio>=2.1.0
+# torch>=2.1.0
+# torchvision>=0.16.0
+# torchaudio>=2.1.0

 # Unsloth for efficient model training
-unsloth[cu121]>=2024.5
+unsloth[cu129]>=2024.5
 unsloth_zoo>=2024.5

 # Transformers and tokenizers
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@ -16,7 +16,7 @@ import git
 from datasets import Dataset
 from tqdm import tqdm

-from config import TrainingConfig
+from config import AppConfig


 class DatasetProcessor:
@ -81,13 +81,14 @@ class DatasetProcessor:
        self.logger = logging.getLogger(__name__)
        self.temp_dirs = []

-    def process_github_repos(self, repo_urls: List[str], config: TrainingConfig) -> Dataset:
+    def process_github_repos(self, repo_urls: List[str], config: AppConfig, github_token: Optional[str] = None) -> Dataset:
        """
        Process multiple GitHub repositories into a training dataset

        Args:
            repo_urls: List of GitHub repository URLs
            config: Training configuration
+            github_token: Optional GitHub token for accessing private repositories

        Returns:
            Dataset ready for training
@ -97,7 +98,7 @@ class DatasetProcessor:
        for repo_url in repo_urls:
            try:
                self.logger.info(f"Processing repository: {repo_url}")
-                repo_samples = self._process_single_repo(repo_url, config)
+                repo_samples = self._process_single_repo(repo_url, config, github_token)
                all_code_samples.extend(repo_samples)
                self.logger.info(f"Extracted {len(repo_samples)} samples from {repo_url}")
            except Exception as e:
@ -120,13 +121,14 @@ class DatasetProcessor:
        self.logger.info(f"Dataset size after filtering: {len(dataset)}")
        return dataset

-    def _process_single_repo(self, repo_url: str, config: TrainingConfig) -> List[Dict]:
+    def _process_single_repo(self, repo_url: str, config: AppConfig, github_token: Optional[str] = None) -> List[Dict]:
        """
        Process a single GitHub repository

        Args:
            repo_url: GitHub repository URL
            config: Training configuration
+            github_token: Optional GitHub token for accessing private repositories

        Returns:
            List of code samples with metadata
@ -134,13 +136,38 @@ class DatasetProcessor:
        temp_dir = tempfile.mkdtemp()
        self.temp_dirs.append(temp_dir)

+        depth = 1
+        branch = "18.0"
+
        try:
            # Clone repository
            repo_name = repo_url.split('/')[-1].replace('.git', '')
            repo_path = os.path.join(temp_dir, repo_name)

            self.logger.info(f"Cloning {repo_url} to {repo_path}")
-            repo = git.Repo.clone_from(repo_url, repo_path)
+            
+            # Use token for private repositories if provided
+            clone_url = repo_url
+            if github_token and "github.com" in repo_url:
+                # Handle SSH URLs
+                if repo_url.startswith("git@"):
+                    # SSH URL doesn't need token modification
+                    pass
+                else:
+                    # Add token to HTTPS URL
+                    if repo_url.startswith("https://"):
+                        clone_url = repo_url.replace("https://", f"https://{github_token}@")
+                    elif repo_url.startswith("http://"):
+                        clone_url = repo_url.replace("http://", f"http://{github_token}@")
+                    else:
+                        # For URLs like "github.com/user/repo" or "user/repo"
+                        if repo_url.startswith("github.com/"):
+                            clone_url = f"https://{github_token}@{repo_url}"
+                        else:
+                            # Assume it's a GitHub path like "user/repo"
+                            clone_url = f"https://{github_token}@github.com/{repo_url}"
+            
+            repo = git.Repo.clone_from(clone_url, repo_path, depth=depth, branch=branch)

            # Extract code samples
            code_samples = self._extract_code_samples(repo_path, config)
@ -151,7 +178,7 @@ class DatasetProcessor:
            # Cleanup
            shutil.rmtree(temp_dir, ignore_errors=True)

-    def _extract_code_samples(self, repo_path: str, config: TrainingConfig) -> List[Dict]:
+    def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
        """
        Extract code samples from a repository

@ -194,7 +221,7 @@ class DatasetProcessor:
                return True
        return False

-    def _process_code_file(self, file_path: Path, repo_path: Path, config: TrainingConfig) -> Optional[Dict]:
+    def _process_code_file(self, file_path: Path, repo_path: Path, config: AppConfig) -> Optional[Dict]:
        """
        Process a single code file into a training sample

--- a/src/main.py
+++ b/src/main.py
@ -15,7 +15,7 @@ sys.path.append(str(Path(__file__).parent))

 from trainer import ModelTrainer
 from dataset_processor import DatasetProcessor
-from config import TrainingConfig
+from config import AppConfig
 from utils import setup_logging, check_gpu_memory


@ -59,6 +59,13 @@ def parse_arguments():
        help="Logging level"
    )

+    parser.add_argument(
+        "--github_token",
+        type=str,
+        default=None,
+        help="GitHub token for accessing private repositories"
+    )
+
    return parser.parse_args()


@ -80,7 +87,12 @@ def main():
        logger.info(f"GPU Memory Info: {gpu_info}")

        # Load configuration
-        config = TrainingConfig.from_yaml(args.config)
+        logger.debug(f"Attempting to load config from: {args.config}")
+        logger.debug(f"AppConfig methods: {[m for m in dir(AppConfig) if not m.startswith('_')]}")
+
+        # Load configuration using AppConfig
+        config = AppConfig.from_yaml(args.config)
+
        logger.info("Configuration loaded successfully")

        # Process datasets from GitHub repositories
@ -89,7 +101,8 @@ def main():

        train_dataset = dataset_processor.process_github_repos(
            repo_urls=[args.repo1, args.repo2],
-            config=config
+            config=config,
+            github_token=args.github_token
        )

        logger.info(f"Dataset processed successfully. Size: {len(train_dataset)}")