From c73b0d247a1fd5150d366553e3050789dfc4edac Mon Sep 17 00:00:00 2001 From: suherdy yacob Date: Fri, 22 Aug 2025 16:33:30 +0700 Subject: [PATCH] first commit --- README.md | 183 +++++++++++++++++ compare_configs.py | 99 +++++++++ configs/training_config.yaml | 132 ++++++++++++ configs/training_config_qwen3.yaml | 132 ++++++++++++ requirements.txt | 55 +++++ run_training.py | 22 ++ run_training_qwen3.py | 31 +++ src/__init__.py | 6 + src/config.py | 227 ++++++++++++++++++++ src/dataset_processor.py | 250 ++++++++++++++++++++++ src/main.py | 110 ++++++++++ src/trainer.py | 284 +++++++++++++++++++++++++ src/utils.py | 319 +++++++++++++++++++++++++++++ 13 files changed, 1850 insertions(+) create mode 100644 README.md create mode 100644 compare_configs.py create mode 100644 configs/training_config.yaml create mode 100644 configs/training_config_qwen3.yaml create mode 100644 requirements.txt create mode 100644 run_training.py create mode 100644 run_training_qwen3.py create mode 100644 src/__init__.py create mode 100644 src/config.py create mode 100644 src/dataset_processor.py create mode 100644 src/main.py create mode 100644 src/trainer.py create mode 100644 src/utils.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..84c3f81 --- /dev/null +++ b/README.md @@ -0,0 +1,183 @@ +# AI Trainer + +A Python application for training various unsloth models using data from GitHub repositories. Supports both Qwen2.5-Coder and Qwen3 models optimized for RTX3070 8GB VRAM. + +## Supported Models + +### 1. Qwen2.5-Coder-7B-Instruct (Default) +- **Model**: `unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit` +- **Best for**: Code generation, code completion, programming tasks +- **Memory Usage**: Moderate (~6-7GB VRAM) +- **Config**: `configs/training_config.yaml` + +### 2. Qwen3-8B +- **Model**: `unsloth/Qwen3-8B-bnb-4bit` +- **Best for**: General instruction following, broader language tasks +- **Memory Usage**: Higher (~7-8GB VRAM) +- **Config**: `configs/training_config_qwen3.yaml` + +## Features + +- **Dataset Processing**: Automatically processes code from GitHub repositories +- **Memory Optimized**: Designed for RTX3070 8GB VRAM with no CPU offloading +- **Configurable Training**: YAML-based configuration system +- **Progress Logging**: Comprehensive logging and monitoring +- **Modular Design**: Clean separation of concerns with dataset processing, training, and utilities +- **Multi-Model Support**: Easy switching between different model architectures + +## Requirements + +- Python 3.8+ +- CUDA-compatible GPU (tested with RTX3070 8GB VRAM) +- Git +- Dependencies listed in `requirements.txt` + +## Installation + +1. Clone this repository +2. 
Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Usage + +### Training Qwen2.5-Coder-7B (Default) +```bash +# Using the main script +python src/main.py \ + --repo1 https://github.com/user/repo1 \ + --repo2 https://github.com/user/repo2 \ + --config configs/training_config.yaml \ + --output_dir ./models \ + --log_level INFO + +# Or using the runner script +python run_training.py \ + --repo1 https://github.com/user/repo1 \ + --repo2 https://github.com/user/repo2 +``` + +### Training Qwen3-8B +```bash +# Using the main script with Qwen3 config +python src/main.py \ + --repo1 https://github.com/user/repo1 \ + --repo2 https://github.com/user/repo2 \ + --config configs/training_config_qwen3.yaml \ + --output_dir ./models \ + --log_level INFO + +# Or using the dedicated Qwen3 runner +python run_training_qwen3.py \ + --repo1 https://github.com/user/repo1 \ + --repo2 https://github.com/user/repo2 +``` + +### Command Line Arguments + +- `--repo1`: First GitHub repository URL (required) +- `--repo2`: Second GitHub repository URL (required) +- `--config`: Path to training configuration file (default: configs/training_config.yaml) +- `--output_dir`: Directory to save trained model (default: ./models) +- `--log_level`: Logging level (DEBUG, INFO, WARNING, ERROR) + +## Project Structure + +``` +ai_trainer/ +├── src/ +│ ├── __init__.py +│ ├── main.py # Main entry point +│ ├── trainer.py # Model training logic +│ ├── dataset_processor.py # GitHub repository processing +│ ├── config.py # Configuration management +│ └── utils.py # Utility functions +├── configs/ +│ ├── training_config.yaml # Qwen2.5-Coder training configuration +│ └── training_config_qwen3.yaml # Qwen3-8B training configuration +├── data/ +│ └── processed/ # Processed datasets +├── models/ # Trained models +├── logs/ # Training logs +├── compare_configs.py # Side-by-side configuration comparison +├── run_training.py # Qwen2.5-Coder runner script +├── run_training_qwen3.py # Qwen3-8B runner script +├── requirements.txt +└── README.md +``` + +## Memory Optimization + +This application is specifically optimized for RTX3070 8GB VRAM: +- Uses 4-bit quantization (bnb-4bit) +- Gradient checkpointing enabled +- No CPU offloading +- Optimized batch sizes for 8GB VRAM +- Memory-efficient data loading + +## Configuration + +### Qwen2.5-Coder-7B Configuration +**File**: `configs/training_config.yaml` + +```yaml +model: + name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit" + max_seq_length: 2048 + +training: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 4 + learning_rate: 2.0e-4 + num_train_epochs: 3 + +memory: + use_gradient_checkpointing: true + offload_to_cpu: false + max_memory_usage: 0.85 +``` + +### Qwen3-8B Configuration +**File**: `configs/training_config_qwen3.yaml` + +```yaml +model: + name: "unsloth/Qwen3-8B-bnb-4bit" + max_seq_length: 2048 + +training: + per_device_train_batch_size: 1 # More conservative + gradient_accumulation_steps: 8 # Higher accumulation + learning_rate: 1.0e-4 # Lower learning rate + num_train_epochs: 3 + +memory: + use_gradient_checkpointing: true + offload_to_cpu: false + max_memory_usage: 0.95 # More aggressive memory usage +``` + +### Key Differences + +| Setting | Qwen2.5-Coder | Qwen3-8B | Reason | +|---------|---------------|----------|---------| +| Batch Size | 2 | 1 | Larger model needs smaller batches | +| Gradient Accumulation | 4 | 8 | Maintains effective batch size | +| Learning Rate | 2e-4 | 1e-4 | Larger model needs more conservative LR | +| Memory Usage | 85% | 95% | Qwen3 can use more VRAM | +| Effective Batch Size | 8 | 8 | Same training dynamics | + +## Model Selection Guide + +### Choose Qwen2.5-Coder-7B when: +- You want to fine-tune specifically for **code generation** tasks +- Working with
**programming languages** and technical content +- Need **code completion** and **code understanding** capabilities +- Prefer **moderate memory usage** (~6-7GB VRAM) + +### Choose Qwen3-8B when: +- You need **general instruction following** capabilities +- Working with **mixed content** (code + natural language) +- Want **broader language understanding** and generation +- Have **sufficient VRAM** (~7-8GB) and prefer newer architecture + +## License + +MIT License \ No newline at end of file diff --git a/compare_configs.py b/compare_configs.py new file mode 100644 index 0000000..856264c --- /dev/null +++ b/compare_configs.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +""" +Compare training configurations for different models +""" + +import yaml +from pathlib import Path +from colorama import init, Fore, Style + +init(autoreset=True) + +def load_config(config_path): + """Load YAML configuration""" + with open(config_path, 'r') as f: + return yaml.safe_load(f) + +def compare_configs(): + """Compare the two training configurations""" + print(f"\n{Fore.CYAN}{'='*80}{Style.RESET_ALL}") + print(f"{Fore.CYAN}AI TRAINER - MODEL CONFIGURATION COMPARISON{Style.RESET_ALL}") + print(f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}") + + # Load configurations + qwen25_config = load_config('configs/training_config.yaml') + qwen3_config = load_config('configs/training_config_qwen3.yaml') + + # Model comparison + print(f"\n{Fore.GREEN}📊 MODEL COMPARISON{Style.RESET_ALL}") + print(f"{'Setting':<25} {'Qwen2.5-Coder-7B':<20} {'Qwen3-8B':<15}") + print(f"{'-'*60}") + + print(f"{'Model Name':<25} {qwen25_config['model']['name']:<20} {qwen3_config['model']['name']:<15}") + print(f"{'Max Seq Length':<25} {qwen25_config['model']['max_seq_length']:<20} {qwen3_config['model']['max_seq_length']:<15}") + + # Training comparison + print(f"\n{Fore.GREEN}⚙️ TRAINING PARAMETERS{Style.RESET_ALL}") + print(f"{'Parameter':<25} {'Qwen2.5-Coder-7B':<20} {'Qwen3-8B':<15} {'Difference':<15}") + print(f"{'-'*75}") + + training_params = [ + ('Batch Size', 'per_device_train_batch_size'), + ('Gradient Accumulation', 'gradient_accumulation_steps'), + ('Learning Rate', 'learning_rate'), + ('Warmup Steps', 'warmup_steps'), + ('Epochs', 'num_train_epochs') + ] + + for param_name, param_key in training_params: + qwen25_val = qwen25_config['training'][param_key] + qwen3_val = qwen3_config['training'][param_key] + diff = "🔻" if qwen3_val < qwen25_val else "🔺" if qwen3_val > qwen25_val else "➡️" + + print(f"{param_name:<25} {qwen25_val:<20} {qwen3_val:<15} {diff}") + + # Memory comparison + print(f"\n{Fore.GREEN}🧠 MEMORY SETTINGS{Style.RESET_ALL}") + print(f"{'Setting':<25} {'Qwen2.5-Coder-7B':<20} {'Qwen3-8B':<15}") + print(f"{'-'*60}") + + memory_params = [ + ('Max Memory Usage', 'max_memory_usage'), + ('Gradient Checkpointing', 'use_gradient_checkpointing'), + ('CPU Offloading', 'offload_to_cpu') + ] + + for param_name, param_key in memory_params: + qwen25_val = qwen25_config['memory'][param_key] + qwen3_val = qwen3_config['memory'][param_key] + print(f"{param_name:<25} {qwen25_val:<20} {qwen3_val:<15}") + + # Usage guide + print(f"\n{Fore.YELLOW}💡 RECOMMENDATION GUIDE{Style.RESET_ALL}") + print(f"{'='*80}") + + print(f"\n{Fore.BLUE}Use Qwen2.5-Coder-7B when:{Style.RESET_ALL}") + print(f" • You want to fine-tune for code generation tasks") + print(f" • Working primarily with programming languages") + print(f" • Need code completion and understanding") + print(f" • Prefer moderate memory usage (~6-7GB VRAM)") + + print(f"\n{Fore.BLUE}Use Qwen3-8B 
when:{Style.RESET_ALL}") + print(f" • You need general instruction following") + print(f" • Working with mixed code and natural language") + print(f" • Want broader language understanding") + print(f" • Have sufficient VRAM (~7-8GB)") + + print(f"\n{Fore.GREEN}🚀 QUICK START COMMANDS{Style.RESET_ALL}") + print(f"{'='*80}") + + print(f"\n{Fore.CYAN}For Qwen2.5-Coder-7B:{Style.RESET_ALL}") + print(f"python run_training.py --repo1 <repo1_url> --repo2 <repo2_url>") + + print(f"\n{Fore.CYAN}For Qwen3-8B:{Style.RESET_ALL}") + print(f"python run_training_qwen3.py --repo1 <repo1_url> --repo2 <repo2_url>") + + print(f"\n{Fore.CYAN}{'='*80}{Style.RESET_ALL}") + +if __name__ == "__main__": + compare_configs() \ No newline at end of file diff --git a/configs/training_config.yaml b/configs/training_config.yaml new file mode 100644 index 0000000..d5302df --- /dev/null +++ b/configs/training_config.yaml @@ -0,0 +1,132 @@ +# Training configuration optimized for RTX3070 8GB VRAM +# AI Trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit + +model: + name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit" + max_seq_length: 2048 + trust_remote_code: true + use_fast_tokenizer: true + padding_side: "left" + truncation_side: "left" + +training: + # Memory-optimized batch size for RTX3070 8GB + per_device_train_batch_size: 2 + gradient_accumulation_steps: 4 + + # Training parameters + num_train_epochs: 3 + learning_rate: 2.0e-4 + warmup_steps: 10 + warmup_ratio: 0.1 + + # Logging and saving + logging_steps: 1 + save_steps: 100 + save_total_limit: 3 + + # Evaluation + evaluation_strategy: "steps" + eval_steps: 100 + load_best_model_at_end: true + metric_for_best_model: "loss" + greater_is_better: false + + # Data loading + dataloader_num_workers: 2 + dataloader_pin_memory: true + remove_unused_columns: false + + # Memory optimization - CRITICAL for RTX3070 8GB + use_gradient_checkpointing: true + offload_to_cpu: false # Explicitly no CPU offloading + + # Optimizer settings + optim: "adamw_torch" + weight_decay: 0.01 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_epsilon: 1.0e-8 + max_grad_norm: 1.0 + + # Learning rate scheduler + lr_scheduler_type: "cosine" + + # Precision - BF16 for better stability on modern GPUs + bf16: true + fp16: false + tf32: true + + # Dataset settings + dataset_shuffle: true + dataset_seed: 42 + + # Output settings + output_dir: "./models" + logging_dir: "./logs" + report_to: ["tensorboard"] + +dataset: + # File filtering + min_file_size: 10 + max_file_size: 10000 + + # Supported programming languages + supported_languages: + - python + - javascript + - typescript + - java + - cpp + - c + - csharp + - php + - ruby + - go + - rust + - swift + - kotlin + - scala + - sql + - bash + - yaml + - json + - xml + - html + - css + - markdown + + # Files and directories to exclude + exclude_patterns: + - "\\.git/" + - "__pycache__/" + - "\\.pytest_cache/" + - "node_modules/" + - "\\.venv/" + - "venv/" + - "package-lock\\.json$" + - "yarn\\.lock$" + - "\\.log$" + - "\\.tmp$" + - "\\.bak$" + - "~\\$.*" + - "\\.swp$" + - "\\.swo$" + - "\\.DS_Store" + - "\\.pyc$" + - "\\.pyo$" + - "\\.pyd$" + - "\\.so$" + - "\\.dll$" + - "\\.exe$" + +memory: + # Memory management for RTX3070 8GB + max_memory_usage: 0.85 # Use up to 85% of GPU memory + enable_memory_tracking: true + clear_cache_between_epochs: true + + # Attention optimization + use_memory_efficient_attention: true + attention_slicing: true + slice_size: 1 \ No newline at end of file
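The two configurations are deliberately tuned so the effective batch size comes out the same. A minimal sketch that checks this invariant from the repository root (assumes only PyYAML, which is already in `requirements.txt`; the script name is made up):

```python
# effective_batch_check.py -- illustrative sketch, not part of this patch.
import yaml

def effective_batch_size(path: str) -> int:
    # Effective batch size = per-device batch size * gradient accumulation steps.
    with open(path, "r") as f:
        training = yaml.safe_load(f)["training"]
    return training["per_device_train_batch_size"] * training["gradient_accumulation_steps"]

assert effective_batch_size("configs/training_config.yaml") == 8        # 2 * 4
assert effective_batch_size("configs/training_config_qwen3.yaml") == 8  # 1 * 8
print("Both configurations train with an effective batch size of 8.")
```

diff --git a/configs/training_config_qwen3.yaml b/configs/training_config_qwen3.yaml new file mode 100644 index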
0000000..dbfab51 --- /dev/null +++ b/configs/training_config_qwen3.yaml @@ -0,0 +1,132 @@ +# Training configuration optimized for RTX3070 8GB VRAM - Qwen3-8B Model +# AI Trainer for unsloth/Qwen3-8B-bnb-4bit + +model: + name: "unsloth/Qwen3-8B-bnb-4bit" + max_seq_length: 2048 + trust_remote_code: true + use_fast_tokenizer: true + padding_side: "left" + truncation_side: "left" + +training: + # Memory-optimized batch size for RTX3070 8GB with Qwen3-8B + per_device_train_batch_size: 1 # More conservative for larger model + gradient_accumulation_steps: 8 # Higher accumulation to maintain effective batch size + + # Training parameters + num_train_epochs: 3 + learning_rate: 1.0e-4 # Slightly lower for larger model + warmup_steps: 15 + warmup_ratio: 0.1 + + # Logging and saving + logging_steps: 1 + save_steps: 100 + save_total_limit: 3 + + # Evaluation + evaluation_strategy: "steps" + eval_steps: 100 + load_best_model_at_end: true + metric_for_best_model: "loss" + greater_is_better: false + + # Data loading + dataloader_num_workers: 2 + dataloader_pin_memory: true + remove_unused_columns: false + + # Memory optimization - CRITICAL for RTX3070 8GB with 8B model + use_gradient_checkpointing: true + offload_to_cpu: false # Explicitly no CPU offloading + + # Optimizer settings + optim: "adamw_torch" + weight_decay: 0.01 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_epsilon: 1.0e-8 + max_grad_norm: 1.0 + + # Learning rate scheduler + lr_scheduler_type: "cosine" + + # Precision - BF16 for better stability on modern GPUs + bf16: true + fp16: false + tf32: true + + # Dataset settings + dataset_shuffle: true + dataset_seed: 42 + + # Output settings + output_dir: "./models" + logging_dir: "./logs" + report_to: ["tensorboard"] + +dataset: + # File filtering + min_file_size: 10 + max_file_size: 10000 + + # Supported programming languages + supported_languages: + - python + - javascript + - typescript + - java + - cpp + - c + - csharp + - php + - ruby + - go + - rust + - swift + - kotlin + - scala + - sql + - bash + - yaml + - json + - xml + - html + - css + - markdown + + # Files and directories to exclude + exclude_patterns: + - "\\.git/" + - "__pycache__/" + - "\\.pytest_cache/" + - "node_modules/" + - "\\.venv/" + - "venv/" + - "package-lock\\.json$" + - "yarn\\.lock$" + - "\\.log$" + - "\\.tmp$" + - "\\.bak$" + - "~\\$.*" + - "\\.swp$" + - "\\.swo$" + - "\\.DS_Store" + - "\\.pyc$" + - "\\.pyo$" + - "\\.pyd$" + - "\\.so$" + - "\\.dll$" + - "\\.exe$" + +memory: + # Memory management for RTX3070 8GB with Qwen3-8B + max_memory_usage: 0.95 # Use up to 95% for more aggressive memory usage + enable_memory_tracking: true + clear_cache_between_epochs: true + + # Attention optimization + use_memory_efficient_attention: true + attention_slicing: true + slice_size: 1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dd72d9e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,55 @@ +# Core ML libraries +torch>=2.1.0 +torchvision>=0.16.0 +torchaudio>=2.1.0 + +# Unsloth for efficient model training +unsloth[cu121]>=2024.5 +unsloth_zoo>=2024.5 + +# Transformers and tokenizers +transformers>=4.38.0 +tokenizers>=0.15.0 +sentencepiece>=0.1.99 + +# Datasets and data processing +datasets>=2.18.0 +pandas>=2.0.0 +numpy>=1.24.0 + +# Git and repository handling +GitPython>=3.1.0 +requests>=2.31.0 + +# Configuration and utilities +PyYAML>=6.0.0 +tqdm>=4.65.0 +colorama>=0.4.6 +python-dotenv>=1.0.0 + +# Memory optimization +bitsandbytes>=0.43.0 +accelerate>=0.27.0 + +# 
Logging and monitoring +tensorboard>=2.14.0 +wandb>=0.16.0 + +# Code processing +tree-sitter>=0.20.0 +tree-sitter-python>=0.20.0 +tree-sitter-javascript>=0.20.0 +tree-sitter-typescript>=0.20.0 +tree-sitter-java>=0.20.0 +tree-sitter-go>=0.20.0 +tree-sitter-rust>=0.20.0 + +# Optional: for model quantization and optimization +optimum>=1.17.0 +auto-gptq>=0.6.0 + +# Development and testing +pytest>=7.4.0 +black>=23.0.0 +isort>=5.12.0 +flake8>=6.0.0 \ No newline at end of file diff --git a/run_training.py b/run_training.py new file mode 100644 index 0000000..c260dde --- /dev/null +++ b/run_training.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +""" +Simple training runner script for AI Trainer +""" + +import os +import sys +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent / "src")) + +from main import main + +if __name__ == "__main__": + # Set environment variables for better CUDA performance + os.environ['CUDA_VISIBLE_DEVICES'] = '0' + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512' + os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + # Run the main training application + main() \ No newline at end of file diff --git a/run_training_qwen3.py b/run_training_qwen3.py new file mode 100644 index 0000000..a3a9e4c --- /dev/null +++ b/run_training_qwen3.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +""" +Training runner script for unsloth/Qwen3-8B-bnb-4bit model +Optimized for RTX3070 8GB VRAM +""" + +import os +import sys +from pathlib import Path + +# Add src to path +sys.path.append(str(Path(__file__).parent / "src")) + +from main import main + +if __name__ == "__main__": + # Set environment variables for better CUDA performance with Qwen3 + os.environ['CUDA_VISIBLE_DEVICES'] = '0' + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512' + os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + # Use Qwen3 configuration by default + if '--config' not in sys.argv: + sys.argv.extend(['--config', 'configs/training_config_qwen3.yaml']) + + print("🚀 Starting training with unsloth/Qwen3-8B-bnb-4bit model") + print("📊 Configuration: configs/training_config_qwen3.yaml") + print("🧠 Memory optimization: RTX3070 8GB mode") + + # Run the main training application + main() \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..701b686 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,6 @@ +""" +AI Trainer - Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model +""" + +__version__ = "1.0.0" +__author__ = "AI Trainer" \ No newline at end of file diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..dddcbdc --- /dev/null +++ b/src/config.py @@ -0,0 +1,227 @@ +""" +Configuration management for AI Trainer +Handles training parameters and model settings +""" + +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union + +import yaml + + +@dataclass +class ModelConfig: + """Model-specific configuration""" + name: str = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit" + max_seq_length: int = 2048 + trust_remote_code: bool = True + use_fast_tokenizer: bool = True + padding_side: str = "left" + truncation_side: str = "left" + + +@dataclass +class TrainingConfig: + """Training configuration""" + per_device_train_batch_size: int = 2 + gradient_accumulation_steps: int = 4 + num_train_epochs: int = 3 + learning_rate: float = 2e-4 + warmup_steps: int = 10 + logging_steps: int = 1 + save_steps: int = 100 + 
save_total_limit: int = 3 + evaluation_strategy: str = "steps" + eval_steps: int = 100 + load_best_model_at_end: bool = True + metric_for_best_model: str = "loss" + greater_is_better: bool = False + dataloader_num_workers: int = 2 + dataloader_pin_memory: bool = True + remove_unused_columns: bool = False + label_names: List[str] = None + + # Memory optimization for RTX3070 8GB + use_gradient_checkpointing: bool = True + offload_to_cpu: bool = False # Explicitly no CPU offloading + use_reentrant: bool = True + gradient_checkpointing_kwargs: Dict = None + + # Optimizer settings + optim: str = "adamw_torch" + weight_decay: float = 0.01 + adam_beta1: float = 0.9 + adam_beta2: float = 0.999 + adam_epsilon: float = 1e-8 + max_grad_norm: float = 1.0 + + # Learning rate scheduler + lr_scheduler_type: str = "cosine" + warmup_ratio: float = 0.1 + + # Precision settings + bf16: bool = True + fp16: bool = False + tf32: bool = True + + # Dataset processing + dataset_shuffle: bool = True + dataset_seed: int = 42 + + # Output settings + output_dir: str = "./models" + logging_dir: str = "./logs" + report_to: List[str] = None + + def __post_init__(self): + if self.label_names is None: + self.label_names = ["labels"] + + if self.gradient_checkpointing_kwargs is None: + self.gradient_checkpointing_kwargs = {"use_reentrant": self.use_reentrant} + + if self.report_to is None: + self.report_to = ["tensorboard"] + + +@dataclass +class DatasetConfig: + """Dataset processing configuration""" + min_file_size: int = 10 + max_file_size: int = 10000 # Characters + supported_languages: List[str] = None + exclude_patterns: List[str] = None + + def __post_init__(self): + if self.supported_languages is None: + self.supported_languages = [ + 'python', 'javascript', 'typescript', 'java', 'cpp', 'c', + 'csharp', 'php', 'ruby', 'go', 'rust', 'swift', 'kotlin', + 'scala', 'sql', 'bash', 'yaml', 'json', 'xml', 'html', 'css' + ] + + if self.exclude_patterns is None: + self.exclude_patterns = [ + r'\.git/', + r'__pycache__/', + r'node_modules/', + r'\.venv/', + r'package-lock\.json$', + r'\.log$' + ] + + +@dataclass +class MemoryConfig: + """Memory optimization settings for RTX3070 8GB""" + max_memory_usage: float = 0.85 # Use up to 85% of GPU memory + enable_memory_tracking: bool = True + clear_cache_between_epochs: bool = True + use_memory_efficient_attention: bool = True + attention_slicing: bool = True + slice_size: int = 1 + + +@dataclass +class AppConfig: + """Main application configuration""" + model: ModelConfig + training: TrainingConfig + dataset: DatasetConfig + memory: MemoryConfig + + @classmethod + def from_yaml(cls, config_path: Union[str, Path]) -> "AppConfig": + """Load configuration from YAML file""" + config_path = Path(config_path) + + if not config_path.exists(): + # Create default configuration + config = cls( + model=ModelConfig(), + training=TrainingConfig(), + dataset=DatasetConfig(), + memory=MemoryConfig() + ) + config.save_yaml(config_path) + return config + + with open(config_path, 'r', encoding='utf-8') as f: + config_dict = yaml.safe_load(f) + + # Parse nested configurations + model_config = ModelConfig(**config_dict.get('model', {})) + training_config = TrainingConfig(**config_dict.get('training', {})) + dataset_config = DatasetConfig(**config_dict.get('dataset', {})) + memory_config = MemoryConfig(**config_dict.get('memory', {})) + + return cls( + model=model_config, + training=training_config, + dataset=dataset_config, + memory=memory_config + ) + + def save_yaml(self, config_path: Union[str, 
Path]): + """Save configuration to YAML file""" + config_path = Path(config_path) + config_path.parent.mkdir(parents=True, exist_ok=True) + + config_dict = { + 'model': { + 'name': self.model.name, + 'max_seq_length': self.model.max_seq_length, + 'trust_remote_code': self.model.trust_remote_code, + 'use_fast_tokenizer': self.model.use_fast_tokenizer, + 'padding_side': self.model.padding_side, + 'truncation_side': self.model.truncation_side + }, + 'training': { + 'per_device_train_batch_size': self.training.per_device_train_batch_size, + 'gradient_accumulation_steps': self.training.gradient_accumulation_steps, + 'num_train_epochs': self.training.num_train_epochs, + 'learning_rate': self.training.learning_rate, + 'warmup_steps': self.training.warmup_steps, + 'logging_steps': self.training.logging_steps, + 'save_steps': self.training.save_steps, + 'save_total_limit': self.training.save_total_limit, + 'evaluation_strategy': self.training.evaluation_strategy, + 'eval_steps': self.training.eval_steps, + 'load_best_model_at_end': self.training.load_best_model_at_end, + 'metric_for_best_model': self.training.metric_for_best_model, + 'greater_is_better': self.training.greater_is_better, + 'dataloader_num_workers': self.training.dataloader_num_workers, + 'dataloader_pin_memory': self.training.dataloader_pin_memory, + 'remove_unused_columns': self.training.remove_unused_columns, + 'use_gradient_checkpointing': self.training.use_gradient_checkpointing, + 'offload_to_cpu': self.training.offload_to_cpu, + 'optim': self.training.optim, + 'weight_decay': self.training.weight_decay, + 'lr_scheduler_type': self.training.lr_scheduler_type, + 'warmup_ratio': self.training.warmup_ratio, + 'bf16': self.training.bf16, + 'fp16': self.training.fp16, + 'tf32': self.training.tf32, + 'dataset_shuffle': self.training.dataset_shuffle, + 'dataset_seed': self.training.dataset_seed + }, + 'dataset': { + 'min_file_size': self.dataset.min_file_size, + 'max_file_size': self.dataset.max_file_size, + 'supported_languages': self.dataset.supported_languages, + 'exclude_patterns': self.dataset.exclude_patterns + }, + 'memory': { + 'max_memory_usage': self.memory.max_memory_usage, + 'enable_memory_tracking': self.memory.enable_memory_tracking, + 'clear_cache_between_epochs': self.memory.clear_cache_between_epochs, + 'use_memory_efficient_attention': self.memory.use_memory_efficient_attention, + 'attention_slicing': self.memory.attention_slicing, + 'slice_size': self.memory.slice_size + } + } + + with open(config_path, 'w', encoding='utf-8') as f: + yaml.dump(config_dict, f, default_flow_style=False, indent=2) \ No newline at end of file
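`AppConfig.from_yaml` writes a default config file when the path does not exist, so a round-trip is easy to sanity-check. A small sketch (assumes it runs from the repository root with `src/` added to the path, as the runner scripts do; the scratch path is arbitrary):

```python
# Illustrative AppConfig round-trip, not part of this patch.
import sys
from pathlib import Path

sys.path.append(str(Path("src").resolve()))
from config import AppConfig

cfg = AppConfig.from_yaml("configs/training_config.yaml")
print(cfg.model.name)               # unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
print(cfg.training.learning_rate)   # 0.0002
print(cfg.memory.max_memory_usage)  # 0.85

cfg.save_yaml("data/processed/config_roundtrip.yaml")  # arbitrary scratch path
print(AppConfig.from_yaml("data/processed/config_roundtrip.yaml").model.max_seq_length)  # 2048
```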
diff --git a/src/dataset_processor.py b/src/dataset_processor.py new file mode 100644 index 0000000..8f0f776 --- /dev/null +++ b/src/dataset_processor.py @@ -0,0 +1,250 @@ +""" +Dataset processor for GitHub repositories +Processes code from GitHub repositories into training datasets +""" + +import json +import logging +import os +import re +import shutil +import tempfile +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import git +from datasets import Dataset +from tqdm import tqdm + +from config import AppConfig + + +class DatasetProcessor: + """Processes GitHub repositories into training datasets""" + + # Supported file extensions for code training + CODE_EXTENSIONS = { + '.py': 'python', + '.js': 'javascript', + '.ts': 'typescript', + '.java': 'java', + '.cpp': 'cpp', + '.c': 'c', + '.h': 'c', + '.hpp': 'cpp', + '.cs': 'csharp', + '.php': 'php', + '.rb': 'ruby', + '.go': 'go', + '.rs': 'rust', + '.swift': 'swift', + '.kt': 'kotlin', + '.scala': 'scala', + '.sql': 'sql', + '.sh': 'bash', + '.yaml': 'yaml', + '.yml': 'yaml', + '.json': 'json', + '.xml': 'xml', + '.html': 'html', + '.css': 'css', + '.md': 'markdown' + } + + # Files and directories to exclude + EXCLUDE_PATTERNS = [ + r'\.git/', + r'__pycache__/', + r'\.pytest_cache/', + r'node_modules/', + r'\.venv/', + r'venv/', + r'\.DS_Store', + r'\.pyc$', + r'\.pyo$', + r'\.pyd$', + r'\.so$', + r'\.dll$', + r'\.exe$', + r'\.bin$', + r'package-lock\.json$', + r'yarn\.lock$', + r'\.log$', + r'\.tmp$', + r'\.bak$', + r'~\$.*', + r'\.swp$', + r'\.swo$' + ] + + def __init__(self): + self.logger = logging.getLogger(__name__) + self.temp_dirs = [] + + def process_github_repos(self, repo_urls: List[str], config: AppConfig) -> Dataset: + """ + Process multiple GitHub repositories into a training dataset + + Args: + repo_urls: List of GitHub repository URLs + config: Application configuration + + Returns: + Dataset ready for training + """ + all_code_samples = [] + + for repo_url in repo_urls: + try: + self.logger.info(f"Processing repository: {repo_url}") + repo_samples = self._process_single_repo(repo_url, config) + all_code_samples.extend(repo_samples) + self.logger.info(f"Extracted {len(repo_samples)} samples from {repo_url}") + except Exception as e: + self.logger.error(f"Failed to process repository {repo_url}: {str(e)}") + continue + + if not all_code_samples: + raise ValueError("No code samples extracted from any repository") + + self.logger.info(f"Total samples collected: {len(all_code_samples)}") + + # Create HuggingFace dataset + dataset = Dataset.from_list(all_code_samples) + + # Filter by sequence length + dataset = dataset.filter( + lambda x: len(x['text'].split()) <= config.model.max_seq_length + ) + + self.logger.info(f"Dataset size after filtering: {len(dataset)}") + return dataset + + def _process_single_repo(self, repo_url: str, config: AppConfig) -> List[Dict]: + """ + Process a single GitHub repository + + Args: + repo_url: GitHub repository URL + config: Application configuration + + Returns: + List of code samples with metadata + """ + temp_dir = tempfile.mkdtemp() + self.temp_dirs.append(temp_dir) + + try: + # Clone repository + repo_name = repo_url.split('/')[-1].replace('.git', '') + repo_path = os.path.join(temp_dir, repo_name) + + self.logger.info(f"Cloning {repo_url} to {repo_path}") + repo = git.Repo.clone_from(repo_url, repo_path) + + # Extract code samples + code_samples = self._extract_code_samples(repo_path, config) + + return code_samples + + finally: + # Cleanup + shutil.rmtree(temp_dir, ignore_errors=True) + + def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]: + """ + Extract code samples from a repository + + Args: + repo_path: Path to cloned repository + config: Application configuration + + Returns: + List of code samples + """ + code_samples = [] + repo_path_obj = Path(repo_path) + + # Find all code files + code_files = [] + for ext in self.CODE_EXTENSIONS: + code_files.extend(repo_path_obj.rglob(f'*{ext}')) + + self.logger.info(f"Found {len(code_files)} code files") + + for code_file in tqdm(code_files, desc="Processing code files"): + try: + if self._should_exclude_file(str(code_file.relative_to(repo_path))): + continue + + sample = self._process_code_file(code_file, repo_path_obj, config) + if sample: + code_samples.append(sample) + + except Exception as e: + self.logger.warning(f"Failed to process {code_file}: {str(e)}") +
continue + + return code_samples + + def _should_exclude_file(self, relative_path: str) -> bool: + """Check if a file should be excluded based on patterns""" + for pattern in self.EXCLUDE_PATTERNS: + if re.search(pattern, relative_path): + return True + return False + + def _process_code_file(self, file_path: Path, repo_path: Path, config: AppConfig) -> Optional[Dict]: + """ + Process a single code file into a training sample + + Args: + file_path: Path to the code file + repo_path: Path to the repository root + config: Application configuration + + Returns: + Dictionary containing the processed sample or None if invalid + """ + try: + # Read file content + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # Skip if file is too small or too large + if len(content.strip()) < 10: + return None + if len(content) > config.model.max_seq_length * 4: # Rough character limit + return None + + # Get relative path for context + relative_path = file_path.relative_to(repo_path) + + # Determine language + extension = file_path.suffix.lower() + language = self.CODE_EXTENSIONS.get(extension, 'unknown') + + # Create training sample + sample = { + 'text': content, + 'language': language, + 'file_path': str(relative_path), + 'repo_name': repo_path.name, + 'file_size': len(content), + 'line_count': len(content.splitlines()) + } + + return sample + + except Exception as e: + self.logger.warning(f"Error processing {file_path}: {str(e)}") + return None + + def cleanup(self): + """Clean up temporary directories""" + for temp_dir in self.temp_dirs: + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception as e: + self.logger.warning(f"Failed to cleanup {temp_dir}: {str(e)}") + self.temp_dirs.clear() \ No newline at end of file
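Each sample emitted above is a plain dict (`text`, `language`, `file_path`, `repo_name`, `file_size`, `line_count`), and exclusion is ordinary `re.search` over the relative path. A standalone sketch of the filter's behaviour (patterns abbreviated from `EXCLUDE_PATTERNS` above; the paths are invented for illustration):

```python
# Illustrative mirror of DatasetProcessor._should_exclude_file, not part of this patch.
import re

EXCLUDE_PATTERNS = [r'\.git/', r'__pycache__/', r'node_modules/', r'package-lock\.json$', r'\.log$']

def should_exclude(relative_path: str) -> bool:
    # A file is skipped as soon as any pattern matches anywhere in its relative path.
    return any(re.search(pattern, relative_path) for pattern in EXCLUDE_PATTERNS)

print(should_exclude("src/__pycache__/main.cpython-311.pyc"))  # True: cache directory
print(should_exclude("frontend/package-lock.json"))            # True: lockfile
print(should_exclude("src/main.py"))                           # False: kept for training
```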
diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..2e45019 --- /dev/null +++ b/src/main.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Main entry point for AI Trainer application +Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model +""" + +import argparse +import logging +import os +import sys +from pathlib import Path + +# Add src to path for imports +sys.path.append(str(Path(__file__).parent)) + +from trainer import ModelTrainer +from dataset_processor import DatasetProcessor +from config import AppConfig +from utils import setup_logging, check_gpu_memory + + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model") + + parser.add_argument( + "--config", + type=str, + default="configs/training_config.yaml", + help="Path to training configuration file" + ) + + parser.add_argument( + "--repo1", + type=str, + required=True, + help="First GitHub repository URL" + ) + + parser.add_argument( + "--repo2", + type=str, + required=True, + help="Second GitHub repository URL" + ) + + parser.add_argument( + "--output_dir", + type=str, + default="./models", + help="Directory to save trained model" + ) + + parser.add_argument( + "--log_level", + type=str, + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging level" + ) + + return parser.parse_args() + + +def main(): + """Main application entry point""" + args = parse_arguments() + + # Setup logging + setup_logging(args.log_level) + logger = logging.getLogger(__name__) + + logger.info("Starting AI Trainer for Qwen2.5-Coder-7B-Instruct-bnb-4bit") + logger.info(f"Repository 1: {args.repo1}") + logger.info(f"Repository 2: {args.repo2}") + + try: + # Check GPU memory + gpu_info = check_gpu_memory() + logger.info(f"GPU Memory Info: {gpu_info}") + + # Load configuration (AppConfig wraps the model/training/dataset/memory sections) + config = AppConfig.from_yaml(args.config) + logger.info("Configuration loaded successfully") + + # Process datasets from GitHub repositories + dataset_processor = DatasetProcessor() + logger.info("Processing datasets from GitHub repositories...") + + train_dataset = dataset_processor.process_github_repos( + repo_urls=[args.repo1, args.repo2], + config=config + ) + + logger.info(f"Dataset processed successfully. Size: {len(train_dataset)}") + + # Initialize and run trainer + trainer = ModelTrainer(config=config, output_dir=args.output_dir) + logger.info("Starting model training...") + + trained_model_path = trainer.train(train_dataset) + logger.info(f"Training completed! Model saved to: {trained_model_path}") + + except Exception as e: + logger.error(f"Training failed with error: {str(e)}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file
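The same pipeline can be driven programmatically instead of via `parse_arguments`. A minimal sketch (the repo URL and paths are placeholders; assumes `src/` is on `sys.path`, as in the runner scripts):

```python
# Illustrative programmatic use of the pipeline, not part of this patch.
import sys
from pathlib import Path

sys.path.append(str(Path("src").resolve()))

from config import AppConfig
from dataset_processor import DatasetProcessor
from trainer import ModelTrainer

config = AppConfig.from_yaml("configs/training_config.yaml")
processor = DatasetProcessor()
try:
    dataset = processor.process_github_repos(
        repo_urls=["https://github.com/user/repo1"],  # placeholder URL
        config=config,
    )
    model_path = ModelTrainer(config=config, output_dir="./models").train(dataset)
    print(f"Model saved to {model_path}")
finally:
    processor.cleanup()
```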
diff --git a/src/trainer.py b/src/trainer.py new file mode 100644 index 0000000..f9c6175 --- /dev/null +++ b/src/trainer.py @@ -0,0 +1,284 @@ +""" +Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit +Optimized for RTX3070 8GB VRAM with no CPU offloading +""" + +import logging +import os +import gc +import torch +from pathlib import Path +from typing import Optional, Dict, Any + +import torch.nn as nn +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + Trainer, + TrainingArguments, + DataCollatorForLanguageModeling +) +from datasets import Dataset +from unsloth import FastLanguageModel, is_bfloat16_supported + +from config import AppConfig +from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage + + +class ModelTrainer: + """Trainer class for fine-tuning the Qwen2.5-Coder model""" + + def __init__(self, config: AppConfig, output_dir: str = "./models"): + """ + Initialize the model trainer + + Args: + config: Application configuration + output_dir: Directory to save the trained model + """ + self.config = config + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.logger = logging.getLogger(__name__) + + # Model and tokenizer + self.model = None + self.tokenizer = None + + # Training components + self.trainer = None + + # Memory tracking + self.initial_memory = None + + def train(self, train_dataset: Dataset) -> str: + """ + Train the model on the provided dataset + + Args: + train_dataset: Dataset for training + + Returns: + Path to the saved model + """ + try: + self.logger.info("Starting model training...") + + # Check initial GPU memory + self._check_initial_setup() + + # Load model and tokenizer + self._load_model_and_tokenizer() + + # Prepare dataset + tokenized_dataset = self._prepare_dataset(train_dataset) + + # Setup trainer + self._setup_trainer(tokenized_dataset) + + # Start training + self.logger.info("Beginning training loop...") + self.trainer.train() + + # Save final model + final_model_path = self._save_model() + + self.logger.info(f"Training completed successfully! Model saved to: {final_model_path}") + return str(final_model_path) + + except Exception as e: + self.logger.error(f"Training failed: {str(e)}") + raise + finally: + self._cleanup() + + def _check_initial_setup(self): + """Check initial GPU memory and setup""" + gpu_info = check_gpu_memory() + self.logger.info(f"GPU Memory Info: {gpu_info}") + + # Store initial memory usage + self.initial_memory = get_memory_usage() + self.logger.info(f"Initial GPU memory allocated: {self.initial_memory.get('gpu_allocated_gb', 0):.2f} GB") + + # Verify CUDA availability + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available. This trainer requires a CUDA-compatible GPU.") + + self.logger.info(f"CUDA device: {torch.cuda.get_device_name()}") + self.logger.info(f"CUDA version: {torch.version.cuda}") + + def _load_model_and_tokenizer(self): + """Load the model and tokenizer with memory optimization""" + self.logger.info(f"Loading model: {self.config.model.name}") + + # Clear cache before loading + clear_gpu_cache() + + try: + # Load model with unsloth for memory efficiency + self.model, self.tokenizer = FastLanguageModel.from_pretrained( + model_name=self.config.model.name, + max_seq_length=self.config.model.max_seq_length, + dtype=None, # Auto-detect + load_in_4bit=True, # Use 4-bit quantization + token=None, # Use default token + ) + + # Configure model for training + self.model = FastLanguageModel.get_peft_model( + self.model, + r=16, # LoRA rank + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj" + ], + lora_alpha=16, + lora_dropout=0, # Supports any, but = 0 is optimized + bias="none", # Supports any, but = "none" is optimized + use_gradient_checkpointing=self.config.training.use_gradient_checkpointing, + random_state=3407, + use_rslora=False, # We support rank stabilized LoRA + loftq_config=None, # And LoftQ + ) + + self.logger.info("Model and tokenizer loaded successfully") + + except Exception as e: + self.logger.error(f"Failed to load model: {str(e)}") + raise + + def _prepare_dataset(self, train_dataset: Dataset) -> Dataset: + """Prepare and tokenize the dataset""" + self.logger.info("Preparing dataset...") + + def tokenize_function(examples): + return self.tokenizer( + examples["text"], + padding="max_length", + truncation=True, + max_length=self.config.model.max_seq_length, + return_tensors="pt" + ) + + # Tokenize dataset + tokenized_dataset = train_dataset.map( + tokenize_function, + batched=True, + remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"], + desc="Tokenizing dataset" + ) + + self.logger.info(f"Dataset tokenized. 
Size: {len(tokenized_dataset)}") + return tokenized_dataset + + def _setup_trainer(self, tokenized_dataset: Dataset): + """Setup the HuggingFace trainer with memory optimizations""" + self.logger.info("Setting up trainer...") + + # Training arguments optimized for RTX3070 8GB + training_args = TrainingArguments( + output_dir=str(self.output_dir / "checkpoints"), + num_train_epochs=self.config.training.num_train_epochs, + per_device_train_batch_size=self.config.training.per_device_train_batch_size, + gradient_accumulation_steps=self.config.training.gradient_accumulation_steps, + learning_rate=self.config.training.learning_rate, + warmup_steps=self.config.training.warmup_steps, + warmup_ratio=self.config.training.warmup_ratio, + logging_steps=self.config.training.logging_steps, + save_steps=self.config.training.save_steps, + save_total_limit=self.config.training.save_total_limit, + evaluation_strategy=self.config.training.evaluation_strategy, + eval_steps=self.config.training.eval_steps, + load_best_model_at_end=self.config.training.load_best_model_at_end, + metric_for_best_model=self.config.training.metric_for_best_model, + greater_is_better=self.config.training.greater_is_better, + optim=self.config.training.optim, + weight_decay=self.config.training.weight_decay, + lr_scheduler_type=self.config.training.lr_scheduler_type, + adam_beta1=self.config.training.adam_beta1, + adam_beta2=self.config.training.adam_beta2, + adam_epsilon=self.config.training.adam_epsilon, + max_grad_norm=self.config.training.max_grad_norm, + dataloader_num_workers=self.config.training.dataloader_num_workers, + dataloader_pin_memory=self.config.training.dataloader_pin_memory, + remove_unused_columns=self.config.training.remove_unused_columns, + bf16=self.config.training.bf16 if is_bfloat16_supported() else False, + fp16=self.config.training.fp16, + tf32=self.config.training.tf32, + report_to=self.config.training.report_to, + logging_dir=self.config.training.logging_dir, + seed=self.config.training.dataset_seed, + data_seed=self.config.training.dataset_seed, + dataloader_drop_last=True, # Better memory management + gradient_checkpointing=self.config.training.use_gradient_checkpointing, + # Memory optimization settings + ddp_find_unused_parameters=False, + per_device_eval_batch_size=self.config.training.per_device_train_batch_size, + ) + + # Data collator + data_collator = DataCollatorForLanguageModeling( + tokenizer=self.tokenizer, + mlm=False # Causal language modeling + ) + + # Initialize trainer + self.trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=tokenized_dataset, + eval_dataset=tokenized_dataset, # Using same dataset for eval (for demo) + data_collator=data_collator, + tokenizer=self.tokenizer, + ) + + self.logger.info("Trainer setup completed") + + def _save_model(self) -> Path: + """Save the trained model""" + self.logger.info("Saving model...") + + # Create final model directory + final_model_dir = self.output_dir / "final_model" + final_model_dir.mkdir(parents=True, exist_ok=True) + + try: + # Save the model + self.model.save_pretrained(str(final_model_dir)) + self.tokenizer.save_pretrained(str(final_model_dir)) + + # Save configuration + self.config.save_yaml(final_model_dir / "training_config.yaml") + + self.logger.info(f"Model saved to: {final_model_dir}") + return final_model_dir + + except Exception as e: + self.logger.error(f"Failed to save model: {str(e)}") + raise + + def _cleanup(self): + """Clean up resources""" + try: + # Clear GPU cache + clear_gpu_cache() + + # 
Force garbage collection + gc.collect() + + # Delete model and tokenizer to free memory + if self.model is not None: + del self.model + if self.tokenizer is not None: + del self.tokenizer + if self.trainer is not None: + del self.trainer + + # Final memory cleanup + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e: + self.logger.warning(f"Error during cleanup: {str(e)}") \ No newline at end of file diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..c87b677 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,319 @@ +""" +Utility functions for AI Trainer +Memory management, logging, and helper functions optimized for RTX3070 8GB VRAM +""" + +import gc +import logging +import os +import sys +from pathlib import Path +from typing import Dict, Optional, Tuple, Any + +import torch +import psutil +from colorama import init, Fore, Back, Style + +# Initialize colorama for cross-platform colored output +init(autoreset=True) + + +def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None) -> logging.Logger: + """ + Setup logging configuration with colored console output + + Args: + log_level: Logging level (DEBUG, INFO, WARNING, ERROR) + log_file: Optional log file path + + Returns: + Configured logger + """ + # Create formatter with colors + class ColoredFormatter(logging.Formatter): + COLORS = { + 'DEBUG': Fore.CYAN, + 'INFO': Fore.GREEN, + 'WARNING': Fore.YELLOW, + 'ERROR': Fore.RED, + 'CRITICAL': Fore.RED + Back.WHITE + } + + def format(self, record): + # Add color to the level name + if record.levelname in self.COLORS: + colored_levelname = f"{self.COLORS[record.levelname]}{record.levelname}{Style.RESET_ALL}" + record.levelname = colored_levelname + return super().format(record) + + # Create logger + logger = logging.getLogger() + logger.setLevel(getattr(logging, log_level.upper())) + + # Console handler with colors + console_handler = logging.StreamHandler(sys.stdout) + console_formatter = ColoredFormatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + console_handler.setFormatter(console_formatter) + logger.addHandler(console_handler) + + # File handler if specified + if log_file: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_path) + file_formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger + + +def check_gpu_memory() -> Dict[str, Any]: + """ + Check GPU memory status and availability + + Returns: + Dictionary with GPU memory information + """ + if not torch.cuda.is_available(): + return {"error": "CUDA not available"} + + try: + device = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(device).total_memory + allocated_memory = torch.cuda.memory_allocated(device) + reserved_memory = torch.cuda.memory_reserved(device) + free_memory = total_memory - allocated_memory + + return { + "device": torch.cuda.get_device_name(device), + "device_id": device, + "total_memory_gb": round(total_memory / (1024**3), 2), + "allocated_memory_gb": round(allocated_memory / (1024**3), 2), + "reserved_memory_gb": round(reserved_memory / (1024**3), 2), + "free_memory_gb": round(free_memory / (1024**3), 2), + "memory_utilization": round((allocated_memory / total_memory) * 100, 2), + "cuda_version": torch.version.cuda, + "cudnn_version": 
torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A" + } + except Exception as e: + return {"error": f"Failed to get GPU info: {str(e)}"} + + +def get_memory_usage() -> Dict[str, float]: + """ + Get system memory usage + + Returns: + Dictionary with memory usage information + """ + try: + # GPU memory + gpu_memory = check_gpu_memory() + + # System memory + system_memory = psutil.virtual_memory() + + return { + "gpu_total_gb": gpu_memory.get("total_memory_gb", 0), + "gpu_allocated_gb": gpu_memory.get("allocated_memory_gb", 0), + "gpu_free_gb": gpu_memory.get("free_memory_gb", 0), + "system_total_gb": round(system_memory.total / (1024**3), 2), + "system_available_gb": round(system_memory.available / (1024**3), 2), + "system_used_gb": round(system_memory.used / (1024**3), 2), + "system_memory_percent": system_memory.percent + } + except Exception as e: + return {"error": f"Failed to get memory usage: {str(e)}"} + + +def clear_gpu_cache(): + """Clear GPU cache and perform garbage collection""" + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + + # Force garbage collection + gc.collect() + + except Exception as e: + print(f"Warning: Failed to clear GPU cache: {str(e)}") + + +def optimize_memory_settings(): + """Apply memory optimization settings for RTX3070""" + try: + if torch.cuda.is_available(): + # Set memory fraction to prevent out-of-memory + torch.cuda.set_per_process_memory_fraction(0.85) # Use 85% of GPU memory + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Optimize CUDA memory allocator + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512' + + except Exception as e: + print(f"Warning: Failed to optimize memory settings: {str(e)}") + + +def format_bytes(bytes_value: int) -> str: + """ + Format bytes into human readable format + + Args: + bytes_value: Number of bytes + + Returns: + Formatted string (e.g., "1.5 GB") + """ + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_value < 1024.0: + return f"{bytes_value:.1f} {unit}" + bytes_value /= 1024.0 + return f"{bytes_value:.1f} PB" + + +def print_system_info(): + """Print comprehensive system information""" + print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + print(f"{Fore.CYAN}SYSTEM INFORMATION{Style.RESET_ALL}") + print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + + # GPU Information + gpu_info = check_gpu_memory() + if "error" not in gpu_info: + print(f"\n{Fore.GREEN}GPU Information:{Style.RESET_ALL}") + print(f" Device: {gpu_info['device']}") + print(f" CUDA Version: {gpu_info['cuda_version']}") + print(f" Total Memory: {gpu_info['total_memory_gb']} GB") + print(f" Allocated Memory: {gpu_info['allocated_memory_gb']} GB") + print(f" Free Memory: {gpu_info['free_memory_gb']} GB") + print(f" Memory Utilization: {gpu_info['memory_utilization']}%") + else: + print(f"\n{Fore.RED}GPU Information: {gpu_info['error']}{Style.RESET_ALL}") + + # System Memory + system_memory = psutil.virtual_memory() + print(f"\n{Fore.GREEN}System Memory:{Style.RESET_ALL}") + print(f" Total: {format_bytes(system_memory.total)}") + print(f" Available: {format_bytes(system_memory.available)}") + print(f" Used: {format_bytes(system_memory.used)}") + print(f" Usage: {system_memory.percent}%") + + # CPU Information + print(f"\n{Fore.GREEN}CPU Information:{Style.RESET_ALL}") + print(f" Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical") + print(f" CPU Usage: {psutil.cpu_percent()}%") + + print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
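For reference, the expected behaviour of the fixed `format_bytes` helper (values picked arbitrarily; assumes `src/` is on the path):

```python
# Illustrative check of format_bytes, not part of this patch.
from utils import format_bytes  # assumes src/ is on sys.path, as in the runner scripts

assert format_bytes(512) == "512.0 B"
assert format_bytes(1536) == "1.5 KB"         # 1536 / 1024
assert format_bytes(8 * 1024**3) == "8.0 GB"  # an RTX 3070's VRAM
```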
+ +def validate_environment(): + """Validate that the environment is suitable for training""" + issues = [] + + # Check CUDA availability + if not torch.cuda.is_available(): + issues.append("CUDA is not available. A CUDA-compatible GPU is required.") + + # Check GPU memory + if torch.cuda.is_available(): + gpu_info = check_gpu_memory() + if "total_memory_gb" in gpu_info: + total_memory = gpu_info["total_memory_gb"] + if total_memory < 8: + issues.append(f"GPU memory ({total_memory} GB) may be insufficient. Recommended: 8GB+") + + # Check required Python modules + required_modules = ['torch', 'transformers', 'datasets', 'git'] + for module in required_modules: + try: + __import__(module) + except ImportError: + issues.append(f"Required module '{module}' is not installed.") + + if issues: + print(f"\n{Fore.YELLOW}Environment Validation Issues:{Style.RESET_ALL}") + for issue in issues: + print(f" - {issue}") + return False + + print(f"\n{Fore.GREEN}Environment validation passed!{Style.RESET_ALL}") + return True + + +def create_training_summary(config, training_time: float, final_model_path: str) -> str: + """ + Create a summary of the training session + + Args: + config: Training configuration + training_time: Training time in seconds + final_model_path: Path to the saved model + + Returns: + Formatted summary string + """ + summary = f""" +{Fore.CYAN}{'='*60}{Style.RESET_ALL} +TRAINING SUMMARY +{Fore.CYAN}{'='*60}{Style.RESET_ALL} + +Configuration: + Model: {config.model.name} + Epochs: {config.training.num_train_epochs} + Batch Size: {config.training.per_device_train_batch_size} + Gradient Accumulation: {config.training.gradient_accumulation_steps} + Learning Rate: {config.training.learning_rate} + Max Sequence Length: {config.model.max_seq_length} + +Performance: + Training Time: {training_time:.2f} seconds ({training_time/3600:.2f} hours) + Effective Batch Size: {config.training.per_device_train_batch_size * config.training.gradient_accumulation_steps} + +Output: + Model Saved To: {final_model_path} + +Memory Settings: + Gradient Checkpointing: {config.training.use_gradient_checkpointing} + CPU Offloading: {config.training.offload_to_cpu} + BF16 Enabled: {config.training.bf16} + +{Fore.CYAN}{'='*60}{Style.RESET_ALL} +""" + + return summary + + +def safe_import(module_name: str, fallback: Any = None): + """ + Safely import a module with fallback + + Args: + module_name: Name of the module to import + fallback: Fallback value if import fails + + Returns: + Imported module or fallback + """ + try: + return __import__(module_name) + except ImportError: + return fallback + + +# Initialize memory optimization settings on import +try: + optimize_memory_settings() +except Exception: + pass # Ignore errors during initialization \ No newline at end of file
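Once a run finishes, the LoRA adapters written to `models/final_model` can be loaded back for a quick smoke test. A minimal sketch (the prompt and generation settings are arbitrary, and this assumes the Unsloth APIs pinned in `requirements.txt`):

```python
# Illustrative post-training smoke test, not part of this patch.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./models/final_model",  # directory written by ModelTrainer._save_model
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch the PEFT model to inference mode

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```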