# AI Trainer for Qwen Models on Google Colab (T4 GPU)

This notebook fine-tunes Qwen models on code from GitHub repositories using Google Colab's T4 GPU (16 GB VRAM, of which roughly 15 GB is usable).

## 1. Setup Environment

First, let's install the required dependencies.

In [None]:
# Install required packages
!pip install unsloth[cu121] bitsandbytes
!pip install transformers datasets
!pip install accelerate peft
!pip install GitPython PyYAML

In [None]:
# Configure the process environment for GPU work. These values are written
# before torch / unsloth are imported in the following cells.
import os

os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',                           # expose only the first GPU
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512',    # CUDA allocator block-split limit
    'TOKENIZERS_PARALLELISM': 'false',                     # turn off tokenizer thread pools
    'DISABLE_TORCH_COMPILE': '1',                          # NOTE(review): consumer of this flag is not visible here
})

print("Environment variables set successfully!")

## 2. Import Libraries

Let's import all necessary libraries.

In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import git
from pathlib import Path

## 3. Configuration

Configuration tuned for Colab's T4 GPU (16 GB VRAM, roughly 15 GB usable).

In [None]:
# Model configuration
MODEL_NAME = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048

# bfloat16 requires an Ampere-or-newer GPU (CUDA compute capability >= 8.0).
# Colab's T4 is a Turing card (7.5), so hard-coding bf16=True would make the
# trainer request an unsupported precision. Detect support at runtime instead;
# the trainer cell derives fp16 as the opposite of this flag.
BF16_SUPPORTED = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
)

# Training configuration sized for a single Colab T4 GPU
TRAINING_CONFIG = {
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 8,   # effective batch size = 1 * 8
    'max_steps': 100,
    'learning_rate': 2e-4,
    'use_gradient_checkpointing': True,
    'bf16': BF16_SUPPORTED,
}

## 4. Load Model

Load the Qwen model with Unsloth for memory efficiency.

In [None]:
# Fetch the pre-quantized checkpoint together with its tokenizer.
# dtype=None leaves the dtype choice to the library (presumably auto-detected
# from the GPU - confirm against the Unsloth docs).
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    dtype=None,
)

# Projection layers that receive LoRA adapters
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters; `model` is what gets trained
model = FastLanguageModel.get_peft_model(
    base_model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing=TRAINING_CONFIG['use_gradient_checkpointing'],
    random_state=3407,
)

## 5. Process GitHub Repositories

Extract code from GitHub repositories for training.

In [None]:
def process_github_repo(repo_url, max_files=10):
    """Clone a GitHub repository and turn its Python files into a dataset.

    The repository is shallow-cloned (depth=1) into a temporary directory that
    is removed when the function returns.

    Args:
        repo_url: Clone URL of the repository. A trailing slash and a trailing
            ``.git`` suffix are both tolerated.
        max_files: Maximum number of samples to collect. ``None`` means no
            limit. Defaults to 10.

    Returns:
        A ``datasets.Dataset`` with one row per collected file and the columns
        ``text``, ``repo_name`` and ``file_path`` (relative to the repo root).

    Raises:
        ValueError: If no repository name can be derived from ``repo_url``.
    """
    import tempfile
    from datasets import Dataset

    # Take the last path segment and strip only a *trailing* ".git", so names
    # that merely contain ".git" (e.g. "my.github-tools") stay intact.
    repo_name = repo_url.rstrip('/').rsplit('/', 1)[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")

    with tempfile.TemporaryDirectory() as temp_dir:
        repo_path = Path(temp_dir) / repo_name

        print(f"Cloning {repo_url}...")
        git.Repo.clone_from(repo_url, str(repo_path), depth=1)

        # Sort so the selected files are the same on every run; rglob order
        # follows the filesystem and is otherwise arbitrary. Symlinks are
        # skipped so a repository cannot pull in files from outside the clone.
        py_files = sorted(
            path for path in repo_path.rglob('*.py')
            if path.is_file() and not path.is_symlink()
        )

        code_samples = []
        for py_file in py_files:
            if max_files is not None and len(code_samples) >= max_files:
                break
            try:
                content = py_file.read_text(encoding='utf-8', errors='ignore')
            except OSError as e:
                print(f"Error processing {py_file}: {e}")
                continue

            if len(content.strip()) > 10:  # Skip tiny files
                code_samples.append({
                    'text': content,
                    'repo_name': repo_name,
                    'file_path': str(py_file.relative_to(repo_path)),
                })

        return Dataset.from_list(code_samples)

# Example usage (replace with your own repositories)
# dataset = process_github_repo("https://github.com/your-username/your-repo.git")

## 5.1 Advanced Dataset Processing

For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:

In [None]:
class AdvancedDatasetProcessor:
    """Build a training dataset from the source files of GitHub repositories.

    Each repository is shallow-cloned into a temporary directory, scanned for
    files whose extension is listed in ``CODE_EXTENSIONS``, filtered through
    ``EXCLUDE_PATTERNS`` and size limits, and converted to one sample per file.
    """

    # Supported file extensions mapped to the language label stored per sample
    CODE_EXTENSIONS = {
        '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
        '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',
        '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',
        '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',
        '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',
        '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',
        '.md': 'markdown'
    }

    # Regular expressions searched in the '/'-separated relative path of a
    # file; any match excludes the file.
    EXCLUDE_PATTERNS = (
        r'\.git/', r'__pycache__/', r'node_modules/',
        r'\.venv/', r'venv/', r'package-lock\.json$',
        r'\.log$', r'\.tmp$', r'~\$.*', r'\.swp$',
        r'\.DS_Store', r'\.pyc$',
    )

    # Files whose stripped content is shorter than this are skipped
    MIN_CONTENT_CHARS = 10
    # Files whose content is longer than this are skipped (rough limit)
    MAX_CONTENT_CHARS = 100000

    def __init__(self):
        pass

    def process_github_repos(self, repo_urls, max_files_per_repo=50):
        """Process multiple GitHub repositories into a training dataset.

        A repository that fails to clone or process is reported and skipped so
        the remaining repositories are still handled.

        Args:
            repo_urls: Iterable of repository clone URLs.
            max_files_per_repo: Maximum number of samples taken from each
                repository; ``None`` means no limit.

        Returns:
            A ``datasets.Dataset`` containing the samples of all repositories.

        Raises:
            ValueError: If no sample could be extracted from any repository.
        """
        all_code_samples = []

        for repo_url in repo_urls:
            try:
                print(f"Processing repository: {repo_url}")
                repo_samples = self._process_single_repo(repo_url, max_files_per_repo)
                all_code_samples.extend(repo_samples)
                print(f"Extracted {len(repo_samples)} samples from {repo_url}")
            except Exception as e:
                # Deliberate best effort: one bad repository must not abort the run
                print(f"Failed to process repository {repo_url}: {str(e)}")
                continue

        if not all_code_samples:
            raise ValueError("No code samples extracted from any repository")

        print(f"Total samples collected: {len(all_code_samples)}")

        # Create HuggingFace dataset
        from datasets import Dataset
        return Dataset.from_list(all_code_samples)

    @staticmethod
    def _repo_name_from_url(repo_url):
        """Return the repository name encoded in a clone URL.

        Handles a trailing slash and strips only a trailing ``.git`` suffix.

        Raises:
            ValueError: If the URL yields an empty name.
        """
        name = repo_url.rstrip('/').rsplit('/', 1)[-1]
        if name.endswith('.git'):
            name = name[:-len('.git')]
        if not name:
            raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")
        return name

    def _process_single_repo(self, repo_url, max_files_per_repo):
        """Shallow-clone one repository into a temp dir and extract its samples.

        The temporary clone is deleted when this method returns. Exceptions
        from cloning or extraction propagate to the caller.
        """
        import tempfile

        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                repo_name = self._repo_name_from_url(repo_url)
                repo_path = Path(temp_dir) / repo_name

                print(f"Cloning {repo_url}...")
                git.Repo.clone_from(repo_url, str(repo_path), depth=1)

                return self._extract_code_samples(repo_path, max_files_per_repo)

            finally:
                # Printed on success and on failure alike
                print(f"Finished processing {repo_url}")

    def _extract_code_samples(self, repo_path, max_files_per_repo):
        """Extract up to ``max_files_per_repo`` samples from a cloned repository.

        Candidate files are found in a single tree walk, sorted by path so the
        selection is reproducible, and passed through the exclusion patterns
        *before* the limit is applied, so excluded or undersized files do not
        use up the quota.

        Args:
            repo_path: Root directory of the clone (``str`` or ``Path``).
            max_files_per_repo: Maximum number of samples; ``None`` = no limit.

        Returns:
            List of sample dicts as produced by ``_process_code_file``.
        """
        repo_root = Path(repo_path)

        # One walk instead of one rglob per extension. Symlinks are skipped so
        # a repository cannot reference files outside the clone.
        code_files = sorted(
            path for path in repo_root.rglob('*')
            if path.suffix.lower() in self.CODE_EXTENSIONS
            and path.is_file()
            and not path.is_symlink()
        )

        print(f"Found {len(code_files)} code files")

        code_samples = []
        for code_file in code_files:
            # Limit samples per repo to prevent memory issues
            if max_files_per_repo is not None and len(code_samples) >= max_files_per_repo:
                break
            try:
                # as_posix() keeps the '/'-based patterns valid on every OS
                if self._should_exclude_file(code_file.relative_to(repo_root).as_posix()):
                    continue

                sample = self._process_code_file(code_file, repo_root)
                if sample:
                    code_samples.append(sample)

            except Exception as e:
                print(f"Failed to process {code_file}: {str(e)}")
                continue

        return code_samples

    def _should_exclude_file(self, relative_path):
        """Return True when ``relative_path`` matches any exclusion pattern.

        Args:
            relative_path: Path relative to the repository root, as a string
                using '/' separators.
        """
        import re
        return any(
            re.search(pattern, relative_path) for pattern in self.EXCLUDE_PATTERNS
        )

    def _process_code_file(self, file_path, repo_path):
        """Read one file and build its training sample.

        Args:
            file_path: ``Path`` of the file to read.
            repo_path: Root of the repository containing the file.

        Returns:
            A dict with ``text``, ``language``, ``file_path``, ``repo_name``,
            ``file_size`` (characters) and ``line_count``; or ``None`` when the
            file is too small, too large, or cannot be processed.
        """
        try:
            repo_root = Path(repo_path)

            # Undecodable bytes are dropped rather than failing the whole file
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Skip if file is too small or too large
            if len(content.strip()) < self.MIN_CONTENT_CHARS:
                return None
            if len(content) > self.MAX_CONTENT_CHARS:
                return None

            # Extension lookup is case-insensitive
            language = self.CODE_EXTENSIONS.get(file_path.suffix.lower(), 'unknown')

            return {
                'text': content,
                'language': language,
                'file_path': str(file_path.relative_to(repo_root)),
                'repo_name': repo_root.name,
                'file_size': len(content),
                'line_count': len(content.splitlines())
            }

        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
            return None

# Example usage:
# processor = AdvancedDatasetProcessor()
# dataset = processor.process_github_repos([
#     "https://github.com/karpathy/nanoGPT.git"
# ])

## 6. Training

Set up and run the training process.

In [None]:
# Tiny in-memory placeholder corpus so the remaining cells can run end to end.
# Swap it for a dataset built from your own repositories.
from datasets import Dataset

example_snippets = (
    "def hello_world():\n    print('Hello, World!')",
    "class Calculator:\n    def add(self, a, b):\n        return a + b",
    "import numpy as np\n\narr = np.array([1, 2, 3])\nprint(arr)",
)

# One record per snippet, stored under the "text" column
example_data = [{"text": snippet} for snippet in example_snippets]

dataset = Dataset.from_list(example_data)
print(f"Example dataset created with {len(dataset)} samples")

In [None]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch.

    Every row is truncated and padded to exactly 512 tokens. This is a simple
    baseline; replace it with a more sophisticated approach for your use case.
    """
    # NOTE(review): the fixed 512 differs from MAX_SEQ_LENGTH (2048) - confirm intent
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# batched=True passes batches of rows to the function instead of single rows
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Optimisation and logging settings, taken from TRAINING_CONFIG where available
training_args = TrainingArguments(
    output_dir="./model_output",
    max_steps=TRAINING_CONFIG['max_steps'],
    per_device_train_batch_size=TRAINING_CONFIG['per_device_train_batch_size'],
    gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],
    learning_rate=TRAINING_CONFIG['learning_rate'],
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch",
    # Exactly one of the two mixed-precision modes is enabled
    bf16=TRAINING_CONFIG['bf16'],
    fp16=not TRAINING_CONFIG['bf16'],
    logging_steps=1,
    save_steps=50,
)

# Supervised fine-tuning trainer.
# NOTE(review): train_dataset is already tokenized, yet dataset_text_field and
# packing are also passed - confirm how the installed TRL version treats this.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=True,
    args=training_args,
)

In [None]:
# Start training: runs the trainer configured in the previous cell.
# The two prints bracket the run so its start and end are visible in the output.
print("Starting training...")
trainer.train()
print("Training completed!")

## 7. Save Model

Save the trained model.

In [None]:
# Write the model and the tokenizer into the same output directory
save_dir = "./trained_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Model saved successfully!")