ai_github_trainer/ai_trainer_t4_colab.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AI Trainer for Qwen Models on Google Colab (T4 GPU)\n",
    "\n",
    "This notebook allows you to train Qwen models on GitHub repositories using Google Colab's T4 GPU with 13GB VRAM."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup Environment\n",
    "\n",
    "First, let's install the required dependencies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\n",
    "!pip install unsloth[cu121] bitsandbytes\n",
    "!pip install transformers datasets\n",
    "!pip install accelerate peft\n",
    "!pip install GitPython PyYAML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set environment variables for optimal GPU performance\n",
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
    "os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'\n",
    "os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n",
    "os.environ['DISABLE_TORCH_COMPILE'] = '1'\n",
    "\n",
    "print(\"Environment variables set successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Import Libraries\n",
    "\n",
    "Let's import all necessary libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from unsloth import FastLanguageModel\n",
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "import git\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Configuration\n",
    "\n",
    "Configuration optimized for T4 GPU with 13GB VRAM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model configuration\n",
    "MODEL_NAME = \"unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit\"\n",
    "MAX_SEQ_LENGTH = 2048\n",
    "\n",
    "# Training configuration for T4 GPU (13GB VRAM)\n",
    "TRAINING_CONFIG = {\n",
    "    'per_device_train_batch_size': 1,\n",
    "    'gradient_accumulation_steps': 8,\n",
    "    'max_steps': 100,\n",
    "    'learning_rate': 2e-4,\n",
    "    'use_gradient_checkpointing': True,\n",
    "    'bf16': True\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Load Model\n",
    "\n",
    "Load the Qwen model with Unsloth for memory efficiency."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model and tokenizer\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=MODEL_NAME,\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    dtype=None,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "\n",
    "# Configure model for training\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r=16,\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    "    lora_alpha=16,\n",
    "    lora_dropout=0,\n",
    "    bias=\"none\",\n",
    "    use_gradient_checkpointing=TRAINING_CONFIG['use_gradient_checkpointing'],\n",
    "    random_state=3407,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Process GitHub Repositories\n",
    "\n",
    "Extract code from GitHub repositories for training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_github_repo(repo_url):\n",
    "    \"\"\"Process a GitHub repository and extract code samples\"\"\"\n",
    "    import tempfile\n",
    "    from datasets import Dataset\n",
    "    \n",
    "    with tempfile.TemporaryDirectory() as temp_dir:\n",
    "        # Clone repository\n",
    "        repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
    "        repo_path = f\"{temp_dir}/{repo_name}\"\n",
    "        \n",
    "        print(f\"Cloning {repo_url}...\")\n",
    "        repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
    "        \n",
    "        # Extract Python files as example\n",
    "        code_samples = []\n",
    "        py_files = Path(repo_path).rglob('*.py')\n",
    "        \n",
    "        for py_file in list(py_files)[:10]:  # Limit to first 10 files\n",
    "            try:\n",
    "                with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:\n",
    "                    content = f.read()\n",
    "                \n",
    "                if len(content.strip()) > 10:  # Skip tiny files\n",
    "                    code_samples.append({\n",
    "                        'text': content,\n",
    "                        'repo_name': repo_name,\n",
    "                        'file_path': str(py_file.relative_to(repo_path))\n",
    "                    })\n",
    "            except Exception as e:\n",
    "                print(f\"Error processing {py_file}: {e}\")\n",
    "                continue\n",
    "        \n",
    "        return Dataset.from_list(code_samples)\n",
    "\n",
    "# Example usage (replace with your own repositories)\n",
    "# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1 Advanced Dataset Processing\n",
    "\n",
    "For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class AdvancedDatasetProcessor:\n",
    "    \"\"\"Advanced processor for GitHub repositories with comprehensive file support\"\"\"\n",
    "    \n",
    "    # Supported file extensions\n",
    "    CODE_EXTENSIONS = {\n",
    "        '.py': 'python', '.js': 'javascript', '.ts': 'typescript',\n",
    "        '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',\n",
    "        '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',\n",
    "        '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',\n",
    "        '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',\n",
    "        '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',\n",
    "        '.md': 'markdown'\n",
    "    }\n",
    "    \n",
    "    def __init__(self):\n",
    "        pass\n",
    "        \n",
    "    def process_github_repos(self, repo_urls, max_files_per_repo=50):\n",
    "        \"\"\"Process multiple GitHub repositories into a training dataset\"\"\"\n",
    "        all_code_samples = []\n",
    "        \n",
    "        for repo_url in repo_urls:\n",
    "            try:\n",
    "                print(f\"Processing repository: {repo_url}\")\n",
    "                repo_samples = self._process_single_repo(repo_url, max_files_per_repo)\n",
    "                all_code_samples.extend(repo_samples)\n",
    "                print(f\"Extracted {len(repo_samples)} samples from {repo_url}\")\n",
    "            except Exception as e:\n",
    "                print(f\"Failed to process repository {repo_url}: {str(e)}\")\n",
    "                continue\n",
    "        \n",
    "        if not all_code_samples:\n",
    "            raise ValueError(\"No code samples extracted from any repository\")\n",
    "        \n",
    "        print(f\"Total samples collected: {len(all_code_samples)}\")\n",
    "        \n",
    "        # Create HuggingFace dataset\n",
    "        from datasets import Dataset\n",
    "        dataset = Dataset.from_list(all_code_samples)\n",
    "        return dataset\n",
    "    \n",
    "    def _process_single_repo(self, repo_url, max_files_per_repo):\n",
    "        \"\"\"Process a single GitHub repository\"\"\"\n",
    "        import tempfile\n",
    "        \n",
    "        with tempfile.TemporaryDirectory() as temp_dir:\n",
    "            try:\n",
    "                # Clone repository\n",
    "                repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
    "                repo_path = f\"{temp_dir}/{repo_name}\"\n",
    "                \n",
    "                print(f\"Cloning {repo_url}...\")\n",
    "                repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
    "                \n",
    "                # Extract code samples\n",
    "                code_samples = self._extract_code_samples(repo_path, max_files_per_repo)\n",
    "                \n",
    "                return code_samples\n",
    "                \n",
    "            finally:\n",
    "                print(f\"Finished processing {repo_url}\")\n",
    "    \n",
    "    def _extract_code_samples(self, repo_path, max_files_per_repo):\n",
    "        \"\"\"Extract code samples from a repository\"\"\"\n",
    "        code_samples = []\n",
    "        repo_path_obj = Path(repo_path)\n",
    "        \n",
    "        # Find all code files\n",
    "        code_files = []\n",
    "        for ext in self.CODE_EXTENSIONS:\n",
    "            code_files.extend(repo_path_obj.rglob(f'*{ext}'))\n",
    "        \n",
    "        print(f\"Found {len(code_files)} code files\")\n",
    "        \n",
    "        # Limit files per repo to prevent memory issues\n",
    "        code_files = code_files[:max_files_per_repo]\n",
    "        \n",
    "        for code_file in code_files:\n",
    "            try:\n",
    "                if self._should_exclude_file(str(code_file.relative_to(repo_path))):\n",
    "                    continue\n",
    "                \n",
    "                sample = self._process_code_file(code_file, repo_path_obj)\n",
    "                if sample:\n",
    "                    code_samples.append(sample)\n",
    "                    \n",
    "            except Exception as e:\n",
    "                print(f\"Failed to process {code_file}: {str(e)}\")\n",
    "                continue\n",
    "        \n",
    "        return code_samples\n",
    "    \n",
    "    def _should_exclude_file(self, relative_path):\n",
    "        \"\"\"Check if a file should be excluded based on patterns\"\"\"\n",
    "        import re\n",
    "        exclude_patterns = [\n",
    "            r'\\.git/', r'__pycache__/', r'node_modules/',\n",
    "            r'\\.venv/', r'venv/', r'package-lock\\.json$',\n",
    "            r'\\.log$', r'\\.tmp$', r'~\\$.*', r'\\.swp$',\n",
    "            r'\\.DS_Store', r'\\.pyc$'\n",
    "        ]\n",
    "        for pattern in exclude_patterns:\n",
    "            if re.search(pattern, relative_path):\n",
    "                return True\n",
    "        return False\n",
    "    \n",
    "    def _process_code_file(self, file_path, repo_path):\n",
    "        \"\"\"Process a single code file into a training sample\"\"\"\n",
    "        try:\n",
    "            # Read file content\n",
    "            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
    "                content = f.read()\n",
    "            \n",
    "            # Skip if file is too small or too large\n",
    "            if len(content.strip()) < 10:\n",
    "                return None\n",
    "            if len(content) > 100000:  # Rough limit\n",
    "                return None\n",
    "            \n",
    "            # Get relative path for context\n",
    "            relative_path = file_path.relative_to(repo_path)\n",
    "            \n",
    "            # Determine language\n",
    "            extension = file_path.suffix.lower()\n",
    "            language = self.CODE_EXTENSIONS.get(extension, 'unknown')\n",
    "            \n",
    "            # Create training sample\n",
    "            sample = {\n",
    "                'text': content,\n",
    "                'language': language,\n",
    "                'file_path': str(relative_path),\n",
    "                'repo_name': repo_path.name,\n",
    "                'file_size': len(content),\n",
    "                'line_count': len(content.splitlines())\n",
    "            }\n",
    "            \n",
    "            return sample\n",
    "            \n",
    "        except Exception as e:\n",
    "            print(f\"Error processing {file_path}: {str(e)}\")\n",
    "            return None\n",
    "\n",
    "# Example usage:\n",
    "# processor = AdvancedDatasetProcessor()\n",
    "# dataset = processor.process_github_repos([\n",
    "#     \"https://github.com/karpathy/nanoGPT.git\"\n",
    "# ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Training\n",
    "\n",
    "Set up and run the training process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a simple example dataset if you don't have your own\n",
    "from datasets import Dataset\n",
    "\n",
    "# Example dataset - replace with your own data\n",
    "example_data = [\n",
    "    {\"text\": \"def hello_world():\\n    print('Hello, World!')\"},\n",
    "    {\"text\": \"class Calculator:\\n    def add(self, a, b):\\n        return a + b\"},\n",
    "    {\"text\": \"import numpy as np\\n\\narr = np.array([1, 2, 3])\\nprint(arr)\"}\n",
    "]\n",
    "\n",
    "dataset = Dataset.from_list(example_data)\n",
    "print(f\"Example dataset created with {len(dataset)} samples\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize dataset\n",
    "def tokenize_function(examples):\n",
    "    # Simple tokenization - replace with more sophisticated approach for your use case\n",
    "    return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n",
    "\n",
    "tokenized_dataset = dataset.map(tokenize_function, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up trainer\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=tokenized_dataset,\n",
    "    dataset_text_field=\"text\",\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    packing=True,\n",
    "    args=TrainingArguments(\n",
    "        per_device_train_batch_size=TRAINING_CONFIG['per_device_train_batch_size'],\n",
    "        gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],\n",
    "        max_steps=TRAINING_CONFIG['max_steps'],\n",
    "        learning_rate=TRAINING_CONFIG['learning_rate'],\n",
    "        fp16=not TRAINING_CONFIG['bf16'],\n",
    "        bf16=TRAINING_CONFIG['bf16'],\n",
    "        logging_steps=1,\n",
    "        save_steps=50,\n",
    "        output_dir=\"./model_output\",\n",
    "        optim=\"adamw_torch\",\n",
    "        lr_scheduler_type=\"cosine\",\n",
    "        warmup_ratio=0.1,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start training\n",
    "print(\"Starting training...\")\n",
    "trainer.train()\n",
    "print(\"Training completed!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Save Model\n",
    "\n",
    "Save the trained model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model\n",
    "model.save_pretrained(\"./trained_model\")\n",
    "tokenizer.save_pretrained(\"./trained_model\")\n",
    "print(\"Model saved successfully!\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}