diff --git a/ai_trainer_t4_colab.ipynb b/ai_trainer_t4_colab.ipynb
new file mode 100644
index 0000000..838f56b
--- /dev/null
+++ b/ai_trainer_t4_colab.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# AI Trainer for Qwen Models on Google Colab (T4 GPU)\n",
+    "\n",
+    "This notebook lets you fine-tune Qwen models on code from GitHub repositories using Google Colab's T4 GPU (16GB VRAM)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup Environment\n",
+    "\n",
+    "First, let's install the required dependencies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install required packages\n",
+    "!pip install unsloth[cu121] bitsandbytes\n",
+    "!pip install transformers datasets\n",
+    "!pip install accelerate peft\n",
+    "!pip install GitPython PyYAML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set environment variables for optimal GPU performance\n",
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
+    "os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'\n",
+    "os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n",
+    "os.environ['DISABLE_TORCH_COMPILE'] = '1'\n",
+    "\n",
+    "print(\"Environment variables set successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Import Libraries\n",
+    "\n",
+    "Let's import all necessary libraries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from unsloth import FastLanguageModel\n",
+    "from trl import SFTTrainer\n",
+    "from transformers import TrainingArguments\n",
+    "import git\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Configuration\n",
+    "\n",
+    "Configuration optimized for the T4 GPU (16GB VRAM)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model configuration\n",
+    "MODEL_NAME = \"unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit\"\n",
+    "MAX_SEQ_LENGTH = 2048\n",
+    "\n",
+    "# Training configuration for T4 GPU (16GB VRAM)\n",
+    "TRAINING_CONFIG = {\n",
+    "    'per_device_train_batch_size': 1,\n",
+    "    'gradient_accumulation_steps': 8,\n",
+    "    'max_steps': 100,\n",
+    "    'learning_rate': 2e-4,\n",
+    "    'use_gradient_checkpointing': True,\n",
+    "    'bf16': False  # T4 (compute capability 7.5) has no bfloat16 support; fp16 is used instead\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Load Model\n",
+    "\n",
+    "Load the Qwen model with Unsloth for memory efficiency."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load model and tokenizer\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=MODEL_NAME,\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    "    dtype=None,\n",
+    "    load_in_4bit=True,\n",
+    ")\n",
+    "\n",
+    "# Configure model for training\n",
+    "model = FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r=16,\n",
+    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+    "    lora_alpha=16,\n",
+    "    lora_dropout=0,\n",
+    "    bias=\"none\",\n",
+    "    use_gradient_checkpointing=TRAINING_CONFIG['use_gradient_checkpointing'],\n",
+    "    random_state=3407,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Process GitHub Repositories\n",
+    "\n",
+    "Extract code from GitHub repositories for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_github_repo(repo_url):\n",
+    "    \"\"\"Process a GitHub repository and extract code samples\"\"\"\n",
+    "    import tempfile\n",
+    "    from datasets import Dataset\n",
+    "    \n",
+    "    with tempfile.TemporaryDirectory() as temp_dir:\n",
+    "        # Clone repository\n",
+    "        repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
+    "        repo_path = f\"{temp_dir}/{repo_name}\"\n",
+    "        \n",
+    "        print(f\"Cloning {repo_url}...\")\n",
+    "        repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
+    "        \n",
+    "        # Extract Python files as example\n",
+    "        code_samples = []\n",
+    "        py_files = Path(repo_path).rglob('*.py')\n",
+    "        \n",
+    "        for py_file in list(py_files)[:10]:  # Limit to first 10 files\n",
+    "            try:\n",
+    "                with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:\n",
+    "                    content = f.read()\n",
+    "                \n",
+    "                if len(content.strip()) > 10:  # Skip tiny files\n",
+    "                    code_samples.append({\n",
+    "                        'text': content,\n",
+    "                        'repo_name': repo_name,\n",
+    "                        'file_path': str(py_file.relative_to(repo_path))\n",
+    "                    })\n",
+    "            except Exception as e:\n",
+    "                print(f\"Error processing {py_file}: {e}\")\n",
+    "                continue\n",
+    "    \n",
+    "    return Dataset.from_list(code_samples)\n",
+    "\n",
+    "# Example usage (replace with your own repositories)\n",
+    "# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Training\n",
+    "\n",
+    "Set up and run the training process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a simple example dataset if you don't have your own\n",
+    "from datasets import Dataset\n",
+    "\n",
+    "# Example dataset - replace with your own data\n",
+    "example_data = [\n",
+    "    {\"text\": \"def hello_world():\\n    print('Hello, World!')\"},\n",
+    "    {\"text\": \"class Calculator:\\n    def add(self, a, b):\\n        return a + b\"},\n",
+    "    {\"text\": \"import numpy as np\\n\\narr = np.array([1, 2, 3])\\nprint(arr)\"}\n",
+    "]\n",
+    "\n",
+    "dataset = Dataset.from_list(example_data)\n",
+    "print(f\"Example dataset created with {len(dataset)} samples\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tokenize dataset\n",
+    "def tokenize_function(examples):\n",
+    "    # Simple tokenization - replace with more sophisticated approach for your use case\n",
+    "    return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n",
+    "\n",
+    "tokenized_dataset = dataset.map(tokenize_function, batched=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up trainer\n",
+    "trainer = SFTTrainer(\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    train_dataset=tokenized_dataset,\n",
+    "    dataset_text_field=\"text\",\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    "    packing=True,\n",
+    "    args=TrainingArguments(\n",
+    "        per_device_train_batch_size=TRAINING_CONFIG['per_device_train_batch_size'],\n",
+    "        gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],\n",
+    "        max_steps=TRAINING_CONFIG['max_steps'],\n",
+    "        learning_rate=TRAINING_CONFIG['learning_rate'],\n",
+    "        fp16=not TRAINING_CONFIG['bf16'],\n",
+    "        bf16=TRAINING_CONFIG['bf16'],\n",
+    "        logging_steps=1,\n",
+    "        save_steps=50,\n",
+    "        output_dir=\"./model_output\",\n",
+    "        optim=\"adamw_torch\",\n",
+    "        lr_scheduler_type=\"cosine\",\n",
+    "        warmup_ratio=0.1,\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start training\n",
+    "print(\"Starting training...\")\n",
+    "trainer.train()\n",
+    "print(\"Training completed!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. Save Model\n",
+    "\n",
+    "Save the trained model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the model\n",
+    "model.save_pretrained(\"./trained_model\")\n",
+    "tokenizer.save_pretrained(\"./trained_model\")\n",
+    "print(\"Model saved successfully!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/configs/training_config.yaml b/configs/training_config.yaml
index 7602927..7a4ecbd 100644
--- a/configs/training_config.yaml
+++ b/configs/training_config.yaml
@@ -16,7 +16,7 @@ training:
   max_steps: 50
 
   # Training parameters
-  num_train_epochs: 3
+  num_train_epochs: 1
   learning_rate: 2.0e-4
   warmup_steps: 10
   warmup_ratio: 0.1
diff --git a/src/config.py b/src/config.py
index cdace58..cbcf162 100644
--- a/src/config.py
+++ b/src/config.py
@@ -28,7 +28,7 @@ class TrainingConfig:
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
     max_steps: int = 10
-    num_train_epochs: int = 3
+    num_train_epochs: int = 2
     learning_rate: float = 2e-4
     warmup_steps: int = 10
     logging_steps: int = 1
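The notebook's final cell saves the LoRA adapters to `./trained_model` but never reloads them. A minimal inference sketch, assuming Unsloth can load that adapter directory directly with the same `from_pretrained` call used earlier; the path and prompt below are illustrative, not part of the notebook:

```python
from unsloth import FastLanguageModel

# Reload the base model plus the LoRA adapters saved by the notebook's final cell
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./trained_model",  # assumed save location from the notebook
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Switch Unsloth into its faster inference mode
FastLanguageModel.for_inference(model)

# Illustrative prompt; replace with text similar to your training data
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```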