create single notebook file for google colab
This commit is contained in:
parent
6ef5aa08e0
commit
81c8524809
313
ai_trainer_t4_colab.ipynb
Normal file
313
ai_trainer_t4_colab.ipynb
Normal file
@ -0,0 +1,313 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# AI Trainer for Qwen Models on Google Colab (T4 GPU)\n",
|
||||
"\n",
|
||||
"This notebook lets you fine-tune Qwen models on code from GitHub repositories using Google Colab's T4 GPU (16GB VRAM, roughly 15GB usable)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Setup Environment\n",
|
||||
"\n",
|
||||
"First, let's install the required dependencies."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install required packages. %pip installs into the running kernel's environment;\n",
|
||||
"# the extras spec is quoted so the shell cannot glob-expand the brackets.\n",
|
||||
"%pip install -q \"unsloth[cu121]\" bitsandbytes\n",
|
||||
"# trl is listed explicitly because the import cell uses `from trl import SFTTrainer`.\n",
|
||||
"%pip install -q transformers datasets trl\n",
|
||||
"%pip install -q accelerate peft\n",
|
||||
"%pip install -q GitPython PyYAML"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# GPU/runtime environment variables, applied in one step before the\n",
|
||||
"# import cell below loads torch and the tokenizer libraries.\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ.update({\n",
|
||||
"    'CUDA_VISIBLE_DEVICES': '0',\n",
|
||||
"    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512',\n",
|
||||
"    'TOKENIZERS_PARALLELISM': 'false',\n",
|
||||
"    'DISABLE_TORCH_COMPILE': '1',\n",
|
||||
"})\n",
|
||||
"\n",
|
||||
"print(\"Environment variables set successfully!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Import Libraries\n",
|
||||
"\n",
|
||||
"Let's import all necessary libraries."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from unsloth import FastLanguageModel, is_bfloat16_supported\n",
|
||||
"from trl import SFTTrainer\n",
|
||||
"from transformers import TrainingArguments\n",
|
||||
"import git\n",
|
||||
"from pathlib import Path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Configuration\n",
|
||||
"\n",
|
||||
"Configuration optimized for the T4 GPU (16GB VRAM, roughly 15GB usable)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Model configuration\n",
|
||||
"MODEL_NAME = \"unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit\"\n",
|
||||
"MAX_SEQ_LENGTH = 2048\n",
|
||||
"\n",
|
||||
"# Training configuration for a Colab T4 GPU (16GB VRAM)\n",
|
||||
"TRAINING_CONFIG = {\n",
|
||||
"    'per_device_train_batch_size': 1,\n",
|
||||
"    'gradient_accumulation_steps': 8,  # 1 x 8 = 8 samples per optimizer step\n",
|
||||
"    'max_steps': 100,\n",
|
||||
"    'learning_rate': 2e-4,\n",
|
||||
"    'use_gradient_checkpointing': True,\n",
|
||||
"    # The T4 has no native bfloat16 support, so detect it at runtime instead of\n",
|
||||
"    # hard-coding True; the trainer cell sets fp16 to the negation of this flag.\n",
|
||||
"    'bf16': is_bfloat16_supported(),\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. Load Model\n",
|
||||
"\n",
|
||||
"Load the Qwen model with Unsloth for memory efficiency."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load model and tokenizer\n",
|
||||
"# MODEL_NAME and MAX_SEQ_LENGTH come from the configuration cell; load_in_4bit=True\n",
|
||||
"# requests 4-bit loading, in line with the \"bnb-4bit\" suffix of the checkpoint name.\n",
|
||||
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
||||
" model_name=MODEL_NAME,\n",
|
||||
" max_seq_length=MAX_SEQ_LENGTH,\n",
|
||||
" # NOTE(review): dtype=None presumably lets Unsloth pick the dtype -- confirm\n",
|
||||
" dtype=None,\n",
|
||||
" load_in_4bit=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Configure model for training\n",
|
||||
"# Applies LoRA settings (r=16, lora_alpha=16, no dropout, no bias) to the\n",
|
||||
"# projection modules named in target_modules; random_state fixes the seed used here.\n",
|
||||
"model = FastLanguageModel.get_peft_model(\n",
|
||||
" model,\n",
|
||||
" r=16,\n",
|
||||
" target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
|
||||
" lora_alpha=16,\n",
|
||||
" lora_dropout=0,\n",
|
||||
" bias=\"none\",\n",
|
||||
" use_gradient_checkpointing=TRAINING_CONFIG['use_gradient_checkpointing'],\n",
|
||||
" random_state=3407,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. Process GitHub Repositories\n",
|
||||
"\n",
|
||||
"Extract code from GitHub repositories for training."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def process_github_repo(repo_url, max_files=10):\n",
|
||||
"    \"\"\"Clone a GitHub repository and collect its Python files as text samples.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        repo_url: Clone URL of the repository; a trailing slash or a trailing\n",
|
||||
"            '.git' suffix is tolerated when deriving the repository name.\n",
|
||||
"        max_files: Maximum number of '*.py' files to read (default 10).\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        A datasets.Dataset with 'text', 'repo_name' and 'file_path' columns,\n",
|
||||
"        one row per Python file whose stripped content exceeds 10 characters.\n",
|
||||
"        The dataset is empty when no such file is found.\n",
|
||||
"    \"\"\"\n",
|
||||
"    import tempfile\n",
|
||||
"    from datasets import Dataset\n",
|
||||
"\n",
|
||||
"    # Use the last URL segment as the name and strip only a *trailing* '.git',\n",
|
||||
"    # so names that merely contain '.git' are left intact.\n",
|
||||
"    repo_name = repo_url.rstrip('/').split('/')[-1]\n",
|
||||
"    if repo_name.endswith('.git'):\n",
|
||||
"        repo_name = repo_name[:-len('.git')]\n",
|
||||
"\n",
|
||||
"    code_samples = []\n",
|
||||
"    with tempfile.TemporaryDirectory() as temp_dir:\n",
|
||||
"        repo_path = Path(temp_dir) / repo_name\n",
|
||||
"\n",
|
||||
"        print(f\"Cloning {repo_url}...\")\n",
|
||||
"        # Shallow clone: only the latest snapshot is needed for text extraction.\n",
|
||||
"        git.Repo.clone_from(repo_url, str(repo_path), depth=1)\n",
|
||||
"\n",
|
||||
"        # Sort so the same files are selected on every run.\n",
|
||||
"        py_files = sorted(p for p in repo_path.rglob('*.py') if p.is_file())\n",
|
||||
"\n",
|
||||
"        for py_file in py_files[:max_files]:\n",
|
||||
"            try:\n",
|
||||
"                content = py_file.read_text(encoding='utf-8', errors='ignore')\n",
|
||||
"            except OSError as e:\n",
|
||||
"                # Best effort: report unreadable files and keep going.\n",
|
||||
"                print(f\"Error processing {py_file}: {e}\")\n",
|
||||
"                continue\n",
|
||||
"\n",
|
||||
"            if len(content.strip()) > 10:  # Skip tiny files\n",
|
||||
"                code_samples.append({\n",
|
||||
"                    'text': content,\n",
|
||||
"                    'repo_name': repo_name,\n",
|
||||
"                    'file_path': str(py_file.relative_to(repo_path)),\n",
|
||||
"                })\n",
|
||||
"\n",
|
||||
"    if not code_samples:\n",
|
||||
"        print(f\"Warning: no usable Python files found in {repo_url}\")\n",
|
||||
"    return Dataset.from_list(code_samples)\n",
|
||||
"\n",
|
||||
"# Example usage (replace with your own repositories)\n",
|
||||
"# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 6. Training\n",
|
||||
"\n",
|
||||
"Set up and run the training process."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Fallback toy dataset so the notebook runs end-to-end without your own data\n",
|
||||
"from datasets import Dataset\n",
|
||||
"\n",
|
||||
"# Replace these snippets with your own data\n",
|
||||
"example_snippets = (\n",
|
||||
"    \"def hello_world():\\n print('Hello, World!')\",\n",
|
||||
"    \"class Calculator:\\n def add(self, a, b):\\n return a + b\",\n",
|
||||
"    \"import numpy as np\\n\\narr = np.array([1, 2, 3])\\nprint(arr)\",\n",
|
||||
")\n",
|
||||
"# One row per snippet, stored under the \"text\" column\n",
|
||||
"example_data = [{\"text\": snippet} for snippet in example_snippets]\n",
|
||||
"\n",
|
||||
"dataset = Dataset.from_list(example_data)\n",
|
||||
"print(f\"Example dataset created with {len(dataset)} samples\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Tokenize dataset\n",
|
||||
"def tokenize_function(examples):\n",
|
||||
" \"\"\"Tokenize examples['text'] with the global tokenizer, truncating and padding to 512 tokens.\"\"\"\n",
|
||||
" # Simple tokenization - replace with more sophisticated approach for your use case\n",
|
||||
" # NOTE(review): max_length=512 is hard-coded while MAX_SEQ_LENGTH is 2048, and the trainer cell\n",
|
||||
" # also receives dataset_text_field=\"text\" and packing=True -- confirm which tokenization is actually used.\n",
|
||||
" return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n",
|
||||
"\n",
|
||||
"# batched=True hands tokenize_function batches of rows rather than one row per call\n",
|
||||
"tokenized_dataset = dataset.map(tokenize_function, batched=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Build the training arguments from TRAINING_CONFIG first, then hand them to the trainer\n",
|
||||
"use_bf16 = TRAINING_CONFIG['bf16']\n",
|
||||
"\n",
|
||||
"training_args = TrainingArguments(\n",
|
||||
"    per_device_train_batch_size=TRAINING_CONFIG['per_device_train_batch_size'],\n",
|
||||
"    gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],\n",
|
||||
"    max_steps=TRAINING_CONFIG['max_steps'],\n",
|
||||
"    learning_rate=TRAINING_CONFIG['learning_rate'],\n",
|
||||
"    # Exactly one of fp16 / bf16 is enabled\n",
|
||||
"    fp16=not use_bf16,\n",
|
||||
"    bf16=use_bf16,\n",
|
||||
"    logging_steps=1,\n",
|
||||
"    save_steps=50,\n",
|
||||
"    output_dir=\"./model_output\",\n",
|
||||
"    optim=\"adamw_torch\",\n",
|
||||
"    lr_scheduler_type=\"cosine\",\n",
|
||||
"    warmup_ratio=0.1,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Set up trainer\n",
|
||||
"trainer = SFTTrainer(\n",
|
||||
"    model=model,\n",
|
||||
"    tokenizer=tokenizer,\n",
|
||||
"    train_dataset=tokenized_dataset,\n",
|
||||
"    dataset_text_field=\"text\",\n",
|
||||
"    max_seq_length=MAX_SEQ_LENGTH,\n",
|
||||
"    packing=True,\n",
|
||||
"    args=training_args,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Start training\n",
|
||||
"# Runs the trainer built in the previous cell; it was configured with\n",
|
||||
"# max_steps from TRAINING_CONFIG, logging_steps=1 and save_steps=50.\n",
|
||||
"print(\"Starting training...\")\n",
|
||||
"trainer.train()\n",
|
||||
"print(\"Training completed!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 7. Save Model\n",
|
||||
"\n",
|
||||
"Save the trained model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Write the model first, then the tokenizer, into the same directory\n",
|
||||
"save_dir = \"./trained_model\"\n",
|
||||
"for artifact in (model, tokenizer):\n",
|
||||
"    artifact.save_pretrained(save_dir)\n",
|
||||
"print(\"Model saved successfully!\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.10.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@ -16,7 +16,7 @@ training:
|
||||
max_steps: 50
|
||||
|
||||
# Training parameters
|
||||
num_train_epochs: 3
|
||||
num_train_epochs: 1
|
||||
learning_rate: 2.0e-4
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
|
||||
@ -28,7 +28,7 @@ class TrainingConfig:
|
||||
per_device_train_batch_size: int = 2
|
||||
gradient_accumulation_steps: int = 4
|
||||
max_steps: int = 10
|
||||
num_train_epochs: int = 3
|
||||
num_train_epochs: int = 2
|
||||
learning_rate: float = 2e-4
|
||||
warmup_steps: int = 10
|
||||
logging_steps: int = 1
|
||||
|
||||
Loading…
Reference in New Issue
Block a user