# ai_github_trainer/src/trainer.py
"""
Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
Optimized for RTX3070 8GB VRAM with no CPU offloading
"""
import logging
import os
import gc
import torch
from pathlib import Path
from typing import Optional, Dict, Any
import torch.nn as nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
Trainer,
TrainingArguments,
DataCollatorForLanguageModeling,
DataCollatorForSeq2Seq
)
from trl import SFTConfig, SFTTrainer
from datasets import Dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from config import AppConfig
from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage


class ModelTrainer:
    """Trainer class for fine-tuning the Qwen2.5-Coder model"""

    def __init__(self, config: AppConfig, output_dir: str = "./models"):
        """
        Initialize the model trainer

        Args:
            config: Application configuration
            output_dir: Directory to save the trained model
        """
        self.config = config
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(__name__)
        # Model and tokenizer
        self.model = None
        self.tokenizer = None
        # Training components
        self.trainer = None
        # Memory tracking
        self.initial_memory = None

    def train(self, train_dataset: Dataset) -> str:
        """
        Train the model on the provided dataset

        Args:
            train_dataset: Dataset for training

        Returns:
            Path to the saved model
        """
        try:
            self.logger.info("Starting model training...")
            # Check initial GPU memory
            self._check_initial_setup()
            # Load model and tokenizer
            self._load_model_and_tokenizer()
            # Prepare dataset
            tokenized_dataset = self._prepare_dataset(train_dataset)
            # Setup trainer
            self._setup_trainer(tokenized_dataset)
            # Start training
            self.logger.info("Beginning training loop...")
            self.trainer.train()
            # Save final model
            final_model_path = self._save_model()
            self.logger.info(f"Training completed successfully! Model saved to: {final_model_path}")
            return str(final_model_path)
        except Exception as e:
            self.logger.error(f"Training failed: {str(e)}")
            raise
        finally:
            self._cleanup()

    def _check_initial_setup(self):
        """Check initial GPU memory and setup"""
        gpu_info = check_gpu_memory()
        self.logger.info(f"GPU Memory Info: {gpu_info}")
        # Store initial memory usage
        self.initial_memory = get_memory_usage()
        self.logger.info(f"Initial memory usage: {self.initial_memory:.2f}")
        # Verify CUDA availability
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available. This trainer requires a CUDA-compatible GPU.")
        self.logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
        self.logger.info(f"CUDA version: {torch.version.cuda}")

    def _load_model_and_tokenizer(self):
        """Load the model and tokenizer with memory optimization"""
        self.logger.info(f"Loading model: {self.config.model.name}")
        # Clear cache before loading
        clear_gpu_cache()
        try:
            # Load model with unsloth for memory efficiency
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.config.model.name,
                max_seq_length=self.config.model.max_seq_length,
                dtype=None,  # Auto-detect
                load_in_4bit=True,  # Use 4-bit quantization
                token=None,  # Use default token
            )
            # Configure model for LoRA training
            self.model = FastLanguageModel.get_peft_model(
                self.model,
                r=16,  # LoRA rank
                target_modules=[
                    "q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",
                ],
                lora_alpha=16,
                lora_dropout=0,  # Supports any value, but 0 is optimized
                bias="none",  # Supports any value, but "none" is optimized
                use_gradient_checkpointing=self.config.training.use_gradient_checkpointing,
                random_state=3407,
                use_rslora=False,  # Rank-stabilized LoRA is also supported
                loftq_config=None,  # And LoftQ
            )
            self.logger.info("Model and tokenizer loaded successfully")
        except Exception as e:
            self.logger.error(f"Failed to load model: {str(e)}")
            raise
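
    # Optional helper (illustrative sketch, not called anywhere in the original flow):
    # after get_peft_model(), only the LoRA adapter weights should require gradients.
    def _log_trainable_parameters(self):
        """Log how many parameters LoRA leaves trainable (illustrative helper)."""
        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in self.model.parameters())
        self.logger.info(
            f"Trainable parameters: {trainable:,} / {total:,} "
            f"({100.0 * trainable / total:.2f}%)"
        )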

    def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
        """Prepare and tokenize the dataset"""
        self.logger.info("Preparing dataset...")

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=self.config.model.max_seq_length,
                return_tensors="pt",
            )

        # Tokenize dataset and drop the raw metadata columns
        tokenized_dataset = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
            desc="Tokenizing dataset",
        )
        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
        return tokenized_dataset
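
    # Note (assumption about collator behavior, not from the original code):
    # DataCollatorForSeq2Seq only pads a "labels" column that already exists; it does
    # not create one. If a causal-LM loss is needed from these pre-tokenized batches
    # and the installed trl version does not add labels itself, one option is to copy
    # input_ids into labels after tokenization, e.g.:
    #
    #   tokenized_dataset = tokenized_dataset.map(
    #       lambda batch: {"labels": batch["input_ids"]}, batched=True
    #   )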

    def _setup_trainer(self, tokenized_dataset: Dataset):
        """Setup the HuggingFace trainer with memory optimizations"""
        self.logger.info("Setting up trainer...")
        # Training arguments optimized for RTX 3070 8GB
        training_args = SFTConfig(
            output_dir=str(self.output_dir / "checkpoints"),
            num_train_epochs=self.config.training.num_train_epochs,
            per_device_train_batch_size=self.config.training.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.training.gradient_accumulation_steps,
            learning_rate=self.config.training.learning_rate,
            warmup_steps=self.config.training.warmup_steps,
            warmup_ratio=self.config.training.warmup_ratio,
            logging_steps=self.config.training.logging_steps,
            save_steps=self.config.training.save_steps,
            save_total_limit=self.config.training.save_total_limit,
            eval_strategy=self.config.training.eval_strategy,
            eval_steps=self.config.training.eval_steps,
            load_best_model_at_end=self.config.training.load_best_model_at_end,
            metric_for_best_model=self.config.training.metric_for_best_model,
            greater_is_better=self.config.training.greater_is_better,
            optim=self.config.training.optim,
            weight_decay=self.config.training.weight_decay,
            lr_scheduler_type=self.config.training.lr_scheduler_type,
            adam_beta1=self.config.training.adam_beta1,
            adam_beta2=self.config.training.adam_beta2,
            adam_epsilon=self.config.training.adam_epsilon,
            max_grad_norm=self.config.training.max_grad_norm,
            dataloader_num_workers=self.config.training.dataloader_num_workers,
            dataloader_pin_memory=self.config.training.dataloader_pin_memory,
            remove_unused_columns=self.config.training.remove_unused_columns,
            bf16=self.config.training.bf16 if is_bfloat16_supported() else False,
            fp16=self.config.training.fp16,
            tf32=self.config.training.tf32,
            report_to=self.config.training.report_to,
            logging_dir=self.config.training.logging_dir,
            seed=self.config.training.dataset_seed,
            data_seed=self.config.training.dataset_seed,
            dataloader_drop_last=True,  # Better memory management
            gradient_checkpointing=self.config.training.use_gradient_checkpointing,
            # Memory optimization settings
            ddp_find_unused_parameters=False,
            per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
            max_steps=getattr(self.config.training, "max_steps", 10),
        )
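        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps;
        # accumulation raises the effective batch without increasing per-step VRAM use,
        # which is the main lever for staying inside 8 GB on an RTX 3070.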
        # Data collator
        # data_collator = DataCollatorForLanguageModeling(
        #     tokenizer=self.tokenizer,
        #     mlm=False  # Causal language modeling
        # )
        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
        # Initialize trainer
        self.trainer = SFTTrainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_dataset,
            eval_dataset=tokenized_dataset,  # Using the same dataset for eval (demo only)
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            dataset_text_field="text",
            packing=False,  # Setting this to True can make training ~5x faster for short sequences
        )
        self.logger.info("Trainer setup completed")

    def _save_model(self) -> Path:
        """Save the trained model"""
        self.logger.info("Saving model...")
        # Create final model directory
        final_model_dir = self.output_dir / "final_model"
        final_model_dir.mkdir(parents=True, exist_ok=True)
        try:
            # Save the model and tokenizer
            self.model.save_pretrained(str(final_model_dir))
            self.tokenizer.save_pretrained(str(final_model_dir))
            # Save configuration
            self.config.save_yaml(final_model_dir / "training_config.yaml")
            self.logger.info(f"Model saved to: {final_model_dir}")
            return final_model_dir
        except Exception as e:
            self.logger.error(f"Failed to save model: {str(e)}")
            raise
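
    # Note (sketch, depends on the installed unsloth version): save_pretrained() on a
    # PEFT-wrapped model stores only the LoRA adapter weights. Unsloth also exposes
    # helpers for exporting merged weights, e.g.:
    #
    #   self.model.save_pretrained_merged(str(final_model_dir / "merged"), self.tokenizer)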

    def _cleanup(self):
        """Clean up resources"""
        try:
            # Drop references to the trainer, model and tokenizer so they can be freed
            self.trainer = None
            self.model = None
            self.tokenizer = None
            # Force garbage collection, then release cached GPU memory
            gc.collect()
            clear_gpu_cache()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            self.logger.warning(f"Error during cleanup: {str(e)}")