"""
|
|
Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
|
|
Optimized for RTX3070 8GB VRAM with no CPU offloading
|
|
"""

import logging
import os
import gc
import torch
from pathlib import Path
from typing import Optional, Dict, Any

# Unsloth is imported before transformers/trl so its patches are applied.
from unsloth import FastLanguageModel, is_bfloat16_supported

import torch.nn as nn
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
)
from trl import SFTConfig, SFTTrainer
from datasets import Dataset

from config import AppConfig
from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage


class ModelTrainer:
    """Trainer class for fine-tuning the Qwen2.5-Coder model."""

    def __init__(self, config: AppConfig, output_dir: str = "./models"):
        """
        Initialize the model trainer.

        Args:
            config: Application configuration
            output_dir: Directory to save the trained model
        """
        self.config = config
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.logger = logging.getLogger(__name__)

        # Model and tokenizer
        self.model = None
        self.tokenizer = None

        # Training components
        self.trainer = None

        # Memory tracking
        self.initial_memory = None

    def train(self, train_dataset: Dataset) -> str:
        """
        Train the model on the provided dataset.

        Args:
            train_dataset: Dataset for training

        Returns:
            Path to the saved model
        """
        try:
            self.logger.info("Starting model training...")

            # Check initial GPU memory
            self._check_initial_setup()

            # Load model and tokenizer
            self._load_model_and_tokenizer()

            # Prepare dataset
            tokenized_dataset = self._prepare_dataset(train_dataset)

            # Setup trainer
            self._setup_trainer(tokenized_dataset)

            # Start training
            self.logger.info("Beginning training loop...")
            self.trainer.train()

            # Save final model
            final_model_path = self._save_model()

            self.logger.info(f"Training completed successfully! Model saved to: {final_model_path}")
            return str(final_model_path)

        except Exception as e:
            self.logger.error(f"Training failed: {str(e)}")
            raise
        finally:
            self._cleanup()

    def _check_initial_setup(self):
        """Check initial GPU memory and setup."""
        gpu_info = check_gpu_memory()
        self.logger.info(f"GPU Memory Info: {gpu_info}")

        # Store initial memory usage
        self.initial_memory = get_memory_usage()
        self.logger.info(f"Initial memory usage: {self.initial_memory:.2f}")

        # Verify CUDA availability
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available. This trainer requires a CUDA-compatible GPU.")

        self.logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
        self.logger.info(f"CUDA version: {torch.version.cuda}")

    def _load_model_and_tokenizer(self):
        """Load the model and tokenizer with memory optimization."""
        self.logger.info(f"Loading model: {self.config.model.name}")

        # Clear cache before loading
        clear_gpu_cache()

        try:
            # Load the base model with unsloth for memory efficiency
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.config.model.name,
                max_seq_length=self.config.model.max_seq_length,
                dtype=None,  # Auto-detect
                load_in_4bit=True,  # Use 4-bit quantization
                token=None,  # Use the default HF token
            )

            # Attach LoRA adapters for parameter-efficient training
            self.model = FastLanguageModel.get_peft_model(
                self.model,
                r=16,  # LoRA rank
                target_modules=[
                    "q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",
                ],
                lora_alpha=16,
                lora_dropout=0,  # Any value is supported, but 0 is optimized
                bias="none",  # Any value is supported, but "none" is optimized
                use_gradient_checkpointing=self.config.training.use_gradient_checkpointing,
                random_state=3407,
                use_rslora=False,  # Rank-stabilized LoRA is supported but disabled here
                loftq_config=None,  # LoftQ is supported but not used
            )

            self.logger.info("Model and tokenizer loaded successfully")

        except Exception as e:
            self.logger.error(f"Failed to load model: {str(e)}")
            raise

    def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
        """Prepare and tokenize the dataset."""
        self.logger.info("Preparing dataset...")

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=self.config.model.max_seq_length,
            )

        # Tokenize the dataset and drop the raw metadata columns
        tokenized_dataset = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
            desc="Tokenizing dataset",
        )

        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
        return tokenized_dataset

    def _setup_trainer(self, tokenized_dataset: Dataset):
        """Set up the SFT trainer with memory optimizations."""
        self.logger.info("Setting up trainer...")

        # Training arguments optimized for an RTX 3070 (8 GB VRAM).
        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
        training_args = SFTConfig(
            output_dir=str(self.output_dir / "checkpoints"),
            num_train_epochs=self.config.training.num_train_epochs,
            per_device_train_batch_size=self.config.training.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.training.gradient_accumulation_steps,
            learning_rate=self.config.training.learning_rate,
            warmup_steps=self.config.training.warmup_steps,
            warmup_ratio=self.config.training.warmup_ratio,
            logging_steps=self.config.training.logging_steps,
            save_steps=self.config.training.save_steps,
            save_total_limit=self.config.training.save_total_limit,
            eval_strategy=self.config.training.eval_strategy,
            eval_steps=self.config.training.eval_steps,
            load_best_model_at_end=self.config.training.load_best_model_at_end,
            metric_for_best_model=self.config.training.metric_for_best_model,
            greater_is_better=self.config.training.greater_is_better,
            optim=self.config.training.optim,
            weight_decay=self.config.training.weight_decay,
            lr_scheduler_type=self.config.training.lr_scheduler_type,
            adam_beta1=self.config.training.adam_beta1,
            adam_beta2=self.config.training.adam_beta2,
            adam_epsilon=self.config.training.adam_epsilon,
            max_grad_norm=self.config.training.max_grad_norm,
            dataloader_num_workers=self.config.training.dataloader_num_workers,
            dataloader_pin_memory=self.config.training.dataloader_pin_memory,
            remove_unused_columns=self.config.training.remove_unused_columns,
            bf16=self.config.training.bf16 if is_bfloat16_supported() else False,
            fp16=self.config.training.fp16,
            tf32=self.config.training.tf32,
            report_to=self.config.training.report_to,
            logging_dir=self.config.training.logging_dir,
            seed=self.config.training.dataset_seed,
            data_seed=self.config.training.dataset_seed,
            dataloader_drop_last=True,  # Better memory management
            gradient_checkpointing=self.config.training.use_gradient_checkpointing,
            # Memory optimization settings
            ddp_find_unused_parameters=False,
            per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
        )

        # Data collator
        # data_collator = DataCollatorForLanguageModeling(
        #     tokenizer=self.tokenizer,
        #     mlm=False  # Causal language modeling
        # )
        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)

        # Initialize trainer
        self.trainer = SFTTrainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_dataset,
            eval_dataset=tokenized_dataset,  # Using the same dataset for eval (demo only)
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            dataset_text_field="text",
            packing=False,  # Packing can speed up training on short sequences; disabled here
        )

        self.logger.info("Trainer setup completed")

    def _save_model(self) -> Path:
        """Save the trained model."""
        self.logger.info("Saving model...")

        # Create final model directory
        final_model_dir = self.output_dir / "final_model"
        final_model_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Save the model and tokenizer
            self.model.save_pretrained(str(final_model_dir))
            self.tokenizer.save_pretrained(str(final_model_dir))

            # Save configuration
            self.config.save_yaml(final_model_dir / "training_config.yaml")

            self.logger.info(f"Model saved to: {final_model_dir}")
            return final_model_dir

        except Exception as e:
            self.logger.error(f"Failed to save model: {str(e)}")
            raise

    def _cleanup(self):
        """Clean up resources."""
        try:
            # Clear GPU cache
            clear_gpu_cache()

            # Force garbage collection
            gc.collect()

            # Delete model and tokenizer to free memory
            if self.model is not None:
                del self.model
            if self.tokenizer is not None:
                del self.tokenizer
            if self.trainer is not None:
                del self.trainer

            # Final memory cleanup
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            self.logger.warning(f"Error during cleanup: {str(e)}")