fix many bugs
parent dc14dc4c2c
commit f39220b192
@@ -3,7 +3,7 @@
 model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30
 
   # Training parameters
@@ -34,7 +34,7 @@ training:
   greater_is_better: false
 
   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
 
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading
 
   # Additional memory optimizations
   dataloader_drop_last: true
 
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5
 
   # Learning rate scheduler
   lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@ training:
   fp16: false
   tf32: true
 
+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407
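Review note on the batch-size change above: halving the per-device batch while quadrupling accumulation doubles the effective batch, and the shorter sequence length shrinks the padded token count per step. A quick sanity check in plain Python, with the values copied from this diff:

import os  # not needed for the arithmetic; kept so the snippet runs standalone

per_device_train_batch_size = 1   # was 2
gradient_accumulation_steps = 16  # was 4
max_seq_length = 512              # was 2048

effective_batch = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch)  # 16 sequences per optimizer step (was 2 * 4 = 8)

# With padding="max_length" (see src/trainer.py below), every sequence costs
# max_seq_length positions, so the per-step token budget is:
print(effective_batch * max_seq_length)  # 8192 tokens (was 8 * 2048 = 16384)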
@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None
 
     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1
 
     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True
 
+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
         'training': {
             'per_device_train_batch_size': self.training.per_device_train_batch_size,
             'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+            'max_steps': self.training.max_steps,
             'num_train_epochs': self.training.num_train_epochs,
             'learning_rate': self.training.learning_rate,
             'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
             'dataloader_num_workers': self.training.dataloader_num_workers,
             'dataloader_pin_memory': self.training.dataloader_pin_memory,
             'remove_unused_columns': self.training.remove_unused_columns,
+            'dataloader_drop_last': self.training.dataloader_drop_last,
             'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
             'offload_to_cpu': self.training.offload_to_cpu,
             'optim': self.training.optim,
             'weight_decay': self.training.weight_decay,
             'adam_beta1': self.training.adam_beta1,
             'adam_beta2': self.training.adam_beta2,
             'adam_epsilon': self.training.adam_epsilon,
             'max_grad_norm': self.training.max_grad_norm,
+            'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+            'eval_accumulation_steps': self.training.eval_accumulation_steps,
             'lr_scheduler_type': self.training.lr_scheduler_type,
             'warmup_ratio': self.training.warmup_ratio,
             'bf16': self.training.bf16,
             'fp16': self.training.fp16,
             'tf32': self.training.tf32,
             'dataset_shuffle': self.training.dataset_shuffle,
-            'dataset_seed': self.training.dataset_seed
+            'dataset_seed': self.training.dataset_seed,
+            'torch_compile': self.training.torch_compile
         },
         'dataset': {
             'min_file_size': self.dataset.min_file_size,
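Review note: the YAML above and these dataclass defaults disagree in several places (dataset_seed 3407 vs. 42, lr_scheduler_type "linear" vs. "cosine", max_steps 30 vs. 10), which is only safe if the YAML always wins. A minimal sketch of an override helper that would keep the two from drifting apart silently; apply_yaml_overrides and the section layout are assumptions, not code from this repo:

import yaml
from dataclasses import fields

def apply_yaml_overrides(cfg, yaml_path, section):
    """Overlay YAML values onto a dataclass instance, rejecting unknown keys."""
    with open(yaml_path) as f:
        data = yaml.safe_load(f) or {}
    known = {f.name for f in fields(cfg)}
    for key, value in data.get(section, {}).items():
        if key not in known:
            raise KeyError(f"Unknown {section} key in YAML: {key}")
        setattr(cfg, key, value)
    return cfg

# usage: apply_yaml_overrides(TrainingConfig(), "training_config.yaml", "training")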
@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples
 
         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)
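Review note: with the rmtree call left commented out, every processed repository stays on disk under ./gitclone. That is convenient while debugging repeated runs, but worth re-enabling (or bounding by disk usage) before any long crawl.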
src/main.py (17 changed lines)
@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """
 
+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))
 
-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-
-
 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
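Review note: the import shuffle in this file exists because os.environ assignments only affect code imported after them, so trainer (which pulls in unsloth and torch) has to come after the env block; the second import os further down is now redundant but harmless. A minimal illustration of the pattern, not tied to this repo:

import os
# torch._dynamo honors TORCH_COMPILE_DISABLE; whether the other two variables
# in the diff are read depends on the pinned torch/unsloth versions (assumption).
os.environ['TORCH_COMPILE_DISABLE'] = '1'

import torch  # must come after the env block, or the setting is never seen

print(torch.__version__, os.environ['TORCH_COMPILE_DISABLE'])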
src/trainer.py (174 changed lines)
@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """
 
+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
 
 import torch.nn as nn
 from transformers import (
     AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
 )
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported
-
 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
 
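Review note on the new allocator line: PYTORCH_CUDA_ALLOC_CONF is read when the CUDA caching allocator initializes, so setting it before import torch, as this hunk does, is the only placement that works; max_split_size_mb:128 curbs fragmentation-driven OOMs on an 8 GB card at the cost of some allocator flexibility. A quick way to eyeball the effect (sketch, needs a CUDA build of PyTorch):

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

import torch

if torch.cuda.is_available():
    x = torch.randn(2048, 2048, device='cuda')        # force a real allocation
    print(torch.cuda.memory_summary(abbreviated=True))  # allocator/fragmentation stats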
@@ -125,9 +131,18 @@ class ModelTrainer:
             max_seq_length=self.config.model.max_seq_length,
             dtype=None,  # Auto-detect
             load_in_4bit=True,  # Use 4-bit quantization
-            token=None,  # Use default token
+            token="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",  # Use default token
         )
 
+        # Log tokenizer attributes for debugging
+        self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+        self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+        # List all methods containing 'unsloth' for debugging
+        unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+        self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
         # Configure model for training
         self.model = FastLanguageModel.get_peft_model(
             self.model,
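Review note: two problems with the token change above. A live-looking Hugging Face token is committed in plain text (it should be revoked and rotated), and the retained comment "# Use default token" no longer describes the line. A safer sketch, assuming the token is supplied through an HF_TOKEN environment variable instead of being hardcoded:

import os

from unsloth import FastLanguageModel

hf_token = os.environ.get("HF_TOKEN")  # None locally; set as a secret in CI

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
    max_seq_length=512,
    dtype=None,          # auto-detect
    load_in_4bit=True,
    token=hf_token,      # falls back to cached `huggingface-cli login` credentials when None
)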
@@ -152,31 +167,126 @@ class ModelTrainer:
             raise
 
     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
+
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass
 
         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )
 
+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # Find where the assistant token first appears
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant token
+                                labels[i, :assistant_positions[0]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+            return tokenized
+
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )
 
-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
         return tokenized_dataset
 
     def _setup_trainer(self, tokenized_dataset: Dataset):
         """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
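Review note on the new label masking: it masks up to the first occurrence of assistant_tokens[0], but the first token of "<|im_start|>assistant" is just "<|im_start|>", which also opens the user turn, so the cut can land at the very start of the prompt and mask almost nothing. Separately, the 'unsloth_push_to_hub' AttributeError branch logs but never assigns formatted_text, so the later append would raise NameError. A sketch that matches the full marker token sequence instead; mask_before_assistant is an illustrative helper, not repo code:

import torch

def mask_before_assistant(input_ids, labels, marker_ids):
    """Set labels to -100 up to and including the assistant marker (sketch)."""
    marker = torch.tensor(marker_ids, dtype=input_ids.dtype)
    m = marker.numel()
    for i in range(input_ids.size(0)):
        row = input_ids[i]
        for start in range(row.size(0) - m + 1):
            if torch.equal(row[start:start + m], marker):
                labels[i, :start + m] = -100  # supervise only the assistant reply
                break
    return labels

# usage, with names from the diff above:
# marker_ids = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
# labels = mask_before_assistant(tokenized["input_ids"], labels, marker_ids)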
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )
 
         # Data collator
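Review note: this hunk still passes per_device_train_batch_size as the eval batch size, even though the config now defines per_device_eval_batch_size: 1 and eval_accumulation_steps: 1. Presumably the new fields were meant to be wired through here; as committed, they are dead config and eval runs at the training batch size.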
@@ -228,17 +341,32 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
-        # Initialize trainer
-        self.trainer = SFTTrainer(
-            model=self.model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
-            data_collator=data_collator,
-            tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
-        )
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
+            self.trainer = SFTTrainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=tokenized_dataset,
+                eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
+                data_collator=data_collator,
+                tokenizer=self.tokenizer,
+                # dataset_text_field="text",
+                # packing=False  # Can make training 5x faster for short sequences.
+            )
+            self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise
 
         self.logger.info("Trainer setup completed")
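Two closing observations on the trainer setup. Reusing the training set as eval_dataset (flagged "(for demo)" in the code) means eval loss just tracks training loss and says nothing about generalization; a held-out slice is cheap to carve off. Also, the tokenizer= keyword is accepted by older trl releases but was renamed processing_class in newer ones, so this call is sensitive to the pinned trl version. A sketch of the split, assuming the tokenized_dataset built above:

# datasets.Dataset.train_test_split returns a DatasetDict with "train"/"test"
split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_ds, eval_ds = split["train"], split["test"]
# then: SFTTrainer(..., train_dataset=train_ds, eval_dataset=eval_ds)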