fix many bugs

parent dc14dc4c2c
commit f39220b192
@@ -3,7 +3,7 @@
 
 model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30
 
   # Training parameters
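The two batch settings move in opposite directions on purpose: peak activation memory follows the micro-batch, while gradient quality follows the effective batch. A quick check of the arithmetic:

```python
# Effective batch size on one GPU = micro-batch x gradient accumulation steps
old_effective = 2 * 4    # 8 samples per optimizer step
new_effective = 1 * 16   # 16 samples per optimizer step
# The micro-batch halves (so per-forward VRAM drops), yet the effective
# batch doubles; only wall-clock time per optimizer step gets worse.
```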
@@ -34,7 +34,7 @@ training:
   greater_is_better: false
 
   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
 
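Dropping to `dataloader_num_workers: 0` keeps batch preparation in the training process itself: nothing (tokenizer included) has to be pickled into worker processes, and any failure surfaces with a direct stack trace rather than a worker traceback. The same knob at the PyTorch level, as a runnable sketch:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(8))
# num_workers=0: slower input pipeline, but no subprocess pickling of the
# dataset/tokenizer and much easier debugging, which is the point here.
loader = DataLoader(dataset, batch_size=1, num_workers=0, pin_memory=True)
```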
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading
 
+  # Additional memory optimizations
+  dataloader_drop_last: true
+
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5
 
   # Learning rate scheduler
   lr_scheduler_type: "linear"
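The optimizer swap trades VRAM bookkeeping for predictability: `paged_adamw_8bit` (bitsandbytes' paged 8-bit AdamW) stores optimizer state quantized and pages it under memory pressure, while `adamw_torch` is the stock full-precision PyTorch AdamW. With LoRA only the adapter weights are trainable, so full-precision state stays affordable; a rough estimate, with the parameter count as an assumption:

```python
# AdamW keeps two fp32 moment tensors per trainable parameter.
trainable_params = 40_000_000   # assumed ballpark for a LoRA adapter on a 7B model
bytes_per_param = 2 * 4         # exp_avg + exp_avg_sq, fp32
print(f"~{trainable_params * bytes_per_param / 2**20:.0f} MiB optimizer state")  # ~305 MiB
```

Halving `max_grad_norm` to 0.5 clips gradients more aggressively, a common stabilizer when the effective batch and sequence length change at the same time.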
@@ -58,6 +65,9 @@ training:
   fp16: false
   tf32: true
 
+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407

@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None
 
     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1
 
     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True
 
+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
             'training': {
                 'per_device_train_batch_size': self.training.per_device_train_batch_size,
                 'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+                'max_steps': self.training.max_steps,
                 'num_train_epochs': self.training.num_train_epochs,
                 'learning_rate': self.training.learning_rate,
                 'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
                 'dataloader_num_workers': self.training.dataloader_num_workers,
                 'dataloader_pin_memory': self.training.dataloader_pin_memory,
                 'remove_unused_columns': self.training.remove_unused_columns,
+                'dataloader_drop_last': self.training.dataloader_drop_last,
                 'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                 'offload_to_cpu': self.training.offload_to_cpu,
                 'optim': self.training.optim,
                 'weight_decay': self.training.weight_decay,
+                'adam_beta1': self.training.adam_beta1,
+                'adam_beta2': self.training.adam_beta2,
+                'adam_epsilon': self.training.adam_epsilon,
+                'max_grad_norm': self.training.max_grad_norm,
+                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                 'lr_scheduler_type': self.training.lr_scheduler_type,
                 'warmup_ratio': self.training.warmup_ratio,
                 'bf16': self.training.bf16,
                 'fp16': self.training.fp16,
                 'tf32': self.training.tf32,
                 'dataset_shuffle': self.training.dataset_shuffle,
-                'dataset_seed': self.training.dataset_seed
+                'dataset_seed': self.training.dataset_seed,
+                'torch_compile': self.training.torch_compile
             },
             'dataset': {
                 'min_file_size': self.dataset.min_file_size,
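Every new `TrainingConfig` field must be mirrored by hand in this dict, which is presumably how `max_steps`, the Adam betas, and `torch_compile` went missing before this commit. A drift-proof sketch, assuming the serialized keys should simply track the dataclass fields:

```python
from dataclasses import asdict

def to_dict(self):
    # New TrainingConfig fields are picked up automatically.
    return {
        'training': asdict(self.training),
        'dataset': asdict(self.dataset),
    }
```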

@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples
 
         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)

src/main.py (17 changed lines)
@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """
 
+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))
 
-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-
-
 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
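One ordering caveat: the heavy imports now run before the `sys.path.append(...)` call below them, so they resolve only because invoking `python src/main.py` already puts `src/` on `sys.path`. A sketch that keeps both guarantees, using the same names as the diff:

```python
# Set the compile-disabling flags before anything imports torch/unsloth.
import os
os.environ['DISABLE_TORCH_COMPILE'] = '1'
os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'

# Make src/ importable first, then pull in the modules that need the flags.
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))

from trainer import ModelTrainer
from dataset_processor import DatasetProcessor
from config import AppConfig
from utils import setup_logging, check_gpu_memory
```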

src/trainer.py (174 changed lines)
@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """
 
+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
 
 import torch.nn as nn
 from transformers import (
     AutoModelForCausalLM,
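`PYTORCH_CUDA_ALLOC_CONF` only takes effect if it is set before CUDA initializes, which is why it sits above the unsloth import. What the chosen value buys on an 8 GB card, annotated:

```python
import os
# max_split_size_mb:128 stops PyTorch's caching allocator from splitting
# blocks larger than 128 MiB. Fewer odd-sized fragments means fewer
# "CUDA out of memory (reserved but unallocated)" failures, at some cost
# in allocation flexibility.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
```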
@@ -21,8 +29,6 @@ from transformers import (
 )
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported
-
 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
 
@@ -125,9 +131,18 @@ class ModelTrainer:
             max_seq_length=self.config.model.max_seq_length,
             dtype=None,  # Auto-detect
             load_in_4bit=True,  # Use 4-bit quantization
-            token=None,  # Use default token
+            token="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",  # Use default token
         )
 
+        # Log tokenizer attributes for debugging
+        self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+        self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+        # List all methods containing 'unsloth' for debugging
+        unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+        self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
         # Configure model for training
         self.model = FastLanguageModel.get_peft_model(
             self.model,
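A caution on the `token=` change: the hardcoded value is a live-looking Hugging Face credential committed to source, and it contradicts the `# Use default token` comment it carries. The conventional pattern, as a sketch (not what the commit does):

```python
import os

# Read the token from the environment; None falls back to the credential
# cached by `huggingface-cli login`.
hf_token = os.environ.get("HF_TOKEN")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
    load_in_4bit=True,
    token=hf_token,
)
```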
@@ -152,31 +167,126 @@ class ModelTrainer:
             raise
 
     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
 
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass
+
         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )
 
+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # Find where the assistant token first appears
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant token
+                                labels[i, :assistant_positions[0]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+            return tokenized
+
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )
 
-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
         return tokenized_dataset
 
     def _setup_trainer(self, tokenized_dataset: Dataset):
-        """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
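The `-100` sentinel works because `torch.nn.CrossEntropyLoss` defaults to `ignore_index=-100`, so the masked user-turn positions contribute nothing to the loss. One gap worth flagging: with `padding="max_length"`, pad tokens are cloned into `labels` and still get scored. A sketch of the extra masking, assuming `tokenized` and `labels` as in the hunk above:

```python
# Mirror what DataCollatorForLanguageModeling does automatically:
# pad positions should not contribute to the loss either.
pad_id = self.tokenizer.pad_token_id
labels[tokenized["input_ids"] == pad_id] = -100  # ignored by CrossEntropyLoss
```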
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )
 
         # Data collator
@@ -228,17 +341,32 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
         # Initialize trainer
-        self.trainer = SFTTrainer(
-            model=self.model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
-            data_collator=data_collator,
-            tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
-        )
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
+            self.trainer = SFTTrainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=tokenized_dataset,
+                eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
+                data_collator=data_collator,
+                tokenizer=self.tokenizer,
+                # dataset_text_field="text",
+                # packing=False  # Can make training 5x faster for short sequences.
+            )
+            self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise
 
         self.logger.info("Trainer setup completed")
 
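Commenting out `dataset_text_field` and `packing` is consistent with the new `_prepare_dataset`: those arguments ask `SFTTrainer` to tokenize a raw text column itself, whereas this pipeline now hands it a pre-tokenized dataset with `input_ids` and `labels`. The two modes, sketched against the older `trl` API this diff uses (`model`, `training_args`, and the datasets are assumed from the surrounding code):

```python
# Mode 1: SFTTrainer tokenizes a raw "text" column itself; custom label
# masking like the hunk above is not possible this way.
trainer = SFTTrainer(model=model, args=training_args, train_dataset=raw_dataset,
                     tokenizer=tokenizer, dataset_text_field="text", packing=False)

# Mode 2 (this commit): pass pre-tokenized features plus a collator, and
# leave dataset_text_field/packing unset.
trainer = SFTTrainer(model=model, args=training_args, train_dataset=tokenized_dataset,
                     data_collator=data_collator, tokenizer=tokenizer)
```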