fix many bugs

Author: Suherdy Yacob
Date:   2025-08-22 23:28:17 +07:00
Parent: dc14dc4c2c
Commit: f39220b192
5 changed files with 197 additions and 37 deletions

View File

@@ -3,7 +3,7 @@
 model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30

   # Training parameters
@@ -34,7 +34,7 @@ training:
   greater_is_better: false

   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading

+  # Additional memory optimizations
+  dataloader_drop_last: true
+
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5

   # Learning rate scheduler
   lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@ training:
   fp16: false
   tf32: true

+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407
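
Note on the batch-size change above: the effective batch per optimizer step is per_device_train_batch_size x gradient_accumulation_steps, so this commit moves from 2 x 4 = 8 to 1 x 16 = 16 samples per step while only one sample at a time resides on the 8 GB GPU. A minimal sketch of the arithmetic (values copied from this diff, variable names illustrative):

    # Effective batch size per optimizer step, before and after this commit.
    old_effective = 2 * 4    # per_device_train_batch_size * gradient_accumulation_steps
    new_effective = 1 * 16
    print(old_effective, new_effective)  # 8 16
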

View File

@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None

     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1

     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True

+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
             'training': {
                 'per_device_train_batch_size': self.training.per_device_train_batch_size,
                 'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+                'max_steps': self.training.max_steps,
                 'num_train_epochs': self.training.num_train_epochs,
                 'learning_rate': self.training.learning_rate,
                 'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
                 'dataloader_num_workers': self.training.dataloader_num_workers,
                 'dataloader_pin_memory': self.training.dataloader_pin_memory,
                 'remove_unused_columns': self.training.remove_unused_columns,
+                'dataloader_drop_last': self.training.dataloader_drop_last,
                 'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                 'offload_to_cpu': self.training.offload_to_cpu,
                 'optim': self.training.optim,
                 'weight_decay': self.training.weight_decay,
+                'adam_beta1': self.training.adam_beta1,
+                'adam_beta2': self.training.adam_beta2,
+                'adam_epsilon': self.training.adam_epsilon,
+                'max_grad_norm': self.training.max_grad_norm,
+                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                 'lr_scheduler_type': self.training.lr_scheduler_type,
                 'warmup_ratio': self.training.warmup_ratio,
                 'bf16': self.training.bf16,
                 'fp16': self.training.fp16,
                 'tf32': self.training.tf32,
                 'dataset_shuffle': self.training.dataset_shuffle,
-                'dataset_seed': self.training.dataset_seed
+                'dataset_seed': self.training.dataset_seed,
+                'torch_compile': self.training.torch_compile
             },
             'dataset': {
                 'min_file_size': self.dataset.min_file_size,
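
The keys added to the serialized 'training' dict above keep it in step with the new TrainingConfig fields; any field left out here would silently drop when the config is dumped and reloaded. A rough consistency check, sketched under the assumption that TrainingConfig is a dataclass as shown in this diff (the helper name is made up):

    from dataclasses import fields

    def missing_training_keys(training_cfg, training_dict):
        # Fields present on the dataclass but absent from the serialized dict.
        return [f.name for f in fields(training_cfg) if f.name not in training_dict]
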

View File

@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples

         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)
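
The new log line sits inside a finally block, so it fires whether the repository was processed cleanly or an exception is still propagating out of the try. A standalone illustration of that behavior (not repo code; the URL is a placeholder):

    import logging
    logging.basicConfig(level=logging.INFO)

    def process(repo_url):
        try:
            raise RuntimeError("clone failed")  # simulate a mid-processing failure
        finally:
            logging.info("Finished processing %s", repo_url)  # still runs

    try:
        process("https://example.com/repo.git")
    except RuntimeError:
        pass  # the log line was emitted before the exception reached here
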

View File

@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """

+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))

-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-

 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
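
Hoisting the os.environ assignments above the project imports is what makes them effective: a module that consults these variables at import time only sees values set before its first import, so setting them after "from trainer import ModelTrainer" (which pulls in unsloth and torch) would be too late. A minimal sketch of the ordering rule (the commented import is hypothetical):

    import os

    # Set the flag first ...
    os.environ['TORCH_COMPILE_DISABLE'] = '1'

    # ... then import anything that may read it during its own import.
    # from trainer import ModelTrainer
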

View File

@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """

+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
+
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
 import torch.nn as nn

 from transformers import (
     AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
 )
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported

 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
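
Two notes on the trainer.py header changes: moving the unsloth import above transformers/trl follows unsloth's guidance to import it first so its patches apply, and PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 asks PyTorch's CUDA caching allocator to avoid large splittable blocks, which can reduce fragmentation-related OOMs on an 8 GB card; like the other variables it has to be set before CUDA is initialized. A quick check that the value is visible to the process (illustrative):

    import os

    # Should print 'max_split_size_mb:128' when run after the module-level assignments.
    print(os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))
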
@@ -125,9 +131,18 @@ class ModelTrainer:
                 max_seq_length=self.config.model.max_seq_length,
                 dtype=None,  # Auto-detect
                 load_in_4bit=True,  # Use 4-bit quantization
-                token=None,  # Use default token
+                token="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",  # Use default token
             )

+            # Log tokenizer attributes for debugging
+            self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+            self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+            self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+            # List all methods containing 'unsloth' for debugging
+            unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+            self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
             # Configure model for training
             self.model = FastLanguageModel.get_peft_model(
                 self.model,
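
One caution on the token= change above: it hardcodes a Hugging Face access token in source control, so that token should be treated as exposed and rotated. A common alternative, sketched here (not part of this commit), is to read it from the environment:

    import os

    # HF_TOKEN is the variable the Hugging Face tooling conventionally reads;
    # None falls back to any cached huggingface-cli login credentials.
    hf_token = os.environ.get("HF_TOKEN")
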
@@ -152,31 +167,126 @@
             raise

     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
+
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass

         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )

+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # Find where the assistant token first appears
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant token
+                                labels[i, :assistant_positions[0]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+            return tokenized
+
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )

-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
         return tokenized_dataset

     def _setup_trainer(self, tokenized_dataset: Dataset):
+        """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")

         # Training arguments optimized for RTX3070 8GB
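
For reference, the fallback branch above produces ChatML-style text of the shape shown below, and the masking step intends to set every position before the assistant turn to -100 so the loss covers only the code. Note, though, that searching for the first occurrence of the first token id of "<|im_start|>assistant" will usually match the user turn's opening <|im_start|> tag, so the mask may end up covering less than intended. A simplified sketch of the intent (hand-written example, not the token-id search from the diff):

    # Shape of one formatted training example (matches the fallback string above).
    example = (
        "<|im_start|>user\n"
        "Analyze and understand the following code:<|im_end|>\n"
        "<|im_start|>assistant\n"
        "def add(a, b):\n    return a + b<|im_end|>"
    )

    # Intended labeling: labels equal input_ids except prompt/padding positions,
    # which become -100 so the cross-entropy loss ignores them, e.g.
    # labels[i, :start_of_assistant_reply] = -100
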
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )

         # Data collator
@@ -228,7 +341,17 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)

+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
         # Initialize trainer
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
         self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
@@ -236,9 +359,14 @@
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
+            # dataset_text_field="text",
+            # packing=False  # Can make training 5x faster for short sequences.
         )
+        self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise

         self.logger.info("Trainer setup completed")