Fix multiple training bugs: memory settings for an 8 GB RTX 3070, torch compilation, and Qwen2.5-Coder dataset tokenization

Suherdy Yacob 2025-08-22 23:28:17 +07:00
parent dc14dc4c2c
commit f39220b192
5 changed files with 197 additions and 37 deletions

View File

@@ -3,7 +3,7 @@
model:
name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
max_seq_length: 2048
max_seq_length: 512
trust_remote_code: true
use_fast_tokenizer: true
padding_side: "left"
@@ -11,8 +11,8 @@ model:
training:
# Memory-optimized batch size for RTX3070 8GB
per_device_train_batch_size: 2
gradient_accumulation_steps: 4
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
max_steps: 30
# Training parameters
@@ -34,7 +34,7 @@ training:
greater_is_better: false
# Data loading
dataloader_num_workers: 2
dataloader_num_workers: 0 # Temporarily disabled for debugging
dataloader_pin_memory: true
remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
use_gradient_checkpointing: true
offload_to_cpu: false # Explicitly no CPU offloading
# Additional memory optimizations
dataloader_drop_last: true
# Aggressive memory settings for 8GB GPU
per_device_eval_batch_size: 1
eval_accumulation_steps: 1
# Optimizer settings
optim: "paged_adamw_8bit"
optim: "adamw_torch"
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-8
max_grad_norm: 1.0
max_grad_norm: 0.5
# Learning rate scheduler
lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@ training:
fp16: false
tf32: true
# Disable torch compilation to avoid generator tracing issues
torch_compile: false
# Dataset settings
dataset_shuffle: true
dataset_seed: 3407
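With these new values, the effective optimizer batch size doubles from 8 (2 x 4) to 16 (1 x 16), while only a single 512-token sequence is resident on the GPU at a time. A minimal sketch of reading the effective batch size back out of the YAML, assuming the file above is saved as config.yaml with only the new values:

# Sketch: derive the effective batch size from the training block of the YAML
# above. Assumes the file is saved as config.yaml and contains only the new values.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["training"]
effective_batch = (
    train["per_device_train_batch_size"] * train["gradient_accumulation_steps"]
)
print(f"effective batch size: {effective_batch}")  # 1 * 16 = 16 (was 2 * 4 = 8)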

View File

@@ -27,6 +27,7 @@ class TrainingConfig:
"""Training configuration"""
per_device_train_batch_size: int = 2
gradient_accumulation_steps: int = 4
max_steps: int = 10
num_train_epochs: int = 3
learning_rate: float = 2e-4
warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
dataloader_num_workers: int = 2
dataloader_pin_memory: bool = True
remove_unused_columns: bool = False
dataloader_drop_last: bool = True
label_names: List[str] = None
# Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
adam_beta1: float = 0.9
adam_beta2: float = 0.999
adam_epsilon: float = 1e-8
max_grad_norm: float = 1.0
max_grad_norm: float = 0.5
per_device_eval_batch_size: int = 1
eval_accumulation_steps: int = 1
# Learning rate scheduler
lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
fp16: bool = False
tf32: bool = True
# Compilation settings
torch_compile: bool = False
# Dataset processing
dataset_shuffle: bool = True
dataset_seed: int = 42
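Every field added here must stay in sync with the YAML keys above. One hedged way to keep that coupling loose is to build the dataclass from the YAML dict while ignoring unknown keys; the sketch below uses an abridged stand-in for TrainingConfig, not the full class:

# Sketch: construct a TrainingConfig-like dataclass from a YAML dict, dropping
# keys the dataclass does not define. The class here is an abridged stand-in.
from dataclasses import dataclass, fields

@dataclass
class TrainingConfigStub:
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    max_steps: int = 10
    max_grad_norm: float = 0.5
    torch_compile: bool = False

def from_yaml_dict(cls, data: dict):
    allowed = {f.name for f in fields(cls)}
    return cls(**{k: v for k, v in data.items() if k in allowed})

cfg = from_yaml_dict(TrainingConfigStub, {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "torch_compile": False,
    "not_a_field": "silently ignored",
})
print(cfg)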
@@ -181,6 +188,7 @@ class AppConfig:
'training': {
'per_device_train_batch_size': self.training.per_device_train_batch_size,
'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
'max_steps': self.training.max_steps,
'num_train_epochs': self.training.num_train_epochs,
'learning_rate': self.training.learning_rate,
'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
'dataloader_num_workers': self.training.dataloader_num_workers,
'dataloader_pin_memory': self.training.dataloader_pin_memory,
'remove_unused_columns': self.training.remove_unused_columns,
'dataloader_drop_last': self.training.dataloader_drop_last,
'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
'offload_to_cpu': self.training.offload_to_cpu,
'optim': self.training.optim,
'weight_decay': self.training.weight_decay,
'adam_beta1': self.training.adam_beta1,
'adam_beta2': self.training.adam_beta2,
'adam_epsilon': self.training.adam_epsilon,
'max_grad_norm': self.training.max_grad_norm,
'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
'eval_accumulation_steps': self.training.eval_accumulation_steps,
'lr_scheduler_type': self.training.lr_scheduler_type,
'warmup_ratio': self.training.warmup_ratio,
'bf16': self.training.bf16,
'fp16': self.training.fp16,
'tf32': self.training.tf32,
'dataset_shuffle': self.training.dataset_shuffle,
'dataset_seed': self.training.dataset_seed
'dataset_seed': self.training.dataset_seed,
'torch_compile': self.training.torch_compile
},
'dataset': {
'min_file_size': self.dataset.min_file_size,
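The missing trailing comma fixed above (after 'dataset_seed') is a typical hazard of a hand-written to_dict mapping. If the config classes are plain dataclasses, dataclasses.asdict can produce the same nested dict without listing each field; a sketch under that assumption (the real AppConfig may hold state that needs the explicit mapping):

# Sketch: nested dataclass serialization via asdict; the classes here are
# abridged stand-ins for the real AppConfig/TrainingConfig.
from dataclasses import asdict, dataclass, field

@dataclass
class TrainingStub:
    max_steps: int = 30
    torch_compile: bool = False

@dataclass
class AppConfigStub:
    training: TrainingStub = field(default_factory=TrainingStub)

print(asdict(AppConfigStub()))
# -> {'training': {'max_steps': 30, 'torch_compile': False}}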

View File

@@ -178,6 +178,7 @@ class DatasetProcessor:
return code_samples
finally:
self.logger.info(f"Finished processing {repo_url}")
# Clean up temporary directories, but keep the gitclone folder
# if temp_dir != "./gitclone":
# shutil.rmtree(temp_dir, ignore_errors=True)
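With the rmtree call commented out, cloned repositories now persist between runs. A minimal sketch of cleanup that spares a persistent ./gitclone cache while still removing true temporary checkouts (function and parameter names are illustrative, not from this repo):

# Sketch: remove per-repo temp checkouts but never the persistent ./gitclone cache.
import shutil
from pathlib import Path

def cleanup_checkout(temp_dir: str, persistent_cache: str = "./gitclone") -> None:
    target = Path(temp_dir).resolve()
    cache = Path(persistent_cache).resolve()
    if target == cache or cache in target.parents:
        return  # keep anything inside the clone cache
    shutil.rmtree(target, ignore_errors=True)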

View File

@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
"""
# Disable torch compilation before importing any modules
import os
os.environ['DISABLE_TORCH_COMPILE'] = '1'
os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
from trainer import ModelTrainer
from dataset_processor import DatasetProcessor
from config import AppConfig
from utils import setup_logging, check_gpu_memory
import argparse
import logging
import os
@@ -13,12 +24,6 @@ from pathlib import Path
# Add src to path for imports
sys.path.append(str(Path(__file__).parent))
from trainer import ModelTrainer
from dataset_processor import DatasetProcessor
from config import AppConfig
from utils import setup_logging, check_gpu_memory
def parse_arguments():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
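The import block was moved below the os.environ assignments because variables that gate torch compilation are typically read when torch/unsloth modules are first imported, so they must be set first. A sketch of that ordering (the variable names are copied from this commit; whether each one is honored by unsloth or torch is an assumption of that code, not verified here):

# Sketch: prepare the environment, then import. A lazy-import helper keeps the
# ordering explicit even if a formatter reorders top-level imports.
import importlib
import os

os.environ["DISABLE_TORCH_COMPILE"] = "1"
os.environ["UNSLOTH_DISABLE_COMPILE"] = "1"
os.environ["TORCH_COMPILE_DISABLE"] = "1"

def lazy_import(name: str):
    """Import a module only after the environment has been prepared."""
    return importlib.import_module(name)

# trainer_mod = lazy_import("trainer")  # e.g. the src/trainer module used above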

View File

@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
Optimized for RTX3070 8GB VRAM with no CPU offloading
"""
# Disable torch compilation before importing any modules that might use it
import os
os.environ['DISABLE_TORCH_COMPILE'] = '1'
os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
import logging
import os
import gc
import torch
from pathlib import Path
from typing import Optional, Dict, Any
import torch.nn as nn
from transformers import (
AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
)
from trl import SFTConfig, SFTTrainer
from datasets import Dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from config import AppConfig
from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
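PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 tunes the CUDA caching allocator's block-splitting behaviour to reduce fragmentation on an 8 GB card, and it has to be set before the first CUDA allocation. The utils helpers imported here are not shown in this diff; a sketch of what such helpers commonly look like (illustrative stand-ins, not the repo's utils.py):

# Sketch of GPU-memory helpers like clear_gpu_cache/get_memory_usage; these are
# stand-ins, since utils.py is not part of this diff.
import gc
import torch

def clear_gpu_cache() -> None:
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def get_memory_usage() -> dict:
    if not torch.cuda.is_available():
        return {"allocated_gb": 0.0, "reserved_gb": 0.0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1024**3,
        "reserved_gb": torch.cuda.memory_reserved() / 1024**3,
    }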
@@ -125,9 +131,18 @@ class ModelTrainer:
max_seq_length=self.config.model.max_seq_length,
dtype=None, # Auto-detect
load_in_4bit=True, # Use 4-bit quantization
token=None, # Use default token
token="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", # Use default token
)
# Log tokenizer attributes for debugging
self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
# List all methods containing 'unsloth' for debugging
unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
# Configure model for training
self.model = FastLanguageModel.get_peft_model(
self.model,
@@ -152,31 +167,126 @@
raise
def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
"""Prepare and tokenize the dataset"""
self.logger.info("Preparing dataset...")
"""Prepare and tokenize the dataset for Qwen2.5-Coder"""
self.logger.info("Preparing dataset for Qwen2.5-Coder...")
# Apply chat template for Qwen2.5-Coder if available
try:
chat_template = get_chat_template("qwen")
if chat_template and isinstance(chat_template, str):
self.tokenizer.chat_template = chat_template
self.logger.info("Applied Qwen chat template from string")
else:
self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
except Exception as e:
self.logger.warning(f"Could not apply Qwen chat template: {e}")
# Fallback to default formatting
pass
def tokenize_function(examples):
return self.tokenizer(
examples["text"],
# Format examples as instruction-following pairs for code training
formatted_texts = []
for text in examples["text"]:
# Create an instruction format appropriate for code training
# For Qwen2.5-Coder, we can use a code completion or analysis format
messages = [
{"role": "user", "content": "Analyze and understand the following code:"},
{"role": "assistant", "content": text}
]
# Apply chat template if available, otherwise use simple formatting
try:
# Log tokenizer state in multiprocessing context
import multiprocessing
self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
if hasattr(self.tokenizer, 'chat_template'):
self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
# Check if tokenizer has apply_chat_template method
if hasattr(self.tokenizer, 'apply_chat_template'):
formatted_text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False # We're training on the full conversation
)
else:
self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
except AttributeError as e:
if 'unsloth_push_to_hub' in str(e):
self.logger.error(f"AttributeError in multiprocessing context: {e}")
self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
elif 'padding_side' in str(e):
self.logger.warning(f"Chat template padding_side error: {e}")
self.logger.warning("Using fallback formatting due to chat template issue")
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
else:
raise
except Exception as e:
self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
# Fallback to simple formatting with special tokens
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
formatted_texts.append(formatted_text)
# Tokenize with proper padding and truncation for Qwen2.5-Coder
tokenized = self.tokenizer(
formatted_texts,
padding="max_length",
truncation=True,
max_length=self.config.model.max_seq_length,
return_tensors="pt"
return_tensors="pt",
add_special_tokens=True
)
# For causal language modeling, we need to create proper labels
# Clone input_ids to create labels
labels = tokenized["input_ids"].clone()
# Try to mask the user part of the conversation
# Find the assistant token to determine where the assistant response starts
try:
# Convert to string to find the assistant token
decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
for i, decoded in enumerate(decoded_tokens):
# Find where the assistant response starts
assistant_start = decoded.find("<|im_start|>assistant")
if assistant_start != -1:
# Find the actual token position
# We'll mask everything before the assistant response with -100
assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
if len(assistant_tokens) > 0:
# Find where the assistant token first appears
assistant_token_id = assistant_tokens[0]
assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
if len(assistant_positions) > 0:
# Mask everything before the assistant token
labels[i, :assistant_positions[0]] = -100
except Exception as e:
self.logger.warning(f"Could not mask user tokens: {e}")
# Fallback: Just use the input_ids as labels
pass
tokenized["labels"] = labels
return tokenized
# Tokenize dataset
tokenized_dataset = train_dataset.map(
tokenize_function,
batched=True,
remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
desc="Tokenizing dataset"
desc="Tokenizing dataset for Qwen2.5-Coder"
)
self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
return tokenized_dataset
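The label-masking loop above searches for the first token id of the encoded string "<|im_start|>assistant"; if the tokenizer treats <|im_start|> as a single special token (as Qwen2.5's does), that id also opens the user turn, so the mask can stop too early. A hedged sketch that matches the full marker sequence instead, shown for a single pre-tokenized example:

# Sketch: set labels to -100 up to and including the assistant marker so only
# the completion contributes to the loss. 'assistant_marker_ids' would come from
# tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"].
import torch

IGNORE_INDEX = -100

def mask_prompt(input_ids: torch.Tensor, assistant_marker_ids: list) -> torch.Tensor:
    labels = input_ids.clone()
    marker = torch.tensor(assistant_marker_ids, dtype=input_ids.dtype)
    for start in range(len(input_ids) - len(marker) + 1):
        if torch.equal(input_ids[start:start + len(marker)], marker):
            labels[:start + len(marker)] = IGNORE_INDEX
            break
    # If the marker is never found, the whole sequence keeps its labels,
    # matching the fallback behaviour of the code above.
    return labels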
def _setup_trainer(self, tokenized_dataset: Dataset):
"""Setup the HuggingFace trainer with memory optimizations"""
self.logger.info("Setting up trainer...")
# Training arguments optimized for RTX3070 8GB
@@ -218,7 +328,10 @@ class ModelTrainer:
# Memory optimization settings
ddp_find_unused_parameters=False,
per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
# Explicitly disable torch compilation
torch_compile=False,
torch_compile_backend=None
)
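The two new keyword arguments turn compilation off at the trainer level in addition to the environment variables. For reference, a minimal sketch of the memory-relevant settings from this commit expressed as plain transformers.TrainingArguments (output_dir is a placeholder; the bf16/packing details of the real SFTConfig are omitted):

# Sketch: the memory-oriented settings from this commit in isolation; values
# mirror the YAML above, output_dir is illustrative.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    max_steps=30,
    max_grad_norm=0.5,
    optim="adamw_torch",
    gradient_checkpointing=True,
    tf32=True,
    torch_compile=False,
    dataloader_num_workers=0,
    dataloader_drop_last=True,
)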
# Data collator
@@ -228,17 +341,32 @@ class ModelTrainer:
# )
data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
# Log data collator setup for debugging
self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
# Initialize trainer
self.trainer = SFTTrainer(
model=self.model,
args=training_args,
train_dataset=tokenized_dataset,
eval_dataset=tokenized_dataset, # Using same dataset for eval (for demo)
data_collator=data_collator,
tokenizer=self.tokenizer,
dataset_text_field="text",
packing=False # Can make training 5x faster for short sequences.
)
self.logger.info("Initializing SFTTrainer...")
# Environment variables for torch compilation are set at module level
self.logger.info("Torch compilation environment variables set at module level")
try:
self.trainer = SFTTrainer(
model=self.model,
args=training_args,
train_dataset=tokenized_dataset,
eval_dataset=tokenized_dataset, # Using same dataset for eval (for demo)
data_collator=data_collator,
tokenizer=self.tokenizer,
# dataset_text_field="text",
# packing=False # Can make training 5x faster for short sequences.
)
self.logger.info("SFTTrainer initialized successfully")
except Exception as e:
self.logger.error(f"Failed to initialize SFTTrainer: {e}")
self.logger.error(f"Error type: {type(e)}")
raise
self.logger.info("Trainer setup completed")