From f39220b192cc4ebdd136f59c4ca1e0a282b56f99 Mon Sep 17 00:00:00 2001
From: Suherdy Yacob
Date: Fri, 22 Aug 2025 23:28:17 +0700
Subject: [PATCH] Fix memory and tokenizer issues for RTX3070 8GB training

---
 configs/training_config.yaml |  22 +++--
 src/config.py                |  20 +++-
 src/dataset_processor.py     |   1 +
 src/main.py                  |  17 ++--
 src/trainer.py               | 174 ++++++++++++++++++++++++++++++-----
 5 files changed, 197 insertions(+), 37 deletions(-)

diff --git a/configs/training_config.yaml b/configs/training_config.yaml
index b049fc5..0bd1316 100644
--- a/configs/training_config.yaml
+++ b/configs/training_config.yaml
@@ -3,7 +3,7 @@ model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30
 
   # Training parameters
@@ -34,7 +34,7 @@ training:
   greater_is_better: false
 
   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading
 
+  # Additional memory optimizations
+  dataloader_drop_last: true
+
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5
 
   # Learning rate scheduler
   lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@
   fp16: false
   tf32: true
 
+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407
diff --git a/src/config.py b/src/config.py
index faf1da6..cdace58 100644
--- a/src/config.py
+++ b/src/config.py
@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None
 
     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1
 
     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True
 
+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
             'training': {
                 'per_device_train_batch_size': self.training.per_device_train_batch_size,
                 'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+                'max_steps': self.training.max_steps,
                 'num_train_epochs': self.training.num_train_epochs,
                 'learning_rate': self.training.learning_rate,
                 'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@
                 'dataloader_num_workers': self.training.dataloader_num_workers,
                 'dataloader_pin_memory': self.training.dataloader_pin_memory,
                 'remove_unused_columns': self.training.remove_unused_columns,
+                'dataloader_drop_last': self.training.dataloader_drop_last,
                 'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                 'offload_to_cpu': self.training.offload_to_cpu,
                 'optim': self.training.optim,
                 'weight_decay': self.training.weight_decay,
+                'adam_beta1': self.training.adam_beta1,
+                'adam_beta2': self.training.adam_beta2,
+                'adam_epsilon': self.training.adam_epsilon,
+                'max_grad_norm': self.training.max_grad_norm,
+                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                 'lr_scheduler_type': self.training.lr_scheduler_type,
                 'warmup_ratio': self.training.warmup_ratio,
                 'bf16': self.training.bf16,
                 'fp16': self.training.fp16,
                 'tf32': self.training.tf32,
                 'dataset_shuffle': self.training.dataset_shuffle,
-                'dataset_seed': self.training.dataset_seed
+                'dataset_seed': self.training.dataset_seed,
+                'torch_compile': self.training.torch_compile
             },
             'dataset': {
                 'min_file_size': self.dataset.min_file_size,
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
index ee7a220..132b3b4 100644
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples
 
         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)
diff --git a/src/main.py b/src/main.py
index 239a415..8832d42 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """
 
+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))
 
-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-
-
 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
diff --git a/src/trainer.py b/src/trainer.py
index 629cd70..f2c95af 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """
 
+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
-
 import torch.nn as nn
 from transformers import (
     AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
 )
 
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported
-
 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
@@ -125,9 +131,18 @@ class ModelTrainer:
             max_seq_length=self.config.model.max_seq_length,
             dtype=None,  # Auto-detect
             load_in_4bit=True,  # Use 4-bit quantization
-            token=None,  # Use default token
+            token=os.environ.get("HF_TOKEN"),  # Read the HF token from the environment; never hardcode credentials
         )
 
+        # Log tokenizer attributes for debugging
+        self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+        self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+        # List all methods containing 'unsloth' for debugging
+        unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+        self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
         # Configure model for training
         self.model = FastLanguageModel.get_peft_model(
             self.model,
@@ -152,31 +167,126 @@ class ModelTrainer:
             raise
 
     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
+
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass
 
         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                        # Fall back to simple formatting so formatted_text is always defined
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )
+
+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # The <|im_start|> marker also opens the user turn, so take its
+                            # last occurrence, which is where the assistant response starts
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant turn
+                                labels[i, :assistant_positions[-1]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+
+            return tokenized
 
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )
 
-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
 
         return tokenized_dataset
 
     def _setup_trainer(self, tokenized_dataset: Dataset):
         """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )
 
         # Data collator
@@ -228,17 +341,32 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
         # Initialize trainer
-        self.trainer = SFTTrainer(
-            model=self.model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
-            data_collator=data_collator,
-            tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
-        )
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
+            self.trainer = SFTTrainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=tokenized_dataset,
+                eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
+                data_collator=data_collator,
+                tokenizer=self.tokenizer,
+                # dataset_text_field="text",
+                # packing=False  # Can make training 5x faster for short sequences.
+            )
+            self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise
 
         self.logger.info("Trainer setup completed")