fix many bugs

Author: Suherdy Yacob
Date:   2025-08-22 23:28:17 +07:00
Parent: dc14dc4c2c
Commit: f39220b192
5 changed files with 197 additions and 37 deletions

View File

@@ -3,7 +3,7 @@
 model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30

   # Training parameters
@@ -34,7 +34,7 @@ training:
   greater_is_better: false

   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading

+  # Additional memory optimizations
+  dataloader_drop_last: true
+
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5

   # Learning rate scheduler
   lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@ training:
   fp16: false
   tf32: true

+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407
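
Note on the batch-size change above: the effective batch per optimizer step is per_device_train_batch_size x gradient_accumulation_steps, so this commit moves from 2 x 4 = 8 to 1 x 16 = 16 samples per step while only one sample at a time resides on the 8 GB GPU. A minimal sketch of the arithmetic (values copied from this diff, variable names illustrative):

    # Effective batch size per optimizer step, before and after this commit.
    old_effective = 2 * 4    # per_device_train_batch_size * gradient_accumulation_steps
    new_effective = 1 * 16
    print(old_effective, new_effective)  # 8 16
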

View File

@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None

     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1

     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True

+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
             'training': {
                 'per_device_train_batch_size': self.training.per_device_train_batch_size,
                 'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+                'max_steps': self.training.max_steps,
                 'num_train_epochs': self.training.num_train_epochs,
                 'learning_rate': self.training.learning_rate,
                 'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
                 'dataloader_num_workers': self.training.dataloader_num_workers,
                 'dataloader_pin_memory': self.training.dataloader_pin_memory,
                 'remove_unused_columns': self.training.remove_unused_columns,
+                'dataloader_drop_last': self.training.dataloader_drop_last,
                 'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                 'offload_to_cpu': self.training.offload_to_cpu,
                 'optim': self.training.optim,
                 'weight_decay': self.training.weight_decay,
+                'adam_beta1': self.training.adam_beta1,
+                'adam_beta2': self.training.adam_beta2,
+                'adam_epsilon': self.training.adam_epsilon,
+                'max_grad_norm': self.training.max_grad_norm,
+                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                 'lr_scheduler_type': self.training.lr_scheduler_type,
                 'warmup_ratio': self.training.warmup_ratio,
                 'bf16': self.training.bf16,
                 'fp16': self.training.fp16,
                 'tf32': self.training.tf32,
                 'dataset_shuffle': self.training.dataset_shuffle,
-                'dataset_seed': self.training.dataset_seed
+                'dataset_seed': self.training.dataset_seed,
+                'torch_compile': self.training.torch_compile
             },
             'dataset': {
                 'min_file_size': self.dataset.min_file_size,
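
The keys added to the serialized 'training' dict above keep it in step with the new TrainingConfig fields; any field left out here would silently drop when the config is dumped and reloaded. A rough consistency check, sketched under the assumption that TrainingConfig is a dataclass as shown in this diff (the helper name is made up):

    from dataclasses import fields

    def missing_training_keys(training_cfg, training_dict):
        # Fields present on the dataclass but absent from the serialized dict.
        return [f.name for f in fields(training_cfg) if f.name not in training_dict]
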

View File

@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples

         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)
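
The new log line sits inside a finally block, so it fires whether the repository was processed cleanly or an exception is still propagating out of the try. A standalone illustration of that behavior (not repo code; the URL is a placeholder):

    import logging
    logging.basicConfig(level=logging.INFO)

    def process(repo_url):
        try:
            raise RuntimeError("clone failed")  # simulate a mid-processing failure
        finally:
            logging.info("Finished processing %s", repo_url)  # still runs

    try:
        process("https://example.com/repo.git")
    except RuntimeError:
        pass  # the log line was emitted before the exception reached here
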

View File

@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """

+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))

-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-

 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
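
Hoisting the os.environ assignments above the project imports is what makes them effective: a module that consults these variables at import time only sees values set before its first import, so setting them after "from trainer import ModelTrainer" (which pulls in unsloth and torch) would be too late. A minimal sketch of the ordering rule (the commented import is hypothetical):

    import os

    # Set the flag first ...
    os.environ['TORCH_COMPILE_DISABLE'] = '1'

    # ... then import anything that may read it during its own import.
    # from trainer import ModelTrainer
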

View File

@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """

+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
+
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
 import torch.nn as nn

 from transformers import (
     AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
 )
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported

 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
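
Two notes on the trainer.py header changes: moving the unsloth import above transformers/trl follows unsloth's guidance to import it first so its patches apply, and PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 asks PyTorch's CUDA caching allocator to avoid large splittable blocks, which can reduce fragmentation-related OOMs on an 8 GB card; like the other variables it has to be set before CUDA is initialized. A quick check that the value is visible to the process (illustrative):

    import os

    # Should print 'max_split_size_mb:128' when run after the module-level assignments.
    print(os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))
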
@@ -125,9 +131,18 @@ class ModelTrainer:
                 max_seq_length=self.config.model.max_seq_length,
                 dtype=None,  # Auto-detect
                 load_in_4bit=True,  # Use 4-bit quantization
-                token=None,  # Use default token
+                token="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",  # Use default token
             )

+            # Log tokenizer attributes for debugging
+            self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+            self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+            self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+            # List all methods containing 'unsloth' for debugging
+            unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+            self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
             # Configure model for training
             self.model = FastLanguageModel.get_peft_model(
                 self.model,
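
One caution on the token= change above: it hardcodes a Hugging Face access token in source control, so that token should be treated as exposed and rotated. A common alternative, sketched here (not part of this commit), is to read it from the environment:

    import os

    # HF_TOKEN is the variable the Hugging Face tooling conventionally reads;
    # None falls back to any cached huggingface-cli login credentials.
    hf_token = os.environ.get("HF_TOKEN")
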
@@ -152,31 +167,126 @@
             raise

     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
+
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass

         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )

+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # Find where the assistant token first appears
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant token
+                                labels[i, :assistant_positions[0]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+            return tokenized
+
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )

-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
         return tokenized_dataset

     def _setup_trainer(self, tokenized_dataset: Dataset):
+        """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")

         # Training arguments optimized for RTX3070 8GB
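
For reference, the fallback branch above produces ChatML-style text of the shape shown below, and the masking step intends to set every position before the assistant turn to -100 so the loss covers only the code. Note, though, that searching for the first occurrence of the first token id of "<|im_start|>assistant" will usually match the user turn's opening <|im_start|> tag, so the mask may end up covering less than intended. A simplified sketch of the intent (hand-written example, not the token-id search from the diff):

    # Shape of one formatted training example (matches the fallback string above).
    example = (
        "<|im_start|>user\n"
        "Analyze and understand the following code:<|im_end|>\n"
        "<|im_start|>assistant\n"
        "def add(a, b):\n    return a + b<|im_end|>"
    )

    # Intended labeling: labels equal input_ids except prompt/padding positions,
    # which become -100 so the cross-entropy loss ignores them, e.g.
    # labels[i, :start_of_assistant_reply] = -100
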
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )

         # Data collator
@@ -228,7 +341,17 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)

+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
         # Initialize trainer
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
         self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
@@ -236,9 +359,14 @@
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
+            # dataset_text_field="text",
+            # packing=False  # Can make training 5x faster for short sequences.
         )
+        self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise

         self.logger.info("Trainer setup completed")