Fix multiple training bugs: memory settings for an 8 GB RTX 3070, torch compilation, and Qwen2.5-Coder dataset tokenization

Suherdy Yacob 2025-08-22 23:28:17 +07:00
parent dc14dc4c2c
commit f39220b192
5 changed files with 197 additions and 37 deletions

View File

@@ -3,7 +3,7 @@
model:
name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
max_seq_length: 2048
max_seq_length: 512
trust_remote_code: true
use_fast_tokenizer: true
padding_side: "left"
@@ -11,8 +11,8 @@ model:
training:
# Memory-optimized batch size for RTX3070 8GB
per_device_train_batch_size: 2
gradient_accumulation_steps: 4
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
max_steps: 30
# Training parameters
@@ -34,7 +34,7 @@ training:
greater_is_better: false
# Data loading
dataloader_num_workers: 2
dataloader_num_workers: 0 # Temporarily disabled for debugging
dataloader_pin_memory: true
remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
use_gradient_checkpointing: true
offload_to_cpu: false # Explicitly no CPU offloading
# Additional memory optimizations
dataloader_drop_last: true
# Aggressive memory settings for 8GB GPU
per_device_eval_batch_size: 1
eval_accumulation_steps: 1
# Optimizer settings
optim: "paged_adamw_8bit"
optim: "adamw_torch"
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-8
max_grad_norm: 1.0
max_grad_norm: 0.5
# Learning rate scheduler
lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@ training:
fp16: false
tf32: true
# Disable torch compilation to avoid generator tracing issues
torch_compile: false
# Dataset settings
dataset_shuffle: true
dataset_seed: 3407
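With these new values, the effective optimizer batch size doubles from 8 (2 x 4) to 16 (1 x 16), while only a single 512-token sequence is resident on the GPU at a time. A minimal sketch of reading the effective batch size back out of the YAML, assuming the file above is saved as config.yaml with only the new values:

# Sketch: derive the effective batch size from the training block of the YAML
# above. Assumes the file is saved as config.yaml and contains only the new values.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["training"]
effective_batch = (
    train["per_device_train_batch_size"] * train["gradient_accumulation_steps"]
)
print(f"effective batch size: {effective_batch}")  # 1 * 16 = 16 (was 2 * 4 = 8)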

View File

@@ -27,6 +27,7 @@ class TrainingConfig:
"""Training configuration"""
per_device_train_batch_size: int = 2
gradient_accumulation_steps: int = 4
max_steps: int = 10
num_train_epochs: int = 3
learning_rate: float = 2e-4
warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
dataloader_num_workers: int = 2
dataloader_pin_memory: bool = True
remove_unused_columns: bool = False
dataloader_drop_last: bool = True
label_names: List[str] = None
# Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
adam_beta1: float = 0.9
adam_beta2: float = 0.999
adam_epsilon: float = 1e-8
max_grad_norm: float = 1.0
max_grad_norm: float = 0.5
per_device_eval_batch_size: int = 1
eval_accumulation_steps: int = 1
# Learning rate scheduler
lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
fp16: bool = False
tf32: bool = True
# Compilation settings
torch_compile: bool = False
# Dataset processing
dataset_shuffle: bool = True
dataset_seed: int = 42
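Every field added here must stay in sync with the YAML keys above. One hedged way to keep that coupling loose is to build the dataclass from the YAML dict while ignoring unknown keys; the sketch below uses an abridged stand-in for TrainingConfig, not the full class:

# Sketch: construct a TrainingConfig-like dataclass from a YAML dict, dropping
# keys the dataclass does not define. The class here is an abridged stand-in.
from dataclasses import dataclass, fields

@dataclass
class TrainingConfigStub:
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    max_steps: int = 10
    max_grad_norm: float = 0.5
    torch_compile: bool = False

def from_yaml_dict(cls, data: dict):
    allowed = {f.name for f in fields(cls)}
    return cls(**{k: v for k, v in data.items() if k in allowed})

cfg = from_yaml_dict(TrainingConfigStub, {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "torch_compile": False,
    "not_a_field": "silently ignored",
})
print(cfg)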
@@ -181,6 +188,7 @@ class AppConfig:
'training': {
'per_device_train_batch_size': self.training.per_device_train_batch_size,
'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
'max_steps': self.training.max_steps,
'num_train_epochs': self.training.num_train_epochs,
'learning_rate': self.training.learning_rate,
'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@ class AppConfig:
'dataloader_num_workers': self.training.dataloader_num_workers,
'dataloader_pin_memory': self.training.dataloader_pin_memory,
'remove_unused_columns': self.training.remove_unused_columns,
'dataloader_drop_last': self.training.dataloader_drop_last,
'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
'offload_to_cpu': self.training.offload_to_cpu,
'optim': self.training.optim,
'weight_decay': self.training.weight_decay,
'adam_beta1': self.training.adam_beta1,
'adam_beta2': self.training.adam_beta2,
'adam_epsilon': self.training.adam_epsilon,
'max_grad_norm': self.training.max_grad_norm,
'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
'eval_accumulation_steps': self.training.eval_accumulation_steps,
'lr_scheduler_type': self.training.lr_scheduler_type,
'warmup_ratio': self.training.warmup_ratio,
'bf16': self.training.bf16,
'fp16': self.training.fp16,
'tf32': self.training.tf32,
'dataset_shuffle': self.training.dataset_shuffle,
'dataset_seed': self.training.dataset_seed
'dataset_seed': self.training.dataset_seed,
'torch_compile': self.training.torch_compile
},
'dataset': {
'min_file_size': self.dataset.min_file_size,
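The missing trailing comma fixed above (after 'dataset_seed') is a typical hazard of a hand-written to_dict mapping. If the config classes are plain dataclasses, dataclasses.asdict can produce the same nested dict without listing each field; a sketch under that assumption (the real AppConfig may hold state that needs the explicit mapping):

# Sketch: nested dataclass serialization via asdict; the classes here are
# abridged stand-ins for the real AppConfig/TrainingConfig.
from dataclasses import asdict, dataclass, field

@dataclass
class TrainingStub:
    max_steps: int = 30
    torch_compile: bool = False

@dataclass
class AppConfigStub:
    training: TrainingStub = field(default_factory=TrainingStub)

print(asdict(AppConfigStub()))
# -> {'training': {'max_steps': 30, 'torch_compile': False}}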

View File

@@ -178,6 +178,7 @@ class DatasetProcessor:
return code_samples
finally:
self.logger.info(f"Finished processing {repo_url}")
# Clean up temporary directories, but keep the gitclone folder
# if temp_dir != "./gitclone":
# shutil.rmtree(temp_dir, ignore_errors=True)
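With the rmtree call commented out, cloned repositories now persist between runs. A minimal sketch of cleanup that spares a persistent ./gitclone cache while still removing true temporary checkouts (function and parameter names are illustrative, not from this repo):

# Sketch: remove per-repo temp checkouts but never the persistent ./gitclone cache.
import shutil
from pathlib import Path

def cleanup_checkout(temp_dir: str, persistent_cache: str = "./gitclone") -> None:
    target = Path(temp_dir).resolve()
    cache = Path(persistent_cache).resolve()
    if target == cache or cache in target.parents:
        return  # keep anything inside the clone cache
    shutil.rmtree(target, ignore_errors=True)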

View File

@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
"""
# Disable torch compilation before importing any modules
import os
os.environ['DISABLE_TORCH_COMPILE'] = '1'
os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
from trainer import ModelTrainer
from dataset_processor import DatasetProcessor
from config import AppConfig
from utils import setup_logging, check_gpu_memory
import argparse
import logging
import os
@@ -13,12 +24,6 @@ from pathlib import Path
# Add src to path for imports
sys.path.append(str(Path(__file__).parent))
from trainer import ModelTrainer
from dataset_processor import DatasetProcessor
from config import AppConfig
from utils import setup_logging, check_gpu_memory
def parse_arguments():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
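The import block was moved below the os.environ assignments because variables that gate torch compilation are typically read when torch/unsloth modules are first imported, so they must be set first. A sketch of that ordering (the variable names are copied from this commit; whether each one is honored by unsloth or torch is an assumption of that code, not verified here):

# Sketch: prepare the environment, then import. A lazy-import helper keeps the
# ordering explicit even if a formatter reorders top-level imports.
import importlib
import os

os.environ["DISABLE_TORCH_COMPILE"] = "1"
os.environ["UNSLOTH_DISABLE_COMPILE"] = "1"
os.environ["TORCH_COMPILE_DISABLE"] = "1"

def lazy_import(name: str):
    """Import a module only after the environment has been prepared."""
    return importlib.import_module(name)

# trainer_mod = lazy_import("trainer")  # e.g. the src/trainer module used above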

View File

@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
Optimized for RTX3070 8GB VRAM with no CPU offloading
"""
# Disable torch compilation before importing any modules that might use it
import os
os.environ['DISABLE_TORCH_COMPILE'] = '1'
os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
import logging
import os
import gc
import torch
from pathlib import Path
from typing import Optional, Dict, Any
import torch.nn as nn
from transformers import (
AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
)
from trl import SFTConfig, SFTTrainer
from datasets import Dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from config import AppConfig
from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
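PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 tunes the CUDA caching allocator's block-splitting behaviour to reduce fragmentation on an 8 GB card, and it has to be set before the first CUDA allocation. The utils helpers imported here are not shown in this diff; a sketch of what such helpers commonly look like (illustrative stand-ins, not the repo's utils.py):

# Sketch of GPU-memory helpers like clear_gpu_cache/get_memory_usage; these are
# stand-ins, since utils.py is not part of this diff.
import gc
import torch

def clear_gpu_cache() -> None:
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def get_memory_usage() -> dict:
    if not torch.cuda.is_available():
        return {"allocated_gb": 0.0, "reserved_gb": 0.0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1024**3,
        "reserved_gb": torch.cuda.memory_reserved() / 1024**3,
    }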
@@ -125,9 +131,18 @@ class ModelTrainer:
max_seq_length=self.config.model.max_seq_length,
dtype=None, # Auto-detect
load_in_4bit=True, # Use 4-bit quantization
token=None, # Use default token
token="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", # Use default token
)
# Log tokenizer attributes for debugging
self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
# List all methods containing 'unsloth' for debugging
unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
# Configure model for training
self.model = FastLanguageModel.get_peft_model(
self.model,
@@ -152,31 +167,126 @@
raise
def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
"""Prepare and tokenize the dataset"""
self.logger.info("Preparing dataset...")
"""Prepare and tokenize the dataset for Qwen2.5-Coder"""
self.logger.info("Preparing dataset for Qwen2.5-Coder...")
# Apply chat template for Qwen2.5-Coder if available
try:
chat_template = get_chat_template("qwen")
if chat_template and isinstance(chat_template, str):
self.tokenizer.chat_template = chat_template
self.logger.info("Applied Qwen chat template from string")
else:
self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
except Exception as e:
self.logger.warning(f"Could not apply Qwen chat template: {e}")
# Fallback to default formatting
pass
def tokenize_function(examples):
return self.tokenizer(
examples["text"],
# Format examples as instruction-following pairs for code training
formatted_texts = []
for text in examples["text"]:
# Create an instruction format appropriate for code training
# For Qwen2.5-Coder, we can use a code completion or analysis format
messages = [
{"role": "user", "content": "Analyze and understand the following code:"},
{"role": "assistant", "content": text}
]
# Apply chat template if available, otherwise use simple formatting
try:
# Log tokenizer state in multiprocessing context
import multiprocessing
self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
if hasattr(self.tokenizer, 'chat_template'):
self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
# Check if tokenizer has apply_chat_template method
if hasattr(self.tokenizer, 'apply_chat_template'):
formatted_text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False # We're training on the full conversation
)
else:
self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
except AttributeError as e:
if 'unsloth_push_to_hub' in str(e):
self.logger.error(f"AttributeError in multiprocessing context: {e}")
self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
elif 'padding_side' in str(e):
self.logger.warning(f"Chat template padding_side error: {e}")
self.logger.warning("Using fallback formatting due to chat template issue")
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
else:
raise
except Exception as e:
self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
# Fallback to simple formatting with special tokens
formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
formatted_texts.append(formatted_text)
# Tokenize with proper padding and truncation for Qwen2.5-Coder
tokenized = self.tokenizer(
formatted_texts,
padding="max_length",
truncation=True,
max_length=self.config.model.max_seq_length,
return_tensors="pt"
return_tensors="pt",
add_special_tokens=True
)
# For causal language modeling, we need to create proper labels
# Clone input_ids to create labels
labels = tokenized["input_ids"].clone()
# Try to mask the user part of the conversation
# Find the assistant token to determine where the assistant response starts
try:
# Convert to string to find the assistant token
decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
for i, decoded in enumerate(decoded_tokens):
# Find where the assistant response starts
assistant_start = decoded.find("<|im_start|>assistant")
if assistant_start != -1:
# Find the actual token position
# We'll mask everything before the assistant response with -100
assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
if len(assistant_tokens) > 0:
# Find where the assistant token first appears
assistant_token_id = assistant_tokens[0]
assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
if len(assistant_positions) > 0:
# Mask everything before the assistant token
labels[i, :assistant_positions[0]] = -100
except Exception as e:
self.logger.warning(f"Could not mask user tokens: {e}")
# Fallback: Just use the input_ids as labels
pass
tokenized["labels"] = labels
return tokenized
# Tokenize dataset
tokenized_dataset = train_dataset.map(
tokenize_function,
batched=True,
remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
desc="Tokenizing dataset"
desc="Tokenizing dataset for Qwen2.5-Coder"
)
self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
return tokenized_dataset
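The label-masking loop above searches for the first token id of the encoded string "<|im_start|>assistant"; if the tokenizer treats <|im_start|> as a single special token (as Qwen2.5's does), that id also opens the user turn, so the mask can stop too early. A hedged sketch that matches the full marker sequence instead, shown for a single pre-tokenized example:

# Sketch: set labels to -100 up to and including the assistant marker so only
# the completion contributes to the loss. 'assistant_marker_ids' would come from
# tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"].
import torch

IGNORE_INDEX = -100

def mask_prompt(input_ids: torch.Tensor, assistant_marker_ids: list) -> torch.Tensor:
    labels = input_ids.clone()
    marker = torch.tensor(assistant_marker_ids, dtype=input_ids.dtype)
    for start in range(len(input_ids) - len(marker) + 1):
        if torch.equal(input_ids[start:start + len(marker)], marker):
            labels[:start + len(marker)] = IGNORE_INDEX
            break
    # If the marker is never found, the whole sequence keeps its labels,
    # matching the fallback behaviour of the code above.
    return labels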
def _setup_trainer(self, tokenized_dataset: Dataset):
"""Setup the HuggingFace trainer with memory optimizations"""
self.logger.info("Setting up trainer...")
# Training arguments optimized for RTX3070 8GB
@@ -218,7 +328,10 @@ class ModelTrainer:
# Memory optimization settings
ddp_find_unused_parameters=False,
per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
# Explicitly disable torch compilation
torch_compile=False,
torch_compile_backend=None
)
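The two new keyword arguments turn compilation off at the trainer level in addition to the environment variables. For reference, a minimal sketch of the memory-relevant settings from this commit expressed as plain transformers.TrainingArguments (output_dir is a placeholder; the bf16/packing details of the real SFTConfig are omitted):

# Sketch: the memory-oriented settings from this commit in isolation; values
# mirror the YAML above, output_dir is illustrative.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    max_steps=30,
    max_grad_norm=0.5,
    optim="adamw_torch",
    gradient_checkpointing=True,
    tf32=True,
    torch_compile=False,
    dataloader_num_workers=0,
    dataloader_drop_last=True,
)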
# Data collator
@@ -228,17 +341,32 @@ class ModelTrainer:
# )
data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
# Log data collator setup for debugging
self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
# Initialize trainer
self.trainer = SFTTrainer(
model=self.model,
args=training_args,
train_dataset=tokenized_dataset,
eval_dataset=tokenized_dataset, # Using same dataset for eval (for demo)
data_collator=data_collator,
tokenizer=self.tokenizer,
dataset_text_field="text",
packing=False # Can make training 5x faster for short sequences.
)
self.logger.info("Initializing SFTTrainer...")
# Environment variables for torch compilation are set at module level
self.logger.info("Torch compilation environment variables set at module level")
try:
self.trainer = SFTTrainer(
model=self.model,
args=training_args,
train_dataset=tokenized_dataset,
eval_dataset=tokenized_dataset, # Using same dataset for eval (for demo)
data_collator=data_collator,
tokenizer=self.tokenizer,
# dataset_text_field="text",
# packing=False # Can make training 5x faster for short sequences.
)
self.logger.info("SFTTrainer initialized successfully")
except Exception as e:
self.logger.error(f"Failed to initialize SFTTrainer: {e}")
self.logger.error(f"Error type: {type(e)}")
raise
self.logger.info("Trainer setup completed")