From f39220b192cc4ebdd136f59c4ca1e0a282b56f99 Mon Sep 17 00:00:00 2001
From: Suherdy Yacob
Date: Fri, 22 Aug 2025 23:28:17 +0700
Subject: [PATCH] Fix memory and tokenizer issues for RTX3070 8GB training

---
 configs/training_config.yaml |  22 +++--
 src/config.py                |  20 +++-
 src/dataset_processor.py     |   1 +
 src/main.py                  |  17 ++--
 src/trainer.py               | 174 ++++++++++++++++++++++++++++++-----
 5 files changed, 197 insertions(+), 37 deletions(-)

diff --git a/configs/training_config.yaml b/configs/training_config.yaml
index b049fc5..0bd1316 100644
--- a/configs/training_config.yaml
+++ b/configs/training_config.yaml
@@ -3,7 +3,7 @@ model:
   name: "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
-  max_seq_length: 2048
+  max_seq_length: 512
   trust_remote_code: true
   use_fast_tokenizer: true
   padding_side: "left"
@@ -11,8 +11,8 @@ model:
 training:
   # Memory-optimized batch size for RTX3070 8GB
-  per_device_train_batch_size: 2
-  gradient_accumulation_steps: 4
+  per_device_train_batch_size: 1
+  gradient_accumulation_steps: 16
   max_steps: 30
 
   # Training parameters
@@ -34,7 +34,7 @@ training:
   greater_is_better: false
 
   # Data loading
-  dataloader_num_workers: 2
+  dataloader_num_workers: 0  # Temporarily disabled for debugging
   dataloader_pin_memory: true
   remove_unused_columns: false
@@ -42,13 +42,20 @@ training:
   use_gradient_checkpointing: true
   offload_to_cpu: false  # Explicitly no CPU offloading
 
+  # Additional memory optimizations
+  dataloader_drop_last: true
+
+  # Aggressive memory settings for 8GB GPU
+  per_device_eval_batch_size: 1
+  eval_accumulation_steps: 1
+
   # Optimizer settings
-  optim: "paged_adamw_8bit"
+  optim: "adamw_torch"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
   adam_epsilon: 1.0e-8
-  max_grad_norm: 1.0
+  max_grad_norm: 0.5
 
   # Learning rate scheduler
   lr_scheduler_type: "linear"
@@ -58,6 +65,9 @@
   fp16: false
   tf32: true
 
+  # Disable torch compilation to avoid generator tracing issues
+  torch_compile: false
+
   # Dataset settings
   dataset_shuffle: true
   dataset_seed: 3407
diff --git a/src/config.py b/src/config.py
index faf1da6..cdace58 100644
--- a/src/config.py
+++ b/src/config.py
@@ -27,6 +27,7 @@ class TrainingConfig:
     """Training configuration"""
     per_device_train_batch_size: int = 2
     gradient_accumulation_steps: int = 4
+    max_steps: int = 10
     num_train_epochs: int = 3
     learning_rate: float = 2e-4
     warmup_steps: int = 10
@@ -41,6 +42,7 @@ class TrainingConfig:
     dataloader_num_workers: int = 2
     dataloader_pin_memory: bool = True
     remove_unused_columns: bool = False
+    dataloader_drop_last: bool = True
     label_names: List[str] = None
 
     # Memory optimization for RTX3070 8GB
@@ -55,7 +57,9 @@ class TrainingConfig:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     adam_epsilon: float = 1e-8
-    max_grad_norm: float = 1.0
+    max_grad_norm: float = 0.5
+    per_device_eval_batch_size: int = 1
+    eval_accumulation_steps: int = 1
 
     # Learning rate scheduler
     lr_scheduler_type: str = "cosine"
@@ -66,6 +70,9 @@ class TrainingConfig:
     fp16: bool = False
     tf32: bool = True
 
+    # Compilation settings
+    torch_compile: bool = False
+
     # Dataset processing
     dataset_shuffle: bool = True
     dataset_seed: int = 42
@@ -181,6 +188,7 @@ class AppConfig:
             'training': {
                 'per_device_train_batch_size': self.training.per_device_train_batch_size,
                 'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
+                'max_steps': self.training.max_steps,
                 'num_train_epochs': self.training.num_train_epochs,
                 'learning_rate': self.training.learning_rate,
                 'warmup_steps': self.training.warmup_steps,
@@ -195,17 +203,25 @@
                 'dataloader_num_workers': self.training.dataloader_num_workers,
                 'dataloader_pin_memory': self.training.dataloader_pin_memory,
                 'remove_unused_columns': self.training.remove_unused_columns,
+                'dataloader_drop_last': self.training.dataloader_drop_last,
                 'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                 'offload_to_cpu': self.training.offload_to_cpu,
                 'optim': self.training.optim,
                 'weight_decay': self.training.weight_decay,
+                'adam_beta1': self.training.adam_beta1,
+                'adam_beta2': self.training.adam_beta2,
+                'adam_epsilon': self.training.adam_epsilon,
+                'max_grad_norm': self.training.max_grad_norm,
+                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
+                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                 'lr_scheduler_type': self.training.lr_scheduler_type,
                 'warmup_ratio': self.training.warmup_ratio,
                 'bf16': self.training.bf16,
                 'fp16': self.training.fp16,
                 'tf32': self.training.tf32,
                 'dataset_shuffle': self.training.dataset_shuffle,
-                'dataset_seed': self.training.dataset_seed
+                'dataset_seed': self.training.dataset_seed,
+                'torch_compile': self.training.torch_compile
             },
             'dataset': {
                 'min_file_size': self.dataset.min_file_size,
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
index ee7a220..132b3b4 100644
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@@ -178,6 +178,7 @@ class DatasetProcessor:
             return code_samples
 
         finally:
+            self.logger.info(f"Finished processing {repo_url}")
             # Cleanup temporary directories, but keep gitclone folder
             # if temp_dir != "./gitclone":
             #     shutil.rmtree(temp_dir, ignore_errors=True)
diff --git a/src/main.py b/src/main.py
index 239a415..8832d42 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,6 +4,17 @@ Main entry point for AI Trainer application
 Training framework for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit model
 """
 
+# Disable torch compilation before importing any modules
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from trainer import ModelTrainer
+from dataset_processor import DatasetProcessor
+from config import AppConfig
+from utils import setup_logging, check_gpu_memory
+
 import argparse
 import logging
 import os
@@ -13,12 +24,6 @@ from pathlib import Path
 # Add src to path for imports
 sys.path.append(str(Path(__file__).parent))
 
-from trainer import ModelTrainer
-from dataset_processor import DatasetProcessor
-from config import AppConfig
-from utils import setup_logging, check_gpu_memory
-
-
 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="AI Trainer for Qwen2.5-Coder model")
diff --git a/src/trainer.py b/src/trainer.py
index 629cd70..f2c95af 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -3,13 +3,21 @@ Model trainer for unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
 Optimized for RTX3070 8GB VRAM with no CPU offloading
 """
 
+# Disable torch compilation before importing any modules that might use it
+import os
+os.environ['DISABLE_TORCH_COMPILE'] = '1'
+os.environ['UNSLOTH_DISABLE_COMPILE'] = '1'
+os.environ['TORCH_COMPILE_DISABLE'] = '1'
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template
 import logging
 import os
 import gc
 import torch
 from pathlib import Path
 from typing import Optional, Dict, Any
-
 import torch.nn as nn
 from transformers import (
     AutoModelForCausalLM,
@@ -21,8 +29,6 @@ from transformers import (
 )
 
 from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
-from unsloth import FastLanguageModel, is_bfloat16_supported
-
 from config import AppConfig
 from utils import check_gpu_memory, clear_gpu_cache, get_memory_usage
@@ -125,9 +131,18 @@ class ModelTrainer:
             max_seq_length=self.config.model.max_seq_length,
             dtype=None,  # Auto-detect
             load_in_4bit=True,  # Use 4-bit quantization
-            token=None,  # Use default token
+            token=os.environ.get("HF_TOKEN"),  # Read the HF token from the environment; never hardcode credentials
         )
 
+        # Log tokenizer attributes for debugging
+        self.logger.info(f"Tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Tokenizer class: {self.tokenizer.__class__}")
+        self.logger.info(f"Tokenizer has 'unsloth_push_to_hub': {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
+        # List all methods containing 'unsloth' for debugging
+        unsloth_methods = [attr for attr in dir(self.tokenizer) if 'unsloth' in attr.lower()]
+        self.logger.info(f"Tokenizer unsloth methods: {unsloth_methods}")
+
         # Configure model for training
         self.model = FastLanguageModel.get_peft_model(
             self.model,
@@ -152,31 +167,126 @@ class ModelTrainer:
             raise
 
     def _prepare_dataset(self, train_dataset: Dataset) -> Dataset:
-        """Prepare and tokenize the dataset"""
-        self.logger.info("Preparing dataset...")
+        """Prepare and tokenize the dataset for Qwen2.5-Coder"""
+        self.logger.info("Preparing dataset for Qwen2.5-Coder...")
+
+        # Apply chat template for Qwen2.5-Coder if available
+        try:
+            chat_template = get_chat_template("qwen")
+            if chat_template and isinstance(chat_template, str):
+                self.tokenizer.chat_template = chat_template
+                self.logger.info("Applied Qwen chat template from string")
+            else:
+                self.logger.warning(f"Invalid chat template received: {type(chat_template)}")
+        except Exception as e:
+            self.logger.warning(f"Could not apply Qwen chat template: {e}")
+            # Fallback to default formatting
+            pass
 
         def tokenize_function(examples):
-            return self.tokenizer(
-                examples["text"],
+            # Format examples as instruction-following pairs for code training
+            formatted_texts = []
+            for text in examples["text"]:
+                # Create an instruction format appropriate for code training
+                # For Qwen2.5-Coder, we can use a code completion or analysis format
+                messages = [
+                    {"role": "user", "content": "Analyze and understand the following code:"},
+                    {"role": "assistant", "content": text}
+                ]
+
+                # Apply chat template if available, otherwise use simple formatting
+                try:
+                    # Log tokenizer state in multiprocessing context
+                    import multiprocessing
+                    self.logger.debug(f"Tokenize function - Process ID: {multiprocessing.current_process().pid}")
+                    self.logger.debug(f"Tokenize function - Tokenizer type: {type(self.tokenizer)}")
+                    self.logger.debug(f"Tokenize function - Has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+                    self.logger.debug(f"Tokenize function - Has chat_template: {hasattr(self.tokenizer, 'chat_template')}")
+                    if hasattr(self.tokenizer, 'chat_template'):
+                        self.logger.debug(f"Tokenize function - chat_template type: {type(self.tokenizer.chat_template)}")
+
+                    # Check if tokenizer has apply_chat_template method
+                    if hasattr(self.tokenizer, 'apply_chat_template'):
+                        formatted_text = self.tokenizer.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False  # We're training on the full conversation
+                        )
+                    else:
+                        self.logger.warning("Tokenizer does not have apply_chat_template method, using fallback")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                except AttributeError as e:
+                    if 'unsloth_push_to_hub' in str(e):
+                        self.logger.error(f"AttributeError in multiprocessing context: {e}")
+                        self.logger.error(f"Tokenizer type: {type(self.tokenizer)}")
+                        self.logger.error(f"Tokenizer attributes: {[attr for attr in dir(self.tokenizer) if not attr.startswith('_')]}")
+                        # Fall back to simple formatting so formatted_text is always defined
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    elif 'padding_side' in str(e):
+                        self.logger.warning(f"Chat template padding_side error: {e}")
+                        self.logger.warning("Using fallback formatting due to chat template issue")
+                        formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+                    else:
+                        raise
+                except Exception as e:
+                    self.logger.warning(f"Error applying chat template: {e}, using fallback formatting")
+                    # Fallback to simple formatting with special tokens
+                    formatted_text = f"<|im_start|>user\nAnalyze and understand the following code:<|im_end|>\n<|im_start|>assistant\n{text}<|im_end|>"
+
+                formatted_texts.append(formatted_text)
+
+            # Tokenize with proper padding and truncation for Qwen2.5-Coder
+            tokenized = self.tokenizer(
+                formatted_texts,
                 padding="max_length",
                 truncation=True,
                 max_length=self.config.model.max_seq_length,
-                return_tensors="pt"
+                return_tensors="pt",
+                add_special_tokens=True
             )
+
+            # For causal language modeling, we need to create proper labels
+            # Clone input_ids to create labels
+            labels = tokenized["input_ids"].clone()
+
+            # Try to mask the user part of the conversation
+            # Find the assistant token to determine where the assistant response starts
+            try:
+                # Convert to string to find the assistant token
+                decoded_tokens = self.tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens=False)
+                for i, decoded in enumerate(decoded_tokens):
+                    # Find where the assistant response starts
+                    assistant_start = decoded.find("<|im_start|>assistant")
+                    if assistant_start != -1:
+                        # Find the actual token position
+                        # We'll mask everything before the assistant response with -100
+                        assistant_tokens = self.tokenizer("<|im_start|>assistant", add_special_tokens=False)["input_ids"]
+                        if len(assistant_tokens) > 0:
+                            # The <|im_start|> marker also opens the user turn, so take its
+                            # last occurrence, which is where the assistant response starts
+                            assistant_token_id = assistant_tokens[0]
+                            assistant_positions = (tokenized["input_ids"][i] == assistant_token_id).nonzero(as_tuple=True)[0]
+                            if len(assistant_positions) > 0:
+                                # Mask everything before the assistant turn
+                                labels[i, :assistant_positions[-1]] = -100
+            except Exception as e:
+                self.logger.warning(f"Could not mask user tokens: {e}")
+                # Fallback: Just use the input_ids as labels
+                pass
+
+            tokenized["labels"] = labels
+
+            return tokenized
 
         # Tokenize dataset
         tokenized_dataset = train_dataset.map(
             tokenize_function,
             batched=True,
             remove_columns=["text", "language", "file_path", "repo_name", "file_size", "line_count"],
-            desc="Tokenizing dataset"
+            desc="Tokenizing dataset for Qwen2.5-Coder"
         )
 
-        self.logger.info(f"Dataset tokenized. Size: {len(tokenized_dataset)}")
+        self.logger.info(f"Dataset tokenized for Qwen2.5-Coder. Size: {len(tokenized_dataset)}")
 
         return tokenized_dataset
 
     def _setup_trainer(self, tokenized_dataset: Dataset):
         """Setup the HuggingFace trainer with memory optimizations"""
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
@@ -218,7 +328,10 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
-            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10,
+            # Explicitly disable torch compilation
+            torch_compile=False,
+            torch_compile_backend=None
         )
 
         # Data collator
@@ -228,17 +341,32 @@ class ModelTrainer:
         # )
         data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
+        # Log data collator setup for debugging
+        self.logger.info(f"Data collator created with tokenizer type: {type(self.tokenizer)}")
+        self.logger.info(f"Data collator tokenizer has unsloth_push_to_hub: {hasattr(self.tokenizer, 'unsloth_push_to_hub')}")
+
         # Initialize trainer
-        self.trainer = SFTTrainer(
-            model=self.model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
-            data_collator=data_collator,
-            tokenizer=self.tokenizer,
-            dataset_text_field="text",
-            packing=False  # Can make training 5x faster for short sequences.
-        )
+        self.logger.info("Initializing SFTTrainer...")
+
+        # Environment variables for torch compilation are set at module level
+        self.logger.info("Torch compilation environment variables set at module level")
+
+        try:
+            self.trainer = SFTTrainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=tokenized_dataset,
+                eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
+                data_collator=data_collator,
+                tokenizer=self.tokenizer,
+                # dataset_text_field="text",
+                # packing=False  # Can make training 5x faster for short sequences.
+            )
+            self.logger.info("SFTTrainer initialized successfully")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize SFTTrainer: {e}")
+            self.logger.error(f"Error type: {type(e)}")
+            raise
 
         self.logger.info("Trainer setup completed")