bug fixes

Suherdy Yacob · 2025-08-22 21:53:40 +07:00
parent 1f1c183884
commit dc14dc4c2c
4 changed files with 22 additions and 13 deletions

.gitignore

@@ -28,6 +28,8 @@ share/python-wheels/
 MANIFEST
 /models
 /unsloth_compiled_cache
+/gitclone/enterprise
+/gitclone/odoo

 # PyInstaller
 # Usually these files are written by a python script from a template

@@ -13,9 +13,10 @@ training:
   # Memory-optimized batch size for RTX3070 8GB
   per_device_train_batch_size: 2
   gradient_accumulation_steps: 4
+  max_steps: 30

   # Training parameters
-  num_train_epochs: 3
+  num_train_epochs: 1
   learning_rate: 2.0e-4
   warmup_steps: 10
   warmup_ratio: 0.1
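Worth noting alongside this hunk: in transformers, max_steps takes precedence over num_train_epochs when both are set, so the run is now capped at 30 optimizer steps. A quick sketch of the arithmetic, using only values from the config above:

# Effective batch size and total samples for the settings above (single GPU).
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
max_steps = 30

effective_batch = per_device_train_batch_size * gradient_accumulation_steps  # 8
total_samples = effective_batch * max_steps                                  # 240
print(f"effective batch: {effective_batch}, samples consumed: {total_samples}")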
@@ -42,7 +43,7 @@ training:
   offload_to_cpu: false  # Explicitly no CPU offloading

   # Optimizer settings
-  optim: "adamw_torch"
+  optim: "paged_adamw_8bit"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
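"paged_adamw_8bit" is the paged 8-bit AdamW from bitsandbytes: optimizer state is kept in 8 bits and paged out to CPU memory under pressure, a sensible trade on an 8 GB RTX3070 (bitsandbytes must be installed). A minimal standalone sketch of how the string is consumed; the output path is a placeholder:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./models/checkpoints",  # placeholder path
    optim="paged_adamw_8bit",           # 8-bit paged AdamW via bitsandbytes
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
)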
@@ -50,7 +51,7 @@ training:
   max_grad_norm: 1.0

   # Learning rate scheduler
-  lr_scheduler_type: "cosine"
+  lr_scheduler_type: "linear"

   # Precision - BF16 for better stability on modern GPUs
   bf16: true
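The switch from cosine to linear means the learning rate decays in a straight line to zero after warmup instead of following a cosine curve. A small sketch of the schedule this selects, with a toy optimizer; note that transformers prefers warmup_steps over warmup_ratio when both are set, so 10 warmup steps apply here:

import torch
from transformers import get_scheduler

model = torch.nn.Linear(4, 4)  # toy module, just to own some parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2.0e-4)
scheduler = get_scheduler(
    "linear",               # was "cosine" before this commit
    optimizer=optimizer,
    num_warmup_steps=10,    # warmup_steps: 10 wins over warmup_ratio: 0.1
    num_training_steps=30,  # matches max_steps above
)
lrs = []
for _ in range(30):
    optimizer.step()
    scheduler.step()
    lrs.append(scheduler.get_last_lr()[0])
# lrs ramps to 2e-4 over the first 10 steps, then decays linearly toward 0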
@@ -59,7 +60,7 @@ training:

   # Dataset settings
   dataset_shuffle: true
-  dataset_seed: 42
+  dataset_seed: 3407

   # Output settings
   output_dir: "./models"
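3407 is the seed that appears throughout Unsloth's example notebooks (it traces back to the "torch.manual_seed(3407) is all you need" paper); any fixed seed gives the same reproducibility. A minimal sketch with hypothetical toy data:

from datasets import Dataset

ds = Dataset.from_dict({"text": [f"sample {i}" for i in range(8)]})  # toy data
shuffled = ds.shuffle(seed=3407)  # same seed -> same order on every run
print(shuffled["text"])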

@@ -179,8 +179,8 @@ class DatasetProcessor:
         finally:
             # Cleanup temporary directories, but keep gitclone folder
-            if temp_dir != "./gitclone":
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            # if temp_dir != "./gitclone":
+            #     shutil.rmtree(temp_dir, ignore_errors=True)

     def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
         """

@@ -16,8 +16,10 @@ from transformers import (
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    DataCollatorForLanguageModeling
+    DataCollatorForLanguageModeling,
+    DataCollatorForSeq2Seq
 )

+from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
 from unsloth import FastLanguageModel, is_bfloat16_supported
@@ -178,7 +180,7 @@ class ModelTrainer:
         self.logger.info("Setting up trainer...")

         # Training arguments optimized for RTX3070 8GB
-        training_args = TrainingArguments(
+        training_args = SFTConfig(
             output_dir=str(self.output_dir / "checkpoints"),
             num_train_epochs=self.config.training.num_train_epochs,
             per_device_train_batch_size=self.config.training.per_device_train_batch_size,
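SFTConfig subclasses transformers.TrainingArguments, so every keyword already passed here keeps working; it simply adds the SFT-specific fields. One caveat: in recent trl releases, dataset_text_field and packing are SFTConfig fields rather than SFTTrainer arguments, so depending on the pinned trl version the kwargs in the next hunk may need to move into this config. A minimal sketch with placeholder values:

from trl import SFTConfig

training_args = SFTConfig(
    output_dir="./models/checkpoints",  # placeholder
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=30,
    # On newer trl versions the SFT-specific knobs live here instead:
    # dataset_text_field="text",
    # packing=False,
)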
@@ -216,22 +218,26 @@
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
         )

         # Data collator
-        data_collator = DataCollatorForLanguageModeling(
-            tokenizer=self.tokenizer,
-            mlm=False  # Causal language modeling
-        )
+        # data_collator = DataCollatorForLanguageModeling(
+        #     tokenizer=self.tokenizer,
+        #     mlm=False  # Causal language modeling
+        # )
+        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)

         # Initialize trainer
-        self.trainer = Trainer(
+        self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
             train_dataset=tokenized_dataset,
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
+            dataset_text_field="text",
+            packing=False  # Can make training 5x faster for short sequences.
         )

         self.logger.info("Trainer setup completed")
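The collator swap is the substantive fix in this hunk: DataCollatorForLanguageModeling with mlm=False derives labels by cloning input_ids, discarding any labels column the preprocessing step already computed, while DataCollatorForSeq2Seq keeps pre-computed labels and pads them to batch length with -100, which the cross-entropy loss ignores. A self-contained sketch of that behavior; the gpt2 tokenizer is only a stand-in:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
tokenizer.pad_token = tokenizer.eos_token          # gpt2 ships no pad token

# Two samples of different lengths; -100 marks prompt tokens that should
# not contribute to the loss, as computed upstream.
features = [
    {"input_ids": [10, 11, 12, 13], "labels": [-100, -100, 12, 13]},
    {"input_ids": [20, 21], "labels": [-100, 21]},
]
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)
batch = collator(features)
print(batch["labels"])  # second row is padded with -100, not real token ids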