bug fixes

Suherdy Yacob 2025-08-22 21:53:40 +07:00
parent 1f1c183884
commit dc14dc4c2c
4 changed files with 22 additions and 13 deletions

.gitignore (vendored)

@@ -28,6 +28,8 @@ share/python-wheels/
 MANIFEST
 /models
 /unsloth_compiled_cache
+/gitclone/enterprise
+/gitclone/odoo
 
 # PyInstaller
 # Usually these files are written by a python script from a template

Training configuration (YAML)

@@ -13,9 +13,10 @@ training:
   # Memory-optimized batch size for RTX3070 8GB
   per_device_train_batch_size: 2
   gradient_accumulation_steps: 4
+  max_steps: 30
 
   # Training parameters
-  num_train_epochs: 3
+  num_train_epochs: 1
   learning_rate: 2.0e-4
   warmup_steps: 10
   warmup_ratio: 0.1
@@ -42,7 +43,7 @@ training:
   offload_to_cpu: false  # Explicitly no CPU offloading
 
   # Optimizer settings
-  optim: "adamw_torch"
+  optim: "paged_adamw_8bit"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
@@ -50,7 +51,7 @@ training:
   max_grad_norm: 1.0
 
   # Learning rate scheduler
-  lr_scheduler_type: "cosine"
+  lr_scheduler_type: "linear"
 
   # Precision - BF16 for better stability on modern GPUs
   bf16: true
@@ -59,7 +60,7 @@ training:
   # Dataset settings
   dataset_shuffle: true
-  dataset_seed: 42
+  dataset_seed: 3407
 
   # Output settings
   output_dir: "./models"
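
Taken together, these config changes cap the run at 30 optimizer steps (max_steps overrides num_train_epochs, so the single epoch is only a fallback), switch from adamw_torch to the memory-lighter paged_adamw_8bit optimizer, move from a cosine to a linear learning-rate schedule, and change the dataset seed from 42 to 3407. As a rough illustration of how such a training: block can be mapped onto TRL's SFTConfig (a TrainingArguments subclass), here is a minimal sketch; the config.yaml path and the load_training_config helper are assumed names, not code from this repository.

# Hedged sketch: mapping the "training:" section onto TRL's SFTConfig.
# "config.yaml" and load_training_config are assumed, illustrative names.
import yaml
from trl import SFTConfig

def load_training_config(path: str = "config.yaml") -> SFTConfig:
    with open(path) as f:
        cfg = yaml.safe_load(f)["training"]
    return SFTConfig(
        output_dir=cfg["output_dir"],
        num_train_epochs=cfg["num_train_epochs"],        # 1 after this commit
        max_steps=cfg.get("max_steps", -1),              # 30 here; -1 would disable the cap
        per_device_train_batch_size=cfg["per_device_train_batch_size"],
        gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
        learning_rate=cfg["learning_rate"],
        warmup_steps=cfg["warmup_steps"],
        optim=cfg["optim"],                              # "paged_adamw_8bit" trims optimizer VRAM
        weight_decay=cfg["weight_decay"],
        max_grad_norm=cfg["max_grad_norm"],
        lr_scheduler_type=cfg["lr_scheduler_type"],      # "linear" after this commit
        bf16=cfg["bf16"],
        seed=cfg["dataset_seed"],                        # 3407; reused as the run seed (assumption)
    )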

DatasetProcessor module (Python)

@@ -179,8 +179,8 @@ class DatasetProcessor:
         finally:
             # Cleanup temporary directories, but keep gitclone folder
-            if temp_dir != "./gitclone":
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            # if temp_dir != "./gitclone":
+            #     shutil.rmtree(temp_dir, ignore_errors=True)
 
     def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
         """

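With the cleanup commented out, repositories cloned under ./gitclone now persist between runs, which lines up with the new /gitclone/enterprise and /gitclone/odoo entries in .gitignore above. If cleanup is ever reinstated, a guarded variant along these lines would keep the cached clones while still removing genuinely temporary directories; this is an illustrative sketch with assumed names, not code from this repository.

# Illustrative only: delete temp_dir unless it is the keep root or sits inside it.
import shutil
from pathlib import Path

def cleanup_temp_dir(temp_dir: str, keep_root: str = "./gitclone") -> None:
    temp_path = Path(temp_dir).resolve()
    keep_path = Path(keep_root).resolve()
    # Anything at or below ./gitclone is a cached clone we want to reuse
    if temp_path == keep_path or keep_path in temp_path.parents:
        return
    shutil.rmtree(temp_path, ignore_errors=True)
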
ModelTrainer module (Python)

@@ -16,8 +16,10 @@ from transformers import (
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    DataCollatorForLanguageModeling
+    DataCollatorForLanguageModeling,
+    DataCollatorForSeq2Seq
 )
+from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
 from unsloth import FastLanguageModel, is_bfloat16_supported
@@ -178,7 +180,7 @@ class ModelTrainer:
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
-        training_args = TrainingArguments(
+        training_args = SFTConfig(
             output_dir=str(self.output_dir / "checkpoints"),
             num_train_epochs=self.config.training.num_train_epochs,
             per_device_train_batch_size=self.config.training.per_device_train_batch_size,
@@ -216,22 +218,26 @@
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
         )
 
         # Data collator
-        data_collator = DataCollatorForLanguageModeling(
-            tokenizer=self.tokenizer,
-            mlm=False  # Causal language modeling
-        )
+        # data_collator = DataCollatorForLanguageModeling(
+        #     tokenizer=self.tokenizer,
+        #     mlm=False  # Causal language modeling
+        # )
+        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
         # Initialize trainer
-        self.trainer = Trainer(
+        self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
             train_dataset=tokenized_dataset,
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
+            dataset_text_field="text",
+            packing=False  # Can make training 5x faster for short sequences.
         )
 
         self.logger.info("Trainer setup completed")
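
The trainer is now built with TRL's SFTTrainer and SFTConfig, and the collator becomes DataCollatorForSeq2Seq, which dynamically pads input_ids, attention_mask, and any precomputed labels (label padding uses -100) instead of cloning input_ids into labels the way DataCollatorForLanguageModeling(mlm=False) does. The sketch below shows the Unsloth + TRL pattern end to end under some assumptions: the model name, dataset, and hyperparameter values are placeholders; the keyword placement (tokenizer, dataset_text_field, packing on SFTTrainer) follows the diff and older TRL releases, while newer TRL moves dataset_text_field and packing into SFTConfig; and for simplicity it relies on SFTTrainer's default causal-LM collator rather than the DataCollatorForSeq2Seq override used in this repository.

# Self-contained sketch of the Unsloth + TRL SFTTrainer pattern (placeholders, not this project's code).
from datasets import Dataset
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # assumed 4-bit model that fits 8 GB VRAM
    max_seq_length=2048,
    load_in_4bit=True,
)
# LoRA adapters keep trainable parameters (and optimizer state) small
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Tiny placeholder dataset exposing the "text" column named by dataset_text_field
train_dataset = Dataset.from_list(
    [{"text": "### Instruction: say hi\n### Response: hi"}] * 8
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=False,  # packing concatenates short samples into one sequence; off here, as in the diff
    args=SFTConfig(
        output_dir="./models/checkpoints",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=30,                  # mirrors the new max_steps in the config
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        lr_scheduler_type="linear",
        bf16=is_bfloat16_supported(),
        seed=3407,
        logging_steps=1,
    ),
)
trainer.train()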