bug fixes
parent 1f1c183884
commit dc14dc4c2c

.gitignore (vendored): 2 changes
@@ -28,6 +28,8 @@ share/python-wheels/
 MANIFEST
 /models
 /unsloth_compiled_cache
+/gitclone/enterprise
+/gitclone/odoo

 # PyInstaller
 # Usually these files are written by a python script from a template
@@ -13,9 +13,10 @@ training:
   # Memory-optimized batch size for RTX3070 8GB
   per_device_train_batch_size: 2
   gradient_accumulation_steps: 4
+  max_steps: 30

   # Training parameters
-  num_train_epochs: 3
+  num_train_epochs: 1
   learning_rate: 2.0e-4
-  warmup_steps: 10
+  warmup_ratio: 0.1
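Switching from a fixed `warmup_steps: 10` to `warmup_ratio: 0.1` ties warmup to the run length: with `max_steps: 30` it resolves to about 3 warmup steps instead of 10. A minimal sketch to sanity-check that, assuming these YAML keys map onto standard `transformers.TrainingArguments` fields:

```python
# Quick check of how warmup_ratio resolves against max_steps
# (assumes the YAML keys feed standard TrainingArguments fields).
from transformers import TrainingArguments

args = TrainingArguments(output_dir="./tmp", max_steps=30, warmup_ratio=0.1)
print(args.get_warmup_steps(args.max_steps))  # ceil(30 * 0.1) == 3
```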
@@ -42,7 +43,7 @@ training:
   offload_to_cpu: false  # Explicitly no CPU offloading

   # Optimizer settings
-  optim: "adamw_torch"
+  optim: "paged_adamw_8bit"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
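`paged_adamw_8bit` keeps optimizer state in 8-bit via bitsandbytes, which saves a meaningful amount of memory on an 8 GB RTX 3070 but fails at runtime if bitsandbytes is not installed. A hypothetical guard (not part of this repo) that falls back to the previous optimizer:

```python
# Hypothetical helper, not in this repo: use the 8-bit paged optimizer only
# when bitsandbytes is importable, otherwise fall back to plain torch AdamW.
import importlib.util


def pick_optim() -> str:
    has_bnb = importlib.util.find_spec("bitsandbytes") is not None
    return "paged_adamw_8bit" if has_bnb else "adamw_torch"
```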
@@ -50,7 +51,7 @@ training:
   max_grad_norm: 1.0

   # Learning rate scheduler
-  lr_scheduler_type: "cosine"
+  lr_scheduler_type: "linear"

   # Precision - BF16 for better stability on modern GPUs
   bf16: true
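Over a 30-step run the practical difference between cosine and linear decay is small; the Trainer builds the schedule from `lr_scheduler_type` roughly as in this standalone sketch (dummy optimizer, values taken from the config above):

```python
# Sketch of the LR schedule derived from lr_scheduler_type: "linear"
# (dummy parameter and optimizer just to keep the snippet self-contained).
import torch
from transformers import get_scheduler

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=2e-4)
scheduler = get_scheduler("linear", optimizer, num_warmup_steps=3, num_training_steps=30)
```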
@@ -59,7 +60,7 @@ training:

   # Dataset settings
   dataset_shuffle: true
-  dataset_seed: 42
+  dataset_seed: 3407

   # Output settings
   output_dir: "./models"
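If the processor shuffles a `datasets.Dataset`, the new seed is applied as below; the repo's actual loader is not shown in this diff, so the field mapping is assumed:

```python
# Assumed mapping of dataset_shuffle / dataset_seed onto datasets.Dataset.shuffle
# (the repo's real dataset-loading code is not part of this diff).
from datasets import Dataset

ds = Dataset.from_dict({"text": ["sample a", "sample b", "sample c"]})
ds = ds.shuffle(seed=3407)
```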
@@ -179,8 +179,8 @@ class DatasetProcessor:

         finally:
             # Cleanup temporary directories, but keep gitclone folder
-            if temp_dir != "./gitclone":
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            # if temp_dir != "./gitclone":
+            #     shutil.rmtree(temp_dir, ignore_errors=True)

     def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
         """
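Commenting out the `shutil.rmtree` call keeps the `./gitclone` checkout around between runs, but it also disables cleanup for any other `temp_dir`. A hypothetical middle ground (not what this commit does) would delete only paths outside the persistent clone directory:

```python
# Hypothetical variant, not this commit's change: still remove genuinely
# temporary directories while always preserving the persistent ./gitclone clone.
import shutil
from pathlib import Path


def cleanup_temp_dir(temp_dir: str, keep_dir: str = "./gitclone") -> None:
    if Path(temp_dir).resolve() != Path(keep_dir).resolve():
        shutil.rmtree(temp_dir, ignore_errors=True)
```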
@@ -16,8 +16,10 @@ from transformers import (
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    DataCollatorForLanguageModeling
+    DataCollatorForLanguageModeling,
+    DataCollatorForSeq2Seq
 )
+from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
 from unsloth import FastLanguageModel, is_bfloat16_supported

@@ -178,7 +180,7 @@ class ModelTrainer:
         self.logger.info("Setting up trainer...")

         # Training arguments optimized for RTX3070 8GB
-        training_args = TrainingArguments(
+        training_args = SFTConfig(
             output_dir=str(self.output_dir / "checkpoints"),
             num_train_epochs=self.config.training.num_train_epochs,
             per_device_train_batch_size=self.config.training.per_device_train_batch_size,
@@ -216,22 +218,26 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
         )

         # Data collator
-        data_collator = DataCollatorForLanguageModeling(
-            tokenizer=self.tokenizer,
-            mlm=False  # Causal language modeling
-        )
+        # data_collator = DataCollatorForLanguageModeling(
+        #     tokenizer=self.tokenizer,
+        #     mlm=False  # Causal language modeling
+        # )
+        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)

         # Initialize trainer
-        self.trainer = Trainer(
+        self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
             train_dataset=tokenized_dataset,
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
+            dataset_text_field="text",
+            packing=False  # Can make training 5x faster for short sequences.
         )

         self.logger.info("Trainer setup completed")
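Taken together, the trainer changes move from the generic `Trainer`/`TrainingArguments`/`DataCollatorForLanguageModeling` stack to TRL's `SFTConfig`/`SFTTrainer` with a `DataCollatorForSeq2Seq`. Below is a minimal end-to-end sketch of that wiring, assuming an Unsloth 4-bit base model (the placeholder model name and the tiny dataset are not from this repo) and a TRL version that still accepts `dataset_text_field`/`packing`/`tokenizer` directly on `SFTTrainer`, as this diff does; newer TRL releases expect the first two on `SFTConfig` instead.

```python
# Minimal sketch of the new SFTConfig + SFTTrainer wiring from this commit.
# Assumptions: an Unsloth 4-bit base model (name below is a placeholder), an
# in-memory toy dataset, and a TRL version matching the call shape in the diff.
from datasets import Dataset
from transformers import DataCollatorForSeq2Seq
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",  # placeholder base model
    max_seq_length=2048,
    load_in_4bit=True,
)

# LoRA adapters are required to train on top of a 4-bit quantized base.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
)

train_dataset = Dataset.from_dict({"text": ["def add(a, b):\n    return a + b"]})

training_args = SFTConfig(
    output_dir="./models/checkpoints",
    max_steps=30,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    optim="paged_adamw_8bit",
    bf16=is_bfloat16_supported(),
    seed=3407,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    tokenizer=tokenizer,
    dataset_text_field="text",
    packing=False,
)
trainer.train()
```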