diff --git a/.gitignore b/.gitignore
index 2d5066f..d3a13dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,8 @@ share/python-wheels/
 MANIFEST
 /models
 /unsloth_compiled_cache
+/gitclone/enterprise
+/gitclone/odoo
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
diff --git a/configs/training_config.yaml b/configs/training_config.yaml
index e0f5e81..b049fc5 100644
--- a/configs/training_config.yaml
+++ b/configs/training_config.yaml
@@ -13,9 +13,10 @@ training:
   # Memory-optimized batch size for RTX3070 8GB
   per_device_train_batch_size: 2
   gradient_accumulation_steps: 4
+  max_steps: 30
 
   # Training parameters
-  num_train_epochs: 3
+  num_train_epochs: 1
   learning_rate: 2.0e-4
   warmup_steps: 10
   warmup_ratio: 0.1
@@ -42,7 +43,7 @@ training:
   offload_to_cpu: false  # Explicitly no CPU offloading
 
   # Optimizer settings
-  optim: "adamw_torch"
+  optim: "paged_adamw_8bit"
   weight_decay: 0.01
   adam_beta1: 0.9
   adam_beta2: 0.999
@@ -50,7 +51,7 @@ training:
   max_grad_norm: 1.0
 
   # Learning rate scheduler
-  lr_scheduler_type: "cosine"
+  lr_scheduler_type: "linear"
 
   # Precision - BF16 for better stability on modern GPUs
   bf16: true
@@ -59,7 +60,7 @@ training:
 
   # Dataset settings
   dataset_shuffle: true
-  dataset_seed: 42
+  dataset_seed: 3407
 
   # Output settings
   output_dir: "./models"
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
index 38b866c..ee7a220 100644
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@@ -179,8 +179,8 @@ class DatasetProcessor:
 
         finally:
             # Cleanup temporary directories, but keep gitclone folder
-            if temp_dir != "./gitclone":
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            # if temp_dir != "./gitclone":
+            #     shutil.rmtree(temp_dir, ignore_errors=True)
 
     def _extract_code_samples(self, repo_path: str, config: AppConfig) -> List[Dict]:
         """
diff --git a/src/trainer.py b/src/trainer.py
index 280c33a..629cd70 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -16,8 +16,10 @@ from transformers import (
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    DataCollatorForLanguageModeling
+    DataCollatorForLanguageModeling,
+    DataCollatorForSeq2Seq
 )
+from trl import SFTConfig, SFTTrainer
 from datasets import Dataset
 from unsloth import FastLanguageModel, is_bfloat16_supported
 
@@ -178,7 +180,7 @@ class ModelTrainer:
         self.logger.info("Setting up trainer...")
 
         # Training arguments optimized for RTX3070 8GB
-        training_args = TrainingArguments(
+        training_args = SFTConfig(
             output_dir=str(self.output_dir / "checkpoints"),
             num_train_epochs=self.config.training.num_train_epochs,
             per_device_train_batch_size=self.config.training.per_device_train_batch_size,
@@ -216,22 +218,26 @@ class ModelTrainer:
             # Memory optimization settings
             ddp_find_unused_parameters=False,
             per_device_eval_batch_size=self.config.training.per_device_train_batch_size,
+            max_steps=self.config.training.max_steps if hasattr(self.config.training, 'max_steps') else 10
         )
 
         # Data collator
-        data_collator = DataCollatorForLanguageModeling(
-            tokenizer=self.tokenizer,
-            mlm=False  # Causal language modeling
-        )
+        # data_collator = DataCollatorForLanguageModeling(
+        #     tokenizer=self.tokenizer,
+        #     mlm=False  # Causal language modeling
+        # )
+        data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer)
 
         # Initialize trainer
-        self.trainer = Trainer(
+        self.trainer = SFTTrainer(
             model=self.model,
             args=training_args,
             train_dataset=tokenized_dataset,
             eval_dataset=tokenized_dataset,  # Using same dataset for eval (for demo)
             data_collator=data_collator,
             tokenizer=self.tokenizer,
+            dataset_text_field="text",
+            packing=False  # Can make training 5x faster for short sequences.
         )
 
         self.logger.info("Trainer setup completed")