# Training configuration optimized for RTX3070 8GB VRAM - Qwen3-8B Model
# AI Trainer for unsloth/Qwen3-8B-bnb-4bit

model:
  # Base checkpoint and tokenizer options.
  name: "unsloth/Qwen3-8B-bnb-4bit"
  # Upper bound on tokens per training sequence.
  max_seq_length: 2048
  # NOTE(review): presumably forwarded to the model/tokenizer loader, where
  # it permits running Python code shipped with the model repo - confirm the
  # source is trusted.
  trust_remote_code: true
  use_fast_tokenizer: true
  # Both padding and truncation are applied on the left side of a sequence.
  padding_side: "left"
  truncation_side: "left"

training:
  # Memory-optimized batch size for RTX3070 8GB with Qwen3-8B.
  # Effective batch size per device = 1 x 8 = 8.
  per_device_train_batch_size: 1  # More conservative for larger model
  gradient_accumulation_steps: 8  # Higher accumulation to maintain effective batch size

  # Training parameters
  num_train_epochs: 3
  # Written with a decimal point and a signed exponent so that YAML 1.1
  # loaders such as PyYAML resolve it as a float, not a string.
  learning_rate: 1.0e-4  # Slightly lower for larger model
  # NOTE(review): both warmup settings are present. If these keys are passed
  # to Hugging Face TrainingArguments, a non-zero warmup_steps takes
  # precedence and warmup_ratio has no effect - confirm which is intended.
  warmup_steps: 15
  warmup_ratio: 0.1

  # Logging and saving
  logging_steps: 1
  save_steps: 100
  save_total_limit: 3

  # Evaluation
  # NOTE(review): load_best_model_at_end relies on checkpoints lining up with
  # evaluations; save_steps and eval_steps are both 100 here - keep them in
  # sync when changing either one.
  evaluation_strategy: "steps"
  eval_steps: 100
  load_best_model_at_end: true
  metric_for_best_model: "loss"
  greater_is_better: false  # lower loss is better

  # Data loading
  dataloader_num_workers: 2
  dataloader_pin_memory: true
  remove_unused_columns: false

  # Memory optimization - CRITICAL for RTX3070 8GB with 8B model
  use_gradient_checkpointing: true
  offload_to_cpu: false  # Explicitly no CPU offloading

  # Optimizer settings
  optim: "adamw_torch"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning rate scheduler
  lr_scheduler_type: "cosine"

  # Precision - BF16 for better stability on modern GPUs
  bf16: true
  fp16: false
  tf32: true

  # Dataset settings
  dataset_shuffle: true
  dataset_seed: 42

  # Output settings
  output_dir: "./models"
  logging_dir: "./logs"
  report_to: ["tensorboard"]

dataset:
  # File filtering
  # NOTE(review): units are not stated here (bytes? characters?) - confirm
  # against the code that reads these limits.
  min_file_size: 10
  max_file_size: 10000

  # Supported programming languages
  supported_languages:
    - python
    - javascript
    - typescript
    - java
    - cpp
    - c
    - csharp
    - php
    - ruby
    - go
    - rust
    - swift
    - kotlin
    - scala
    - sql
    - bash
    - yaml
    - json
    - xml
    - html
    - css
    - markdown

  # Files and directories to exclude
  # These look like regular expressions (escaped dots, `$` anchors); they are
  # presumably searched against file paths - confirm with the consumer.
  # Inside double quotes `\\` is a single literal backslash.
  exclude_patterns:
    - "\\.git/"
    - "__pycache__/"
    - "\\.pytest_cache/"
    - "node_modules/"
    - "\\.venv/"
    - "venv/"
    - "package-lock\\.json$"
    - "yarn\\.lock$"
    - "\\.log$"
    - "\\.tmp$"
    - "\\.bak$"
    - "~\\$.*"
    - "\\.swp$"
    - "\\.swo$"
    - "\\.DS_Store"
    - "\\.pyc$"
    - "\\.pyo$"
    - "\\.pyd$"
    - "\\.so$"
    - "\\.dll$"
    - "\\.exe$"

memory:
  # Memory management for RTX3070 8GB with Qwen3-8B
  max_memory_usage: 0.95  # Use up to 95% for more aggressive memory usage
  enable_memory_tracking: true
  clear_cache_between_epochs: true

  # Attention optimization
  use_memory_efficient_attention: true
  attention_slicing: true
  # NOTE(review): presumably only consulted when attention_slicing is
  # enabled - confirm with the consumer.
  slice_size: 1