# ai_github_trainer/configs/training_config_qwen3.yaml

# Training configuration optimized for RTX3070 8GB VRAM - Qwen3-8B Model
# AI Trainer for unsloth/Qwen3-8B-bnb-4bit
# Base model and tokenizer settings.
# All keys below must be nested under `model:`; at column 0 they would be
# parsed as unrelated top-level keys and `model` itself would be null.
model:
  # Pre-quantized 4-bit checkpoint (per the "bnb-4bit" suffix in its name).
  name: "unsloth/Qwen3-8B-bnb-4bit"
  # Maximum sequence length, in tokens.
  max_seq_length: 2048
  trust_remote_code: true
  use_fast_tokenizer: true
  # NOTE(review): left padding/truncation is the usual choice for generation;
  # confirm it is also what the training pipeline expects.
  padding_side: "left"
  truncation_side: "left"
# Trainer hyperparameters, tuned for an RTX3070 8GB running Qwen3-8B.
# All keys below must be nested under `training:`; at column 0 they would be
# parsed as unrelated top-level keys and `training` itself would be null.
training:
  # Memory-optimized batch size for RTX3070 8GB with Qwen3-8B.
  # Effective batch size per device = 1 x 8 = 8 sequences per optimizer step.
  per_device_train_batch_size: 1  # More conservative for larger model
  # Higher accumulation to maintain effective batch size
  gradient_accumulation_steps: 8

  # Training parameters
  num_train_epochs: 3
  learning_rate: 1.0e-4  # Slightly lower for larger model
  # NOTE(review): both warm-up knobs are set. If these values are forwarded to
  # Hugging Face TrainingArguments, a non-zero warmup_steps takes precedence
  # and warmup_ratio is ignored -- confirm which one the trainer should honor.
  warmup_steps: 15
  warmup_ratio: 0.1

  # Logging and saving
  logging_steps: 1
  save_steps: 100
  save_total_limit: 3

  # Evaluation
  # eval_steps is kept equal to save_steps; assuming these map onto
  # TrainingArguments, load_best_model_at_end needs matching save/eval
  # schedules so that a checkpoint exists for every evaluated step.
  eval_strategy: "steps"
  eval_steps: 100
  load_best_model_at_end: true
  metric_for_best_model: "loss"
  greater_is_better: false  # Lower loss is better

  # Data loading
  dataloader_num_workers: 2
  dataloader_pin_memory: true
  remove_unused_columns: false

  # Memory optimization - CRITICAL for RTX3070 8GB with 8B model
  use_gradient_checkpointing: true
  offload_to_cpu: false  # Explicitly no CPU offloading

  # Optimizer settings
  optim: "adamw_torch"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning rate scheduler
  lr_scheduler_type: "cosine"

  # Precision - BF16 for better stability on modern GPUs.
  # bf16 and fp16 are alternatives; only one of them is enabled here.
  bf16: true
  fp16: false
  tf32: true

  # Dataset settings
  dataset_shuffle: true
  dataset_seed: 42

  # Output settings
  output_dir: "./models"
  logging_dir: "./logs"
  report_to: ["tensorboard"]
# Source-file selection rules for building the training dataset.
# All keys below must be nested under `dataset:`; at column 0 they would be
# parsed as unrelated top-level keys and `dataset` itself would be null.
dataset:
  # File filtering
  # NOTE(review): the unit (bytes vs. characters) is not stated in this file;
  # confirm against the dataset loader.
  min_file_size: 10
  max_file_size: 10000

  # Supported programming languages
  supported_languages:
    - python
    - javascript
    - typescript
    - java
    - cpp
    - c
    - csharp
    - php
    - ruby
    - go
    - rust
    - swift
    - kotlin
    - scala
    - sql
    - bash
    - yaml
    - json
    - xml
    - html
    - css
    - markdown

  # Files and directories to exclude.
  # Entries are written as regular expressions (escaped dots, `$` anchors);
  # presumably they are searched against each file path -- verify in the
  # loader. The doubled backslashes are YAML double-quote escapes, so each
  # pattern reaches the consumer with a single backslash.
  exclude_patterns:
    # Version control, caches, dependency and virtual-env directories
    - "\\.git/"
    - "__pycache__/"
    - "\\.pytest_cache/"
    - "node_modules/"
    - "\\.venv/"
    - "venv/"
    # Lock files
    - "package-lock\\.json$"
    - "yarn\\.lock$"
    # Logs, temporary, backup and editor swap files
    - "\\.log$"
    - "\\.tmp$"
    - "\\.bak$"
    - "~\\$.*"
    - "\\.swp$"
    - "\\.swo$"
    - "\\.DS_Store"
    # Compiled / binary artifacts
    - "\\.pyc$"
    - "\\.pyo$"
    - "\\.pyd$"
    - "\\.so$"
    - "\\.dll$"
    - "\\.exe$"
# Memory management for RTX3070 8GB with Qwen3-8B.
# All keys below must be nested under `memory:`; at column 0 they would be
# parsed as unrelated top-level keys and `memory` itself would be null.
memory:
  # Fraction of memory the trainer may use (0.95 = up to 95%); an aggressive
  # setting that leaves little headroom.
  max_memory_usage: 0.95
  enable_memory_tracking: true
  clear_cache_between_epochs: true

  # Attention optimization
  use_memory_efficient_attention: true
  attention_slicing: true
  # NOTE(review): presumably only consulted when attention_slicing is true;
  # confirm against the trainer code.
  slice_size: 1