# ai_github_trainer/configs/training_config_qwen3.yaml

# Training configuration optimized for RTX3070 8GB VRAM - Qwen3-8B Model
# AI Trainer for unsloth/Qwen3-8B-bnb-4bit

# Base checkpoint and tokenizer behaviour.
model:
  # Checkpoint identifier handed to the model loader.
  name: 'unsloth/Qwen3-8B-bnb-4bit'
  # Upper bound on sequence length (presumably in tokens — confirm in loader).
  max_seq_length: 2048
  trust_remote_code: true

  # Tokenizer options: both padding and truncation are applied on the left.
  use_fast_tokenizer: true
  padding_side: 'left'
  truncation_side: 'left'

# Trainer hyperparameters tuned for an 8 GB RTX 3070 running Qwen3-8B.
training:
  # Batch sizing: a micro-batch of 1 keeps the memory footprint small for the
  # larger model; 8 accumulation steps give an effective batch of
  # 1 x 8 = 8 sequences per device per optimizer step.
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 8

  # Schedule length and step size.
  num_train_epochs: 3
  learning_rate: 1.0e-4  # Slightly lower for the larger model
  # NOTE(review): warmup_steps and warmup_ratio are both set. If these keys are
  # forwarded to Hugging Face TrainingArguments, a non-zero warmup_steps takes
  # precedence over warmup_ratio — confirm which one is intended.
  warmup_steps: 15
  warmup_ratio: 0.1

  # Logging cadence and checkpoint retention.
  logging_steps: 1
  save_steps: 100
  save_total_limit: 3

  # Evaluation runs every eval_steps steps (same interval as save_steps) and
  # the best checkpoint is selected by lowest loss.
  # NOTE(review): recent transformers releases rename evaluation_strategy to
  # eval_strategy — confirm against the pinned version before changing.
  evaluation_strategy: 'steps'
  eval_steps: 100
  load_best_model_at_end: true
  metric_for_best_model: 'loss'
  greater_is_better: false

  # Data loader behaviour.
  dataloader_num_workers: 2
  dataloader_pin_memory: true
  remove_unused_columns: false

  # Memory optimization - CRITICAL for RTX 3070 8 GB with an 8B model.
  use_gradient_checkpointing: true
  offload_to_cpu: false  # CPU offloading is explicitly disabled

  # Optimizer: AdamW with its hyperparameters spelled out, plus grad clipping.
  optim: 'adamw_torch'
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning-rate schedule.
  lr_scheduler_type: 'cosine'

  # Numeric precision: BF16 enabled for stability on modern GPUs, FP16 off,
  # TF32 on.
  bf16: true
  fp16: false
  tf32: true

  # Dataset shuffling with a fixed seed.
  dataset_shuffle: true
  dataset_seed: 42

  # Where checkpoints and logs go, and which reporters receive metrics.
  output_dir: './models'
  logging_dir: './logs'
  report_to:
    - tensorboard

# Rules for selecting source files that become training data.
dataset:
  # File size window; units are not stated here (bytes or characters?) —
  # confirm against the loader.
  min_file_size: 10
  max_file_size: 10000

  # Languages accepted by the dataset builder.
  supported_languages:
    - python
    - javascript
    - typescript
    - java
    - cpp
    - c
    - csharp
    - php
    - ruby
    - go
    - rust
    - swift
    - kotlin
    - scala
    - sql
    - bash
    - yaml
    - json
    - xml
    - html
    - css
    - markdown

  # Paths to skip. The entries look like regular expressions (escaped dots,
  # `$` anchors); single quotes keep the backslashes literal.
  exclude_patterns:
    # VCS, cache, dependency and virtualenv directories
    - '\.git/'
    - '__pycache__/'
    - '\.pytest_cache/'
    - 'node_modules/'
    - '\.venv/'
    - 'venv/'
    # Lock files
    - 'package-lock\.json$'
    - 'yarn\.lock$'
    # Logs, temporary, backup and editor/OS files
    - '\.log$'
    - '\.tmp$'
    - '\.bak$'
    - '~\$.*'
    - '\.swp$'
    - '\.swo$'
    - '\.DS_Store'
    # Compiled and binary artifacts
    - '\.pyc$'
    - '\.pyo$'
    - '\.pyd$'
    - '\.so$'
    - '\.dll$'
    - '\.exe$'

# GPU memory policy for an 8 GB RTX 3070 running Qwen3-8B.
memory:
  # Attention optimization: memory-efficient attention plus slicing.
  use_memory_efficient_attention: true
  attention_slicing: true
  slice_size: 1

  # Budget and housekeeping: allow up to 95% usage (deliberately aggressive),
  # track usage, and clear the cache between epochs.
  max_memory_usage: 0.95
  enable_memory_tracking: true
  clear_cache_between_epochs: true