# ai_github_trainer/src/config.py
"""
Configuration management for AI Trainer
Handles training parameters and model settings
"""
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union
import yaml


@dataclass
class ModelConfig:
    """Model-specific configuration"""
    name: str = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
    max_seq_length: int = 1024
    trust_remote_code: bool = True
    use_fast_tokenizer: bool = True
    padding_side: str = "left"
    truncation_side: str = "left"

@dataclass
class TrainingConfig:
    """Training configuration"""
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    max_steps: int = 100
    num_train_epochs: int = 2
    learning_rate: float = 2e-4
    warmup_steps: int = 10
    logging_steps: int = 1
    save_steps: int = 100
    save_total_limit: int = 3
    eval_strategy: str = "steps"
    eval_steps: int = 100
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "loss"
    greater_is_better: bool = False
    dataloader_num_workers: int = 2
    dataloader_pin_memory: bool = True
    remove_unused_columns: bool = False
    dataloader_drop_last: bool = True
    label_names: Optional[List[str]] = None

    # Memory optimization for RTX3070 8GB
    use_gradient_checkpointing: bool = True
    offload_to_cpu: bool = False  # Explicitly no CPU offloading
    use_reentrant: bool = True
    gradient_checkpointing_kwargs: Optional[Dict] = None

    # Optimizer settings
    optim: str = "adamw_torch"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999
    adam_epsilon: float = 1e-8
    max_grad_norm: float = 0.5
    per_device_eval_batch_size: int = 1
    eval_accumulation_steps: int = 1

    # Learning rate scheduler
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1

    # Precision settings
    bf16: bool = True
    fp16: bool = False
    tf32: bool = True

    # Compilation settings
    torch_compile: bool = False

    # Dataset processing
    dataset_shuffle: bool = True
    dataset_seed: int = 3407

    # Output settings
    output_dir: str = "./models"
    logging_dir: str = "./logs"
    report_to: Optional[List[str]] = None

    def __post_init__(self):
        if self.label_names is None:
            self.label_names = ["labels"]
        if self.gradient_checkpointing_kwargs is None:
            self.gradient_checkpointing_kwargs = {"use_reentrant": self.use_reentrant}
        if self.report_to is None:
            self.report_to = ["tensorboard"]

@dataclass
class DatasetConfig:
    """Dataset processing configuration"""
    min_file_size: int = 10
    max_file_size: int = 10000  # Characters
    supported_languages: Optional[List[str]] = None
    exclude_patterns: Optional[List[str]] = None

    def __post_init__(self):
        if self.supported_languages is None:
            self.supported_languages = [
                'python', 'javascript', 'typescript', 'java', 'cpp', 'c',
                'csharp', 'php', 'ruby', 'go', 'rust', 'swift', 'kotlin',
                'scala', 'sql', 'bash', 'yaml', 'json', 'xml', 'html', 'css'
            ]
        if self.exclude_patterns is None:
            self.exclude_patterns = [
                r'\.git/',
                r'__pycache__/',
                r'node_modules/',
                r'\.venv/',
                r'package-lock\.json$',
                r'\.log$'
            ]

@dataclass
class MemoryConfig:
    """Memory optimization settings for RTX3070 8GB"""
    max_memory_usage: float = 0.95  # Use up to 95% of GPU memory
    enable_memory_tracking: bool = True
    clear_cache_between_epochs: bool = True
    use_memory_efficient_attention: bool = True
    attention_slicing: bool = True
    slice_size: int = 1

@dataclass
class AppConfig:
    """Main application configuration"""
    model: ModelConfig
    training: TrainingConfig
    dataset: DatasetConfig
    memory: MemoryConfig

    @classmethod
    def from_yaml(cls, config_path: Union[str, Path]) -> "AppConfig":
        """Load configuration from YAML file"""
        config_path = Path(config_path)

        if not config_path.exists():
            # Create default configuration
            config = cls(
                model=ModelConfig(),
                training=TrainingConfig(),
                dataset=DatasetConfig(),
                memory=MemoryConfig()
            )
            config.save_yaml(config_path)
            return config

        with open(config_path, 'r', encoding='utf-8') as f:
            # Guard against an empty YAML file, where safe_load returns None
            config_dict = yaml.safe_load(f) or {}

        # Parse nested configurations
        model_config = ModelConfig(**config_dict.get('model', {}))
        training_config = TrainingConfig(**config_dict.get('training', {}))
        dataset_config = DatasetConfig(**config_dict.get('dataset', {}))
        memory_config = MemoryConfig(**config_dict.get('memory', {}))

        return cls(
            model=model_config,
            training=training_config,
            dataset=dataset_config,
            memory=memory_config
        )

    def save_yaml(self, config_path: Union[str, Path]):
        """Save configuration to YAML file"""
        config_path = Path(config_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = {
            'model': {
                'name': self.model.name,
                'max_seq_length': self.model.max_seq_length,
                'trust_remote_code': self.model.trust_remote_code,
                'use_fast_tokenizer': self.model.use_fast_tokenizer,
                'padding_side': self.model.padding_side,
                'truncation_side': self.model.truncation_side
            },
            'training': {
                'per_device_train_batch_size': self.training.per_device_train_batch_size,
                'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
                'max_steps': self.training.max_steps,
                'num_train_epochs': self.training.num_train_epochs,
                'learning_rate': self.training.learning_rate,
                'warmup_steps': self.training.warmup_steps,
                'logging_steps': self.training.logging_steps,
                'save_steps': self.training.save_steps,
                'save_total_limit': self.training.save_total_limit,
                'eval_strategy': self.training.eval_strategy,
                'eval_steps': self.training.eval_steps,
                'load_best_model_at_end': self.training.load_best_model_at_end,
                'metric_for_best_model': self.training.metric_for_best_model,
                'greater_is_better': self.training.greater_is_better,
                'dataloader_num_workers': self.training.dataloader_num_workers,
                'dataloader_pin_memory': self.training.dataloader_pin_memory,
                'remove_unused_columns': self.training.remove_unused_columns,
                'dataloader_drop_last': self.training.dataloader_drop_last,
                'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                'offload_to_cpu': self.training.offload_to_cpu,
                'optim': self.training.optim,
                'weight_decay': self.training.weight_decay,
                'adam_beta1': self.training.adam_beta1,
                'adam_beta2': self.training.adam_beta2,
                'adam_epsilon': self.training.adam_epsilon,
                'max_grad_norm': self.training.max_grad_norm,
                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                'lr_scheduler_type': self.training.lr_scheduler_type,
                'warmup_ratio': self.training.warmup_ratio,
                'bf16': self.training.bf16,
                'fp16': self.training.fp16,
                'tf32': self.training.tf32,
                'dataset_shuffle': self.training.dataset_shuffle,
                'dataset_seed': self.training.dataset_seed,
                'torch_compile': self.training.torch_compile
            },
            'dataset': {
                'min_file_size': self.dataset.min_file_size,
                'max_file_size': self.dataset.max_file_size,
                'supported_languages': self.dataset.supported_languages,
                'exclude_patterns': self.dataset.exclude_patterns
            },
            'memory': {
                'max_memory_usage': self.memory.max_memory_usage,
                'enable_memory_tracking': self.memory.enable_memory_tracking,
                'clear_cache_between_epochs': self.memory.clear_cache_between_epochs,
                'use_memory_efficient_attention': self.memory.use_memory_efficient_attention,
                'attention_slicing': self.memory.attention_slicing,
                'slice_size': self.memory.slice_size
            }
        }

        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(config_dict, f, default_flow_style=False, indent=2)
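

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module): loads the
    # configuration from a hypothetical ./config.yaml, creating the file with
    # the defaults above on first run, then prints a few resolved values.
    cfg = AppConfig.from_yaml("./config.yaml")
    print(f"Model: {cfg.model.name}")
    print(f"Max sequence length: {cfg.model.max_seq_length}")
    print(f"Train batch size: {cfg.training.per_device_train_batch_size} "
          f"(gradient accumulation x{cfg.training.gradient_accumulation_steps})")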