# Last commit: "2. add DataProcessorSynthetic class to format github repo to QA ChatML format"
# File info: 243 lines, 8.8 KiB, Python
"""
Configuration management for AI Trainer
Handles training parameters and model settings
"""
|
|
|
|
import os
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union

import yaml
|
|
|
|
|
|
@dataclass
class ModelConfig:
    """Model-specific configuration.

    Plain value container; it performs no validation. Every field has a
    default, so ``ModelConfig()`` yields a usable configuration.

    NOTE(review): the field names suggest they are forwarded to a model /
    tokenizer loader elsewhere in the project -- confirm against the caller.
    """

    # Model identifier; the default names a 4-bit Qwen2.5-Coder checkpoint.
    name: str = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
    max_seq_length: int = 1024
    trust_remote_code: bool = True
    use_fast_tokenizer: bool = True
    # Both padding and truncation default to the left side.
    padding_side: str = "left"
    truncation_side: str = "left"
|
|
|
|
|
|
@dataclass
class TrainingConfig:
    """Training configuration.

    Plain container of training hyper-parameters. The three collection-valued
    fields (``label_names``, ``gradient_checkpointing_kwargs`` and
    ``report_to``) default to ``None`` and are filled in by
    ``__post_init__``, so every instance gets its own list/dict object.

    NOTE(review): the field names look like Hugging Face
    ``TrainingArguments`` parameters. If they are forwarded there, a positive
    ``max_steps`` takes precedence over ``num_train_epochs`` and a non-zero
    ``warmup_steps`` over ``warmup_ratio`` -- confirm against the trainer code.
    """

    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    max_steps: int = 100
    num_train_epochs: int = 2
    learning_rate: float = 2e-4
    warmup_steps: int = 10
    logging_steps: int = 1
    save_steps: int = 100
    save_total_limit: int = 3
    eval_strategy: str = "steps"
    eval_steps: int = 100
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "loss"
    greater_is_better: bool = False
    dataloader_num_workers: int = 2
    dataloader_pin_memory: bool = True
    remove_unused_columns: bool = False
    dataloader_drop_last: bool = True
    # None means "use the default"; resolved to ["labels"] in __post_init__.
    label_names: Optional[List[str]] = None

    # Memory optimization for RTX3070 8GB
    use_gradient_checkpointing: bool = True
    offload_to_cpu: bool = False  # Explicitly no CPU offloading
    use_reentrant: bool = True
    # None means "derive from use_reentrant"; resolved in __post_init__.
    gradient_checkpointing_kwargs: Optional[Dict] = None

    # Optimizer settings
    optim: str = "adamw_torch"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999
    adam_epsilon: float = 1e-8
    max_grad_norm: float = 0.5
    per_device_eval_batch_size: int = 1
    eval_accumulation_steps: int = 1

    # Learning rate scheduler
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1

    # Precision settings
    bf16: bool = True
    fp16: bool = False
    tf32: bool = True

    # Compilation settings
    torch_compile: bool = False

    # Dataset processing
    dataset_shuffle: bool = True
    dataset_seed: int = 3407

    # Output settings
    output_dir: str = "./models"
    logging_dir: str = "./logs"
    # None means "use the default"; resolved to ["tensorboard"] in __post_init__.
    report_to: Optional[List[str]] = None

    def __post_init__(self):
        """Replace ``None`` placeholders with fresh per-instance defaults."""
        if self.label_names is None:
            self.label_names = ["labels"]

        if self.gradient_checkpointing_kwargs is None:
            # Mirror the use_reentrant flag unless the caller supplied kwargs.
            self.gradient_checkpointing_kwargs = {"use_reentrant": self.use_reentrant}

        if self.report_to is None:
            self.report_to = ["tensorboard"]
|
|
|
|
|
|
@dataclass
class DatasetConfig:
    """Dataset processing configuration.

    ``supported_languages`` and ``exclude_patterns`` default to ``None`` and
    are replaced with built-in lists in ``__post_init__``, so each instance
    owns its own list objects.
    """

    min_file_size: int = 10
    max_file_size: int = 10000  # Characters
    # None means "use the built-in language list" (see __post_init__).
    supported_languages: Optional[List[str]] = None
    # None means "use the built-in pattern list" (see __post_init__).
    # The defaults are written as regular expressions.
    exclude_patterns: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in the default language and exclude-pattern lists."""
        if self.supported_languages is None:
            self.supported_languages = [
                'python', 'javascript', 'typescript', 'java', 'cpp', 'c',
                'csharp', 'php', 'ruby', 'go', 'rust', 'swift', 'kotlin',
                'scala', 'sql', 'bash', 'yaml', 'json', 'xml', 'html', 'css',
            ]

        if self.exclude_patterns is None:
            self.exclude_patterns = [
                r'\.git/',
                r'__pycache__/',
                r'node_modules/',
                r'\.venv/',
                r'package-lock\.json$',
                r'\.log$',
            ]
|
|
|
|
|
|
@dataclass
class MemoryConfig:
    """Memory optimization settings for RTX3070 8GB.

    Plain value container; it performs no validation and has a default for
    every field.
    """

    max_memory_usage: float = 0.95  # Use up to 95% of GPU memory
    enable_memory_tracking: bool = True
    clear_cache_between_epochs: bool = True
    use_memory_efficient_attention: bool = True
    attention_slicing: bool = True
    slice_size: int = 1
|
|
|
|
|
|
@dataclass
class AppConfig:
    """Main application configuration.

    Bundles the four section configs and converts them to and from a YAML
    file whose top-level keys are ``model``, ``training``, ``dataset`` and
    ``memory``.
    """

    model: ModelConfig
    training: TrainingConfig
    dataset: DatasetConfig
    memory: MemoryConfig

    @classmethod
    def from_yaml(cls, config_path: Union[str, Path]) -> "AppConfig":
        """Load configuration from YAML file.

        If ``config_path`` does not exist, a configuration built from the
        section defaults is written to that path and returned.

        Args:
            config_path: Location of the YAML file.

        Returns:
            The loaded (or newly created default) configuration. Sections
            that are missing or empty in the file fall back to their
            defaults.

        Raises:
            TypeError: If a section contains a key that is not a field of
                the corresponding config dataclass.
        """
        config_path = Path(config_path)

        if not config_path.exists():
            # Create default configuration
            config = cls(
                model=ModelConfig(),
                training=TrainingConfig(),
                dataset=DatasetConfig(),
                memory=MemoryConfig(),
            )
            config.save_yaml(config_path)
            return config

        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; treat as "no keys".
            config_dict = yaml.safe_load(f) or {}

        def section(key):
            # A key written without a value (e.g. "model:") loads as None.
            return config_dict.get(key) or {}

        # Parse nested configurations
        return cls(
            model=ModelConfig(**section('model')),
            training=TrainingConfig(**section('training')),
            dataset=DatasetConfig(**section('dataset')),
            memory=MemoryConfig(**section('memory')),
        )

    def save_yaml(self, config_path: Union[str, Path]):
        """Save configuration to YAML file.

        Parent directories are created as needed. Every field of every
        section is written, so a save followed by ``from_yaml`` reproduces
        the configuration.

        Args:
            config_path: Destination of the YAML file; overwritten if present.
        """
        config_path = Path(config_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        # asdict recurses into the nested section dataclasses, producing
        # {'model': {...}, 'training': {...}, 'dataset': {...}, 'memory': {...}}
        # without a hand-maintained field list that can drift out of date.
        config_dict = asdict(self)

        # gradient_checkpointing_kwargs is derived from use_reentrant when it
        # is left unset. Do not persist that derived value, so use_reentrant
        # stays the single knob in the file; custom kwargs are still written.
        training = config_dict['training']
        if training.get('gradient_checkpointing_kwargs') == {
            'use_reentrant': training.get('use_reentrant')
        }:
            del training['gradient_checkpointing_kwargs']

        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(config_dict, f, default_flow_style=False, indent=2)