""" Configuration management for AI Trainer Handles training parameters and model settings """ import os from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Union import yaml @dataclass class ModelConfig: """Model-specific configuration""" name: str = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit" max_seq_length: int = 1024 trust_remote_code: bool = True use_fast_tokenizer: bool = True padding_side: str = "left" truncation_side: str = "left" @dataclass class TrainingConfig: """Training configuration""" per_device_train_batch_size: int = 2 gradient_accumulation_steps: int = 4 max_steps: int = 100 num_train_epochs: int = 2 learning_rate: float = 2e-4 warmup_steps: int = 10 logging_steps: int = 1 save_steps: int = 100 save_total_limit: int = 3 eval_strategy: str = "steps" eval_steps: int = 100 load_best_model_at_end: bool = True metric_for_best_model: str = "loss" greater_is_better: bool = False dataloader_num_workers: int = 2 dataloader_pin_memory: bool = True remove_unused_columns: bool = False dataloader_drop_last: bool = True label_names: List[str] = None # Memory optimization for RTX3070 8GB use_gradient_checkpointing: bool = True offload_to_cpu: bool = False # Explicitly no CPU offloading use_reentrant: bool = True gradient_checkpointing_kwargs: Dict = None # Optimizer settings optim: str = "adamw_torch" weight_decay: float = 0.01 adam_beta1: float = 0.9 adam_beta2: float = 0.999 adam_epsilon: float = 1e-8 max_grad_norm: float = 0.5 per_device_eval_batch_size: int = 1 eval_accumulation_steps: int = 1 # Learning rate scheduler lr_scheduler_type: str = "cosine" warmup_ratio: float = 0.1 # Precision settings bf16: bool = True fp16: bool = False tf32: bool = True # Compilation settings torch_compile: bool = False # Dataset processing dataset_shuffle: bool = True dataset_seed: int = 3407 # Output settings output_dir: str = "./models" logging_dir: str = "./logs" report_to: List[str] = None def __post_init__(self): if self.label_names is None: self.label_names = ["labels"] if self.gradient_checkpointing_kwargs is None: self.gradient_checkpointing_kwargs = {"use_reentrant": self.use_reentrant} if self.report_to is None: self.report_to = ["tensorboard"] @dataclass class DatasetConfig: """Dataset processing configuration""" min_file_size: int = 10 max_file_size: int = 10000 # Characters supported_languages: List[str] = None exclude_patterns: List[str] = None def __post_init__(self): if self.supported_languages is None: self.supported_languages = [ 'python', 'javascript', 'typescript', 'java', 'cpp', 'c', 'csharp', 'php', 'ruby', 'go', 'rust', 'swift', 'kotlin', 'scala', 'sql', 'bash', 'yaml', 'json', 'xml', 'html', 'css' ] if self.exclude_patterns is None: self.exclude_patterns = [ r'\.git/', r'__pycache__/', r'node_modules/', r'\.venv/', r'package-lock\.json$', r'\.log$' ] @dataclass class MemoryConfig: """Memory optimization settings for RTX3070 8GB""" max_memory_usage: float = 0.95 # Use up to 95% of GPU memory enable_memory_tracking: bool = True clear_cache_between_epochs: bool = True use_memory_efficient_attention: bool = True attention_slicing: bool = True slice_size: int = 1 @dataclass class AppConfig: """Main application configuration""" model: ModelConfig training: TrainingConfig dataset: DatasetConfig memory: MemoryConfig @classmethod def from_yaml(cls, config_path: Union[str, Path]) -> "AppConfig": """Load configuration from YAML file""" config_path = Path(config_path) if not config_path.exists(): # Create default configuration config = 

@dataclass
class AppConfig:
    """Main application configuration"""
    model: ModelConfig
    training: TrainingConfig
    dataset: DatasetConfig
    memory: MemoryConfig

    @classmethod
    def from_yaml(cls, config_path: Union[str, Path]) -> "AppConfig":
        """Load configuration from a YAML file, creating a default one if missing"""
        config_path = Path(config_path)

        if not config_path.exists():
            # Create and persist a default configuration
            config = cls(
                model=ModelConfig(),
                training=TrainingConfig(),
                dataset=DatasetConfig(),
                memory=MemoryConfig()
            )
            config.save_yaml(config_path)
            return config

        with open(config_path, 'r', encoding='utf-8') as f:
            # Guard against empty files: yaml.safe_load returns None for them
            config_dict = yaml.safe_load(f) or {}

        # Parse nested configurations
        model_config = ModelConfig(**config_dict.get('model', {}))
        training_config = TrainingConfig(**config_dict.get('training', {}))
        dataset_config = DatasetConfig(**config_dict.get('dataset', {}))
        memory_config = MemoryConfig(**config_dict.get('memory', {}))

        return cls(
            model=model_config,
            training=training_config,
            dataset=dataset_config,
            memory=memory_config
        )

    def save_yaml(self, config_path: Union[str, Path]):
        """Save configuration to a YAML file"""
        config_path = Path(config_path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        config_dict = {
            'model': {
                'name': self.model.name,
                'max_seq_length': self.model.max_seq_length,
                'trust_remote_code': self.model.trust_remote_code,
                'use_fast_tokenizer': self.model.use_fast_tokenizer,
                'padding_side': self.model.padding_side,
                'truncation_side': self.model.truncation_side
            },
            'training': {
                'per_device_train_batch_size': self.training.per_device_train_batch_size,
                'gradient_accumulation_steps': self.training.gradient_accumulation_steps,
                'max_steps': self.training.max_steps,
                'num_train_epochs': self.training.num_train_epochs,
                'learning_rate': self.training.learning_rate,
                'warmup_steps': self.training.warmup_steps,
                'logging_steps': self.training.logging_steps,
                'save_steps': self.training.save_steps,
                'save_total_limit': self.training.save_total_limit,
                'eval_strategy': self.training.eval_strategy,
                'eval_steps': self.training.eval_steps,
                'load_best_model_at_end': self.training.load_best_model_at_end,
                'metric_for_best_model': self.training.metric_for_best_model,
                'greater_is_better': self.training.greater_is_better,
                'dataloader_num_workers': self.training.dataloader_num_workers,
                'dataloader_pin_memory': self.training.dataloader_pin_memory,
                'remove_unused_columns': self.training.remove_unused_columns,
                'dataloader_drop_last': self.training.dataloader_drop_last,
                'label_names': self.training.label_names,
                'use_gradient_checkpointing': self.training.use_gradient_checkpointing,
                'offload_to_cpu': self.training.offload_to_cpu,
                'use_reentrant': self.training.use_reentrant,
                'optim': self.training.optim,
                'weight_decay': self.training.weight_decay,
                'adam_beta1': self.training.adam_beta1,
                'adam_beta2': self.training.adam_beta2,
                'adam_epsilon': self.training.adam_epsilon,
                'max_grad_norm': self.training.max_grad_norm,
                'per_device_eval_batch_size': self.training.per_device_eval_batch_size,
                'eval_accumulation_steps': self.training.eval_accumulation_steps,
                'lr_scheduler_type': self.training.lr_scheduler_type,
                'warmup_ratio': self.training.warmup_ratio,
                'bf16': self.training.bf16,
                'fp16': self.training.fp16,
                'tf32': self.training.tf32,
                'torch_compile': self.training.torch_compile,
                'dataset_shuffle': self.training.dataset_shuffle,
                'dataset_seed': self.training.dataset_seed,
                # Persisted so output/reporting settings survive a save/load round-trip
                'output_dir': self.training.output_dir,
                'logging_dir': self.training.logging_dir,
                'report_to': self.training.report_to
            },
            'dataset': {
                'min_file_size': self.dataset.min_file_size,
                'max_file_size': self.dataset.max_file_size,
                'supported_languages': self.dataset.supported_languages,
                'exclude_patterns': self.dataset.exclude_patterns
            },
            'memory': {
                'max_memory_usage': self.memory.max_memory_usage,
                'enable_memory_tracking': self.memory.enable_memory_tracking,
                'clear_cache_between_epochs': self.memory.clear_cache_between_epochs,
                'use_memory_efficient_attention': self.memory.use_memory_efficient_attention,
                'attention_slicing': self.memory.attention_slicing,
                'slice_size': self.memory.slice_size
            }
        }

        with open(config_path, 'w', encoding='utf-8') as f:
            # sort_keys=False keeps the grouping above instead of alphabetical order
            yaml.dump(config_dict, f, default_flow_style=False, indent=2, sort_keys=False)
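
# Minimal usage sketch ("config.yaml" is a hypothetical path, not mandated by
# this module): on first run, from_yaml writes a default config file and
# returns it; afterwards it loads whatever the user has edited.
if __name__ == "__main__":
    config = AppConfig.from_yaml("config.yaml")
    # Effective batch size = per-device batch size x gradient accumulation steps
    effective_batch = (config.training.per_device_train_batch_size
                       * config.training.gradient_accumulation_steps)
    print(f"Model: {config.model.name}")
    print(f"Effective batch size: {effective_batch}")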