"""
|
|
Utility functions for AI Trainer
|
|
Memory management, logging, and helper functions optimized for RTX3070 8GB VRAM
|
|
"""
|
|
|
|
import gc
|
|
import logging
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict, Optional, Tuple, Any
|
|
|
|
import torch
|
|
import psutil
|
|
from colorama import init, Fore, Back, Style
|
|
|
|
# Initialize colorama for cross-platform colored output
|
|
init(autoreset=True)


def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None) -> logging.Logger:
    """
    Setup logging configuration with colored console output

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
        log_file: Optional log file path

    Returns:
        Configured logger
    """
    # Create formatter with colors
    class ColoredFormatter(logging.Formatter):
        COLORS = {
            'DEBUG': Fore.CYAN,
            'INFO': Fore.GREEN,
            'WARNING': Fore.YELLOW,
            'ERROR': Fore.RED,
            'CRITICAL': Fore.RED + Back.WHITE
        }

        def format(self, record):
            # Add color to the level name
            if record.levelname in self.COLORS:
                record.levelname = f"{self.COLORS[record.levelname]}{record.levelname}{Style.RESET_ALL}"
            return super().format(record)

    # Create root logger; clear any handlers from previous calls so repeated
    # setup does not produce duplicate log lines
    logger = logging.getLogger()
    logger.setLevel(getattr(logging, log_level.upper()))
    logger.handlers.clear()

    # Console handler with colors
    console_handler = logging.StreamHandler(sys.stdout)
    console_formatter = ColoredFormatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # File handler if specified
    if log_file:
        log_path = Path(log_file)
        log_path.parent.mkdir(parents=True, exist_ok=True)

        file_handler = logging.FileHandler(log_path)
        file_formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)

    return logger
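

# Typical usage sketch (the log path below is illustrative, not part of the project):
#     logger = setup_logging("DEBUG", log_file="logs/trainer.log")
#     logger.info("Trainer initialized")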


def check_gpu_memory() -> Dict[str, Any]:
    """
    Check GPU memory status and availability

    Returns:
        Dictionary with GPU memory information
    """
    if not torch.cuda.is_available():
        return {"error": "CUDA not available"}

    try:
        device = torch.cuda.current_device()
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        reserved_memory = torch.cuda.memory_reserved(device)
        free_memory = total_memory - allocated_memory

        return {
            "device": torch.cuda.get_device_name(device),
            "device_id": device,
            "total_memory_gb": round(total_memory / (1024**3), 2),
            "allocated_memory_gb": round(allocated_memory / (1024**3), 2),
            "reserved_memory_gb": round(reserved_memory / (1024**3), 2),
            "free_memory_gb": round(free_memory / (1024**3), 2),
            "memory_utilization": round((allocated_memory / total_memory) * 100, 2),
            "cuda_version": torch.version.cuda,
            "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A"
        }
    except Exception as e:
        return {"error": f"Failed to get GPU info: {str(e)}"}


def get_memory_usage() -> Dict[str, Any]:
    """
    Get system memory usage

    Returns:
        Dictionary with memory usage information
    """
    try:
        # GPU memory
        gpu_memory = check_gpu_memory()

        # System memory
        system_memory = psutil.virtual_memory()

        return {
            "gpu_total_gb": gpu_memory.get("total_memory_gb", 0),
            "gpu_allocated_gb": gpu_memory.get("allocated_memory_gb", 0),
            "gpu_free_gb": gpu_memory.get("free_memory_gb", 0),
            "system_total_gb": round(system_memory.total / (1024**3), 2),
            "system_available_gb": round(system_memory.available / (1024**3), 2),
            "system_used_gb": round(system_memory.used / (1024**3), 2),
            "system_memory_percent": system_memory.percent
        }
    except Exception as e:
        return {"error": f"Failed to get memory usage: {str(e)}"}


def clear_gpu_cache():
    """Clear GPU cache and perform garbage collection"""
    try:
        # Collect unreferenced Python objects first so their CUDA blocks can be
        # released before the cache is emptied
        gc.collect()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    except Exception as e:
        print(f"Warning: Failed to clear GPU cache: {str(e)}")


def optimize_memory_settings():
    """Apply memory optimization settings for the RTX 3070"""
    try:
        if torch.cuda.is_available():
            # Cap per-process usage at 85% of GPU memory to leave headroom
            # for the display driver and reduce out-of-memory errors
            torch.cuda.set_per_process_memory_fraction(0.85)

            # Enable TF32 for better matmul/conv performance on Ampere
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

            # Reduce allocator fragmentation; note this only takes effect if the
            # CUDA caching allocator has not been initialized yet (i.e., before
            # the first CUDA allocation in this process)
            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

    except Exception as e:
        print(f"Warning: Failed to optimize memory settings: {str(e)}")


def format_bytes(bytes_value: float) -> str:
    """
    Format bytes into human readable format

    Args:
        bytes_value: Number of bytes

    Returns:
        Formatted string (e.g., "1.5 GB")
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if bytes_value < 1024.0:
            return f"{bytes_value:.1f} {unit}"
        bytes_value /= 1024.0
    return f"{bytes_value:.1f} PB"


def print_system_info():
    """Print comprehensive system information"""
    print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
    print(f"{Fore.CYAN}SYSTEM INFORMATION{Style.RESET_ALL}")
    print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}")

    # GPU Information
    gpu_info = check_gpu_memory()
    if "error" not in gpu_info:
        print(f"\n{Fore.GREEN}GPU Information:{Style.RESET_ALL}")
        print(f"  Device: {gpu_info['device']}")
        print(f"  CUDA Version: {gpu_info['cuda_version']}")
        print(f"  Total Memory: {gpu_info['total_memory_gb']} GB")
        print(f"  Allocated Memory: {gpu_info['allocated_memory_gb']} GB")
        print(f"  Free Memory: {gpu_info['free_memory_gb']} GB")
        print(f"  Memory Utilization: {gpu_info['memory_utilization']}%")
    else:
        print(f"\n{Fore.RED}GPU Information: {gpu_info['error']}{Style.RESET_ALL}")

    # System Memory
    system_memory = psutil.virtual_memory()
    print(f"\n{Fore.GREEN}System Memory:{Style.RESET_ALL}")
    print(f"  Total: {format_bytes(system_memory.total)}")
    print(f"  Available: {format_bytes(system_memory.available)}")
    print(f"  Used: {format_bytes(system_memory.used)}")
    print(f"  Usage: {system_memory.percent}%")

    # CPU Information
    print(f"\n{Fore.GREEN}CPU Information:{Style.RESET_ALL}")
    print(f"  Cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical")
    print(f"  CPU Usage: {psutil.cpu_percent()}%")

    print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")


def validate_environment():
    """Validate that the environment is suitable for training"""
    issues = []

    # Check CUDA availability
    if not torch.cuda.is_available():
        issues.append("CUDA is not available. A CUDA-compatible GPU is required.")

    # Check GPU memory
    if torch.cuda.is_available():
        gpu_info = check_gpu_memory()
        if "total_memory_gb" in gpu_info:
            total_memory = gpu_info["total_memory_gb"]
            if total_memory < 8:
                issues.append(f"GPU memory ({total_memory} GB) may be insufficient. Recommended: 8 GB+")

    # Check required Python modules ('git' is provided by the GitPython package)
    required_modules = ['torch', 'transformers', 'datasets', 'git']
    for module in required_modules:
        try:
            __import__(module)
        except ImportError:
            issues.append(f"Required module '{module}' is not installed.")

    if issues:
        print(f"\n{Fore.YELLOW}Environment Validation Issues:{Style.RESET_ALL}")
        for issue in issues:
            print(f"  - {issue}")
        return False

    print(f"\n{Fore.GREEN}Environment validation passed!{Style.RESET_ALL}")
    return True


def create_training_summary(config, training_time: float, final_model_path: str) -> str:
    """
    Create a summary of the training session

    Args:
        config: Training configuration
        training_time: Training time in seconds
        final_model_path: Path to the saved model

    Returns:
        Formatted summary string
    """
    summary = f"""
{Fore.CYAN}{'='*60}{Style.RESET_ALL}
TRAINING SUMMARY
{Fore.CYAN}{'='*60}{Style.RESET_ALL}

Configuration:
  Model: {config.model.name}
  Epochs: {config.training.num_train_epochs}
  Batch Size: {config.training.per_device_train_batch_size}
  Gradient Accumulation: {config.training.gradient_accumulation_steps}
  Learning Rate: {config.training.learning_rate}
  Max Sequence Length: {config.model.max_seq_length}

Performance:
  Training Time: {training_time:.2f} seconds ({training_time/3600:.2f} hours)
  Effective Batch Size: {config.training.per_device_train_batch_size * config.training.gradient_accumulation_steps}

Output:
  Model Saved To: {final_model_path}

Memory Settings:
  Gradient Checkpointing: {config.training.use_gradient_checkpointing}
  CPU Offloading: {config.training.offload_to_cpu}
  BF16 Enabled: {config.training.bf16}

{Fore.CYAN}{'='*60}{Style.RESET_ALL}
"""

    return summary


def safe_import(module_name: str, fallback: Any = None):
    """
    Safely import a module with fallback

    Args:
        module_name: Name of the module to import
        fallback: Fallback value if import fails

    Returns:
        Imported module or fallback
    """
    try:
        return __import__(module_name)
    except ImportError:
        return fallback


# Initialize memory optimization settings on import
try:
    optimize_memory_settings()
except Exception:
    pass  # Ignore errors during initialization
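

if __name__ == "__main__":
    # Minimal smoke-test sketch: run this module directly to inspect the
    # environment. Purely illustrative; not referenced by the trainer itself.
    setup_logging("INFO")
    print_system_info()
    if validate_environment():
        logging.getLogger(__name__).info("Memory snapshot: %s", get_memory_usage())
    clear_gpu_cache()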