import argparse
import os
import shutil
import time
from datetime import datetime
from importlib import import_module
from pathlib import Path
from typing import Any, Dict, Optional

import torch
from lightning.pytorch.utilities import rank_zero_info
from omegaconf import OmegaConf


class Config:
    def __init__(
        self,
        config_path: Optional[str] = None,
        override_args: Optional[Dict[str, Any]] = None,
    ):
        self.config = OmegaConf.create({})
        if config_path:
            self.load_yaml(config_path)
        if override_args:
            self.override_config(override_args)

    def load_yaml(self, config_path: str):
        """Load a YAML configuration file and merge it into the current config."""
        loaded_config = OmegaConf.load(config_path)
        self.config = OmegaConf.merge(self.config, loaded_config)

    def override_config(self, override_args: Dict[str, Any]):
        """Apply command-line override arguments to the loaded config."""
        for key, value in override_args.items():
            # Keys may use dot notation (e.g. "model.lr"); OmegaConf.update
            # creates any missing intermediate nodes.
            val = self._convert_value(value)
            OmegaConf.update(self.config, key, val)

    def _convert_value(self, value: str) -> Any:
        """Convert a string value to the most specific matching Python type."""
        if not isinstance(value, str):
            return value
        if value.lower() == "true":
            return True
        if value.lower() == "false":
            return False
        if value.lower() == "null":
            return None
        try:
            return int(value)
        except ValueError:
            try:
                return float(value)
            except ValueError:
                return value

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by (possibly dotted) key."""
        return OmegaConf.select(self.config, key, default=default)

    def __getattr__(self, name: str) -> Any:
        """Support dot-notation access on the wrapper itself."""
        try:
            return self.config[name]
        except KeyError as e:
            # Raise AttributeError so hasattr()/getattr() behave as expected.
            raise AttributeError(name) from e

    def __getitem__(self, key: str) -> Any:
        """Support dictionary-like access."""
        return self.config[key]

    def export_config(self, path: str):
        """Export the current configuration to a file."""
        OmegaConf.save(self.config, path)


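# A minimal usage sketch for Config (illustrative only; the YAML path and the
# override keys below are hypothetical, not part of this module):
#
#     cfg = Config("configs/train.yaml", {"model.lr": "3e-4", "trainer.amp": "true"})
#     lr = cfg.get("model.lr", default=1e-3)   # dotted lookup with a fallback
#     cfg.export_config("outputs/resolved.yaml")

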
def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", type=str, required=True, help="Path to config file"
    )
    parser.add_argument(
        "--override", type=str, nargs="+", help="Override config values (key=value)"
    )
    return parser.parse_args()


def load_config(
    config_path: Optional[str] = None, override_args: Optional[Dict[str, Any]] = None
) -> Config:
    """Load configuration, parsing command-line arguments when no path is given."""
    if config_path is None:
        args = parse_args()
        config_path = args.config
        if args.override:
            override_args = {}
            for override in args.override:
                # Split on the first "=" only, so values may themselves contain "=".
                key, value = override.split("=", 1)
                override_args[key.strip()] = value.strip()
    return Config(config_path, override_args)


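# Typical invocation from the command line (paths and keys are illustrative):
#
#     python train.py --config configs/train.yaml --override model.lr=3e-4 data.batch_size=64
#
# which is equivalent to calling
# load_config("configs/train.yaml", {"model.lr": "3e-4", "data.batch_size": "64"}).

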
def instantiate(target, cfg=None, hfstyle=False, **init_args):
    """Instantiate a class from its fully qualified name.

    Args:
        target: Dotted path, e.g. "package.module.ClassName".
        cfg: Optional config object passed as the first positional argument.
        hfstyle: If True, first wrap ``cfg`` in the class's ``config_class``
            (HuggingFace-style construction).
        **init_args: Extra keyword arguments forwarded to the constructor.
    """
    module_name, class_name = target.rsplit(".", 1)
    module = import_module(module_name)
    class_ = getattr(module, class_name)
    if cfg is None:
        return class_(**init_args)
    if hfstyle:
        config_class = class_.config_class
        cfg = config_class(config_obj=cfg)
    return class_(cfg, **init_args)


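# A quick sketch of instantiate() with a class that is known to exist
# (torch.nn.Linear); the cfg/hfstyle path depends on project-specific classes
# and is not shown here:
#
#     layer = instantiate("torch.nn.Linear", in_features=16, out_features=4)
#     assert isinstance(layer, torch.nn.Linear)

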
def get_function(target):
    """Import and return a function given its fully qualified name."""
    module_name, function_name = target.rsplit(".", 1)
    module = import_module(module_name)
    function_ = getattr(module, function_name)
    return function_


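# Example: resolve a function by name at runtime (math.sqrt is in the
# standard library, so this sketch is self-contained):
#
#     sqrt = get_function("math.sqrt")
#     assert sqrt(9.0) == 3.0

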
def save_config_and_codes(config, save_dir):
    """Snapshot the resolved config and all project .py files for reproducibility."""
    os.makedirs(save_dir, exist_ok=True)
    sanity_check_dir = os.path.join(save_dir, "sanity_check")
    os.makedirs(sanity_check_dir, exist_ok=True)
    with open(os.path.join(sanity_check_dir, f"{config.exp_name}.yaml"), "w") as f:
        OmegaConf.save(config.config, f)
    current_dir = Path.cwd()
    # Skip the outputs tree and the snapshot directory itself, so we never
    # copy files that this function (or a previous run) produced.
    exclude_dirs = [current_dir / "outputs", Path(save_dir).resolve()]
    for py_file in current_dir.rglob("*.py"):
        if any(d in py_file.parents for d in exclude_dirs):
            continue
        dest_path = Path(sanity_check_dir) / py_file.relative_to(current_dir)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(py_file, dest_path)


def print_model_size(model):
    """Log total, trainable, and frozen parameter counts (rank zero only)."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    rank_zero_info(f"Total parameters: {total_params:,}")
    rank_zero_info(f"Trainable parameters: {trainable_params:,}")
    rank_zero_info(f"Non-trainable parameters: {(total_params - trainable_params):,}")


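# Sketch: parameter counting on a toy model (torch.nn.Linear(16, 4) has
# 16 * 4 weights + 4 biases = 68 parameters, all trainable by default):
#
#     print_model_size(torch.nn.Linear(16, 4))

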
def compare_statedict_and_parameters(state_dict, named_parameters, named_buffers):
    """Report differences between a model's state_dict, parameters, and buffers."""
    state_dict_keys = set(state_dict.keys())
    named_params_keys = set(name for name, _ in named_parameters)

    only_in_state_dict = state_dict_keys - named_params_keys
    only_in_named_params = named_params_keys - state_dict_keys

    if only_in_state_dict:
        print(f"Only in state_dict (not in parameters): {sorted(only_in_state_dict)}")
    if only_in_named_params:
        print(
            f"Only in named_parameters (not in state_dict): {sorted(only_in_named_params)}"
        )
    if not only_in_state_dict and not only_in_named_params:
        print("All parameters match between state_dict and named_parameters")

    # Entries in the state_dict that are neither parameters nor buffers
    # (e.g. extra state registered via get_extra_state).
    named_buffers_keys = set(name for name, _ in named_buffers)
    neither_params_nor_buffers = (
        state_dict_keys - named_params_keys - named_buffers_keys
    )
    if neither_params_nor_buffers:
        print(
            f"Other items in state_dict (neither params nor buffers): {sorted(neither_params_nor_buffers)}"
        )

    print(f"Total state_dict items: {len(state_dict_keys)}")
    print(f"Total named_parameters: {len(named_params_keys)}")
    print(f"Total named_buffers: {len(named_buffers_keys)}")


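# Sketch: BatchNorm1d is a convenient test case because its state_dict holds
# buffers (running_mean, running_var, num_batches_tracked) on top of its
# weight/bias parameters:
#
#     bn = torch.nn.BatchNorm1d(8)
#     compare_statedict_and_parameters(
#         bn.state_dict(), bn.named_parameters(), bn.named_buffers()
#     )

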
def _resolve_global_rank() -> int:
    """Resolve the global rank from environment variables, defaulting to 0."""
    # Checked in decreasing order of specificity; LOCAL_RANK is only a
    # last-resort proxy for the global rank on single-node jobs.
    for key in ("GLOBAL_RANK", "RANK", "SLURM_PROCID", "LOCAL_RANK"):
        if key in os.environ:
            try:
                return int(os.environ[key])
            except ValueError:
                continue
    return 0


def get_shared_run_time(base_dir: str, env_key: str = "PL_RUN_TIME") -> str:
    """
    Get a synchronized run time across all processes.

    This function ensures all processes (both in distributed training and multi-process
    scenarios) use the same timestamp for output directories and experiment tracking.

    Args:
        base_dir: Base directory for output files
        env_key: Environment variable key to cache the run time

    Returns:
        Synchronized timestamp string in format YYYYMMDD_HHMMSS
    """
    cached = os.environ.get(env_key)
    if cached:
        return cached

    timestamp_format = "%Y%m%d_%H%M%S"

    # Fast path: if a torch.distributed process group is already up, broadcast
    # the timestamp from rank 0 to all other ranks.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            run_time = datetime.now().strftime(timestamp_format)
        else:
            run_time = None
        container = [run_time]
        torch.distributed.broadcast_object_list(container, src=0)
        run_time = container[0]
        if run_time is None:
            raise RuntimeError("Failed to synchronize run time across ranks.")
        os.environ[env_key] = run_time
        return run_time

    # Fallback: synchronize through a file keyed by the job ID, for processes
    # launched before (or without) a process group.
    os.makedirs(base_dir, exist_ok=True)
    sync_token = (
        os.environ.get("SLURM_JOB_ID")
        or os.environ.get("TORCHELASTIC_RUN_ID")
        or os.environ.get("JOB_ID")
        or "default"
    )
    sync_dir = os.path.join(base_dir, ".run_time_sync")
    os.makedirs(sync_dir, exist_ok=True)
    sync_file = os.path.join(sync_dir, f"{sync_token}.txt")

    global_rank = _resolve_global_rank()
    if global_rank == 0:
        # Remove any stale file left by a previous run with the same job ID.
        if os.path.exists(sync_file):
            try:
                os.remove(sync_file)
            except OSError:
                pass
        run_time = datetime.now().strftime(timestamp_format)
        # Write to a temp file and rename, so readers never see a partial write.
        tmp_file = sync_file + ".tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            f.write(run_time)
        os.replace(tmp_file, sync_file)
    else:
        deadline = time.monotonic() + 1200.0
        while True:
            if os.path.exists(sync_file):
                try:
                    with open(sync_file, "r", encoding="utf-8") as f:
                        run_time = f.read().strip()
                    # Accept the timestamp only if it is fresh, so a stale file
                    # from an earlier run cannot be picked up by mistake.
                    dt = datetime.strptime(run_time, timestamp_format)
                    if abs((datetime.now() - dt).total_seconds()) < 60:
                        break
                except (ValueError, OSError):
                    # Partial or malformed file; keep polling until the deadline.
                    pass
            if time.monotonic() > deadline:
                raise TimeoutError(
                    "Timed out waiting for rank 0 to write synchronized timestamp."
                )
            time.sleep(0.1)

    os.environ[env_key] = run_time
    return run_time


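# Usage sketch: every process that calls this with the same base_dir (and the
# same job ID in its environment) receives an identical timestamp, e.g. for
# building a shared output directory. The "outputs" path is illustrative:
#
#     run_time = get_shared_run_time("outputs")
#     save_dir = os.path.join("outputs", run_time)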