Source code for tango.integrations.fairscale.training_engine

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
from fairscale.optim.grad_scaler import ShardedGradScaler

from tango.common import Lazy
from tango.common.exceptions import ConfigurationError
from tango.integrations.torch import (
    LRScheduler,
    Model,
    Optimizer,
    TorchTrainingEngine,
    TrainConfig,
    TrainingEngine,
)

from .fsdp_config import FSDPConfig


@TrainingEngine.register("fairscale")
class FairScaleTrainingEngine(TorchTrainingEngine):
    """
    A :class:`~tango.integrations.torch.TrainingEngine` that leverages FairScale's
    :class:`~fairscale.nn.FullyShardedDataParallel` for use within
    :class:`~tango.integrations.torch.TorchTrainStep`.

    .. tip::
        Registered as an :class:`~tango.integrations.torch.TrainingEngine` under the name
        "fairscale".

    .. tip::
        To get the best performance out of :class:`FairScaleTrainingEngine`
        you should wrap individual layers of your model with
        :class:`~fairscale.nn.FullyShardedDataParallel` and/or
        :class:`~fairscale.nn.checkpoint.checkpoint_wrapper` while instantiating them.
        You can use :class:`with_wrapped_modules()` to accomplish this.

    .. important::
        Only the parameters listed below should be defined in a configuration file.
        The other parameters will be automatically passed to the constructor within
        :class:`~tango.integrations.torch.TorchTrainStep`.

    .. warning::
        :class:`FairScaleTrainingEngine` can only be used in distributed training, i.e.
        when ``device_count > 1`` in the :class:`~tango.integrations.torch.TorchTrainStep`.

    For maximum memory savings, we recommend training with AMP enabled and the following
    :class:`FSDPConfig`:

    .. testcode::

        from tango.integrations.fairscale import FSDPConfig

        fsdp_config = FSDPConfig(
            reshard_after_forward=True,
            move_params_to_cpu=True,
            move_grads_to_cpu=True,
            mixed_precision=True,
        )

    For maximum training *speed*, we recommend training with AMP enabled and the following
    :class:`FSDPConfig`:

    .. testcode::

        from tango.integrations.fairscale import FSDPConfig

        fsdp_config = FSDPConfig(
            reshard_after_forward=False,
            move_params_to_cpu=False,
            move_grads_to_cpu=False,
            mixed_precision=True,
        )

    :param amp:
        Use automatic mixed precision (AMP). Default is ``False``.
    :param max_grad_norm:
        If set, gradients will be clipped to have this max norm. Default is ``None``.
    :param amp_use_bfloat16:
        Set to ``True`` to force using the ``bfloat16`` datatype in mixed precision training.
        Only applicable when ``amp=True``. If not specified, the default behavior will be to
        use ``bfloat16`` when training with AMP on CPU, otherwise not.
    :param fsdp_config:
        The options for :class:`~fairscale.nn.FullyShardedDataParallel`. If not specified,
        the default options will be used.
    """

    def __init__(
        self,
        train_config: TrainConfig,
        model: Lazy[Model],
        optimizer: Lazy[Optimizer],
        *,
        lr_scheduler: Optional[Lazy[LRScheduler]] = None,
        amp: bool = False,
        max_grad_norm: Optional[float] = None,
        amp_use_bfloat16: Optional[bool] = None,
        fsdp_config: Optional[FSDPConfig] = None,
    ) -> None:
        if not train_config.is_distributed:
            raise ConfigurationError(
                f"{self.__class__.__name__} can only be used with distributed training"
            )
        self.fsdp_config = fsdp_config or FSDPConfig()
        self.logger = logging.getLogger(self.__class__.__name__)
        super().__init__(
            train_config,
            model,
            optimizer,
            lr_scheduler=lr_scheduler,
            amp=amp,
            max_grad_norm=max_grad_norm,
            amp_use_bfloat16=amp_use_bfloat16,
        )
        if amp:
            self.grad_scaler = ShardedGradScaler()

    def _construct_model(self, model: Union[Model, Lazy[Model]]) -> Model:
        if isinstance(model, Lazy):
            model = model.construct()
        if not self.fsdp_config.move_params_to_cpu:
            model.to(self.train_config.worker_local_default_device)
        return FSDP(model, **self.fsdp_config.as_kwargs())

    def clip_grad_norm(self) -> None:
        if self.max_grad_norm is not None:
            self.model.clip_grad_norm_(self.max_grad_norm)  # type: ignore

    def get_model_state(self) -> Dict[str, torch.Tensor]:
        return {
            "weights": self.model.local_state_dict(),  # type: ignore
            "metadata": self.model.local_metadata_dict(),  # type: ignore
        }

    def load_model_state(self, state_dict: Dict[str, torch.Tensor]) -> None:
        self.model.load_local_state_dict(state_dict["weights"])  # type: ignore

    def save_complete_weights_from_checkpoint(
        self, checkpoint_dir: Path, weights_path: Path
    ) -> None:
        self.logger.info("Consolidating sharded checkpoint weights...")
        sharded_weights: List[Dict[str, torch.Tensor]] = []
        sharded_metadata: List[Dict[str, Any]] = []
        for path in checkpoint_dir.resolve().glob("worker*_model.pt"):
            sharded_state = torch.load(path, map_location="cpu")
            sharded_weights.append(sharded_state["weights"])
            sharded_metadata.append(sharded_state["metadata"])
        full_state = FSDP.consolidate_shard_weights(sharded_weights, sharded_metadata)
        del sharded_weights
        del sharded_metadata
        torch.save(full_state, weights_path)