# Source code for ``tango.integrations.fairscale.fsdp_config``
from dataclasses import asdict, dataclass
from typing import Any, Dict, Optional

import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

from tango.common import FromParams


@dataclass
class FSDPConfig(FromParams):
    """
    Defines all of the configurable options for FairScale's
    :class:`~fairscale.nn.FullyShardedDataParallel`.

    .. seealso::
        `Best practices for FullyShardedDataParallel
        <https://fairscale.readthedocs.io/en/latest/deep_dive/oss_sdp_fsdp.html#best-practices-for-fairscale-nn-fullyshardeddataparallel>`_
        from the FairScale docs.

    """  # noqa: E501

    reshard_after_forward: bool = True
    """
    See the docstring for :class:`~fairscale.nn.FullyShardedDataParallel`.
    """

    move_params_to_cpu: bool = False
    """
    See the docstring for :class:`~fairscale.nn.FullyShardedDataParallel`.
    """

    move_grads_to_cpu: Optional[bool] = None
    """
    See the docstring for :class:`~fairscale.nn.FullyShardedDataParallel`.

    .. seealso::
        :data:`move_params_to_cpu`

    .. warning::
        At the moment we recommend that you don't mess with this parameter, or that you only
        explicitly set it to the same value as :data:`move_params_to_cpu`. If you leave it as
        ``None`` (the default), FairScale will automatically set it to match
        :data:`move_params_to_cpu`. Currently training seems to crash if you set this to
        ``False`` while :data:`move_params_to_cpu` is ``True``. We're tracking
        `fairscale#918 <https://github.com/facebookresearch/fairscale/issues/918>`_,
        which may be related.
    """

    mixed_precision: bool = False
    """
    See the docstring for :class:`~fairscale.nn.FullyShardedDataParallel`.

    .. important::
        We recommend setting this to the same value as the ``amp`` parameter in
        :class:`FairScaleTrainingEngine`.

        Based on our experiments, if you're training with AMP enabled (``amp=True``),
        setting this to ``True`` may give you a small additional speedup in training time
        and a small additional decrease in GPU memory utilization, without any penalty to
        convergence. But if you're *not* training with AMP, setting this to ``True`` could
        hurt the model's ability to converge.
    """

    def as_kwargs(self) -> Dict[str, Any]:
        """
        Convert to the appropriate ``kwargs`` for :class:`~fairscale.nn.FullyShardedDataParallel`.
        """
        return asdict(self)
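
    # Illustrative only: with all defaults, ``FSDPConfig().as_kwargs()`` produces
    #   {"reshard_after_forward": True, "move_params_to_cpu": False,
    #    "move_grads_to_cpu": None, "mixed_precision": False},
    # which can be passed directly to ``FullyShardedDataParallel``.
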
    def wrap(self, module: torch.nn.Module):
        """
        A convenience method for wrapping a module in :class:`~fairscale.nn.FullyShardedDataParallel`
        with all of the options defined in this class.

        .. seealso::
            Internally this is what :func:`with_wrapped_modules()` calls.
        """
        return FSDP(module, **self.as_kwargs())
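

# A minimal usage sketch (illustrative only, not part of this module's API). It assumes
# a ``torch.distributed`` process group has already been initialized, and uses
# ``torch.nn.Linear`` as a stand-in for a real model:
#
#     config = FSDPConfig(reshard_after_forward=True, mixed_precision=False)
#     sharded_module = config.wrap(torch.nn.Linear(16, 16))
#     # ...equivalent to FSDP(torch.nn.Linear(16, 16), **config.as_kwargs())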