Source code for tango.integrations.fairscale.module_wrapper

import re
from typing import Optional, Set

import torch
import torch.nn as nn
from fairscale.nn.checkpoint import checkpoint_wrapper

from tango.integrations.torch import Model

from .fsdp_config import FSDPConfig


@Model.register("fairscale::with_wrapped_modules")  # type: ignore[arg-type]
def with_wrapped_modules(
    model: Model,
    modules_to_wrap: Set[str],
    fsdp_config: Optional[FSDPConfig] = None,
    activation_checkpointing: bool = False,
) -> Model:
    """
    A :class:`~tango.integrations.torch.Model` wrapper that can be used to easily wrap
    inner modules of a model with FairScale's :class:`~fairscale.nn.FullyShardedDataParallel`
    wrapper and/or :class:`~fairscale.nn.checkpoint.checkpoint_wrapper`.

    .. tip::
        Registered as a :class:`~tango.integrations.torch.Model` constructor under the name
        "fairscale::with_wrapped_modules".

    .. important::
        This is meant to be used with the :class:`FairScaleTrainingEngine`.

    :param model:
        The model to wrap.
    :param modules_to_wrap:
        The names of the submodules to wrap. These can be regular expressions.
    :param fsdp_config:
        The ``FullyShardedDataParallel`` configuration to use when wrapping the modules.
        If not specified, the modules will NOT be wrapped with FSDP.
    :param activation_checkpointing:
        Whether to wrap the modules with FairScale's
        :class:`~fairscale.nn.checkpoint.checkpoint_wrapper`.

    Examples
    --------

    You can use this as a :class:`~tango.integrations.torch.Model` constructor from a
    config/params like this:

    .. testcode::

        import torch.nn as nn

        from tango.integrations.torch import Model


        class FeedForward(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(4, 4)
                self.activation = nn.ReLU()

            def forward(self, x):
                return self.activation(self.linear(x))


        @Model.register("simple_regression_model")
        class SimpleRegressionModel(Model):
            def __init__(self):
                super().__init__()
                self.blocks = nn.Sequential(*[FeedForward() for _ in range(3)])
                self.regression_head = nn.Linear(4, 1)
                self.loss_fcn = nn.MSELoss()

            def forward(self, x, y):
                output = self.blocks(x)
                output = self.regression_head(output)
                loss = self.loss_fcn(output, y)
                return {"loss": loss}


        model = Model.from_params({
            "type": "fairscale::with_wrapped_modules",
            "model": {
                "type": "simple_regression_model",
            },
            "modules_to_wrap": [r"blocks\\.[0-9]+", "regression_head"],
            "activation_checkpointing": True,
        })

    """

    def wrap_module(
        module: nn.Module,
    ) -> nn.Module:
        # Apply activation checkpointing first, then (optionally) FSDP. FSDP wrapping is
        # only applied when a distributed process group has been initialized.
        if activation_checkpointing:
            module = checkpoint_wrapper(module, offload_to_cpu=True)
        if fsdp_config is not None and torch.distributed.is_initialized():
            module = fsdp_config.wrap(module)
        return module

    # Resolve the patterns in `modules_to_wrap` against the names of all submodules.
    all_module_names: Set[str] = set([name for name, _ in model.named_modules() if name])
    actual_modules_to_wrap: Set[str] = set()
    unmatched_patterns: Set[str] = modules_to_wrap.copy()
    for module_name in all_module_names:
        for pattern in modules_to_wrap:
            if re.fullmatch(pattern, module_name):
                actual_modules_to_wrap.add(module_name)
                if pattern in unmatched_patterns:
                    unmatched_patterns.remove(pattern)

    if unmatched_patterns:
        raise ValueError(
            f"Some patterns in 'modules_to_wrap' did not match actual module names ({unmatched_patterns})"
        )

    # Replace each matched submodule on its parent with the wrapped version, in place.
    for module_name in actual_modules_to_wrap:
        if "." in module_name:
            *parent_parts, module_name = module_name.split(".")
            parent_module = model.get_submodule(".".join(parent_parts))
        else:
            parent_module = model
        module = parent_module.get_submodule(module_name)
        module = wrap_module(module)
        parent_module.add_module(module_name, module)

    return model
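

# ----------------------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module above): calling
# ``with_wrapped_modules`` directly from Python rather than through ``Model.from_params``.
# ``TinyModel`` is a hypothetical stand-in for a real registered model. Only activation
# checkpointing is requested here, so the sketch runs without an initialized
# ``torch.distributed`` process group; passing an ``FSDPConfig`` as well would also shard
# the matched submodules once a process group is available.

import torch
import torch.nn as nn

from tango.integrations.torch import Model
from tango.integrations.fairscale.module_wrapper import with_wrapped_modules


class TinyModel(Model):
    def __init__(self):
        super().__init__()
        self.blocks = nn.Sequential(*[nn.Linear(4, 4) for _ in range(3)])
        self.head = nn.Linear(4, 1)

    def forward(self, x):
        return self.head(self.blocks(x))


model = with_wrapped_modules(
    TinyModel(),
    # Matches the submodules named "blocks.0" through "blocks.2", plus "head".
    modules_to_wrap={r"blocks\.[0-9]+", "head"},
    activation_checkpointing=True,
)

# Forward/backward now runs the matched submodules with activation checkpointing.
x = torch.randn(2, 4, requires_grad=True)
model(x).sum().backward()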