Replace unwrapping logic in strategies (Lightning-AI#13738)
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
3 people authored and jessecambon committed Aug 16, 2022
1 parent 2789188 commit 2cfde9c
Showing 26 changed files with 274 additions and 159 deletions.
10 changes: 10 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
@@ -28,6 +28,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([#13967](https://github.com/Lightning-AI/lightning/pull/13967))


- Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738))


### Deprecated

- Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
@@ -39,6 +42,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Deprecated the calls to `pytorch_lightning.utilities.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868))


- Deprecated the `unwrap_lightning_module` and `unwrap_lightning_module_sharded` utility functions in favor of accessing the unwrapped `LightningModule` on the strategy directly ([#13738](https://github.com/Lightning-AI/lightning/pull/13738))


- Deprecated the `pl_module` argument in `LightningParallelModule`, `LightningDistributedModule`, `LightningShardedDataParallel`, `LightningBaguaModule` and `LightningDeepSpeedModule` wrapper classes ([#13738](https://github.com/Lightning-AI/lightning/pull/13738))



### Removed

- Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011))
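
The changelog entries above describe the replacement in prose; the sketch below illustrates the intended access pattern. It is a minimal, hedged example, not part of the commit: `TinyModule` is a made-up placeholder, and the manual `strategy.connect(...)` call stands in for the wiring that `trainer.fit(...)` normally performs, assuming pytorch-lightning at roughly this commit (1.8 development).

```python
import torch
import pytorch_lightning as pl


class TinyModule(pl.LightningModule):
    """Minimal placeholder LightningModule used only for illustration."""

    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(2, 2)


trainer = pl.Trainer(accelerator="cpu", devices=1, logger=False)
model = TinyModule()
trainer.strategy.connect(model)  # normally done internally by `trainer.fit(...)`

# New pattern: the strategy holds a direct reference to the unwrapped LightningModule,
# so callers no longer unwrap `strategy.model` with `unwrap_lightning_module`.
assert trainer.strategy.lightning_module is model
```

The point of the change is that callers no longer need to know which wrapper (DDP, DataParallel, sharded, precision) currently holds the module; the strategy keeps the reference itself.
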
78 changes: 61 additions & 17 deletions src/pytorch_lightning/overrides/base.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Union
from typing import Any, Optional, Union

import torch
import torch.nn as nn
@@ -20,6 +20,7 @@

import pytorch_lightning as pl
from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin
from pytorch_lightning.utilities import rank_zero_deprecation


class _LightningPrecisionModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module):
@@ -54,30 +55,47 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:


class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module):
def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None:
def __init__(
self, forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]]
) -> None:
"""Wraps the user's LightningModule and redirects the forward call to the appropriate method, either
``training_step``, ``validation_step``, ``test_step``, or ``predict_step``.
Inheriting classes may also modify the inputs or outputs of forward.
Args:
pl_module: the model to wrap
forward_module: The module to wrap. If it's not a LightningModule, it must have an attribute ``.module``
pointing to a LightningModule reference.
"""
super().__init__()
self.module = pl_module
if not isinstance(forward_module, pl.LightningModule) and (
not isinstance(getattr(forward_module, "module", None), pl.LightningModule)
):
raise ValueError(
"`forward_module` must be a `LightningModule` instance or have an attribute `.module` pointing to one,"
f" got: {forward_module.__class__.__qualname__}"
)
# TODO: In v1.10.0, remove the Optional type from forward_module and remove the assertion
assert forward_module is not None
self._forward_module = forward_module

# set the parameters_to_ignore from LightningModule.
_ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", [])
_ddp_params_and_buffers_to_ignore = getattr(self._forward_module, "_ddp_params_and_buffers_to_ignore", [])
self._ddp_params_and_buffers_to_ignore = [f"module.{p}" for p in _ddp_params_and_buffers_to_ignore]

@property
def lightning_module(self) -> "pl.LightningModule":
if isinstance(self._forward_module, pl.LightningModule):
return self._forward_module
return self._forward_module.module

def forward(self, *inputs: Any, **kwargs: Any) -> Any:
pl_module = unwrap_lightning_module(self.module)
pl_module = self.lightning_module
trainer = pl_module._trainer

if trainer is not None:
assert isinstance(self.module, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
if trainer.training:
output = self.module.training_step(*inputs, **kwargs)
output = self._forward_module.training_step(*inputs, **kwargs)
# In manual_optimization, we need to prevent DDP reducer as
# it is done manually in `LightningModule.manual_backward`
# `require_backward_grad_sync` will be reset in the
@@ -86,27 +104,53 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any:
trainer.model.require_backward_grad_sync = False # type: ignore[assignment]
return output
if trainer.testing:
return self.module.test_step(*inputs, **kwargs)
return self._forward_module.test_step(*inputs, **kwargs)
if trainer.sanity_checking or trainer.validating:
return self.module.validation_step(*inputs, **kwargs)
return self._forward_module.validation_step(*inputs, **kwargs)
if trainer.predicting:
return self.module.predict_step(*inputs, **kwargs)
return self.module(*inputs, **kwargs)


def unwrap_lightning_module(wrapped_model: nn.Module) -> "pl.LightningModule":
return self._forward_module.predict_step(*inputs, **kwargs)
return self._forward_module(*inputs, **kwargs)

@classmethod
def _validate_init_arguments(
cls,
pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
) -> None:
# TODO: In v1.10.0, remove this method and mark the forward_module init argument in all subclasses as required
if pl_module is not None:
rank_zero_deprecation(
f"The argument `pl_module` in `{cls.__name__}` is deprecated in v1.8.0 and will be removed in"
" v1.10.0. Please use `forward_module` instead."
)
elif forward_module is None:
raise ValueError("Argument `forward_module` is required.")


def unwrap_lightning_module(wrapped_model: nn.Module, _suppress_warning: bool = False) -> "pl.LightningModule":
"""Recursively unwraps a :class:`~pytorch_lightning.core.module.LightningModule` by following the ``.module``
attributes on the wrapper.
.. deprecated:: v1.8.0
The function ``unwrap_lightning_module`` is deprecated in v1.8.0 and will be removed in v1.10.0. Access the
``LightningModule`` directly through the strategy attribute ``Strategy.lightning_module``.
Raises:
TypeError: If the unwrapping leads to a module that is not a LightningModule and that cannot be unwrapped
further.
"""
if not _suppress_warning:
rank_zero_deprecation(
"The function `unwrap_lightning_module` is deprecated in v1.8.0 and will be removed in v1.10.0. Access the"
" `LightningModule` directly through the strategy attribute `Strategy.lightning_module`."
)
model = wrapped_model
if isinstance(model, (DistributedDataParallel, DataParallel)):
model = unwrap_lightning_module(model.module)
if isinstance(model, (_LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase)):
model = unwrap_lightning_module(model.module)
if isinstance(model, _LightningModuleWrapperBase):
model = model.lightning_module
if isinstance(model, _LightningPrecisionModuleWrapperBase):
model = model.module
if not isinstance(model, pl.LightningModule):
raise TypeError(f"Unwrapping the module did not yield a `LightningModule`, got {type(model)} instead.")
return model
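
To make the rewritten wrapper concrete, here is a small hedged sketch built only from the code shown above. `TinyModule` is a made-up placeholder, and `_LightningModuleWrapperBase` is an internal class used here purely to demonstrate the new `forward_module` argument, the `lightning_module` property, and the deprecation warning now emitted by `unwrap_lightning_module`.

```python
import torch
import pytorch_lightning as pl
from pytorch_lightning.overrides.base import (
    _LightningModuleWrapperBase,
    unwrap_lightning_module,  # deprecated in v1.8.0, scheduled for removal in v1.10.0
)


class TinyModule(pl.LightningModule):
    """Minimal placeholder LightningModule for illustration."""

    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(2, 2)


wrapped = _LightningModuleWrapperBase(forward_module=TinyModule())

# New: the wrapper exposes the unwrapped LightningModule directly.
assert isinstance(wrapped.lightning_module, TinyModule)

# Old: the utility function still works, but now emits a deprecation warning
# pointing users to `Strategy.lightning_module`.
assert unwrap_lightning_module(wrapped) is wrapped.lightning_module
```
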
24 changes: 18 additions & 6 deletions src/pytorch_lightning/overrides/data_parallel.py
@@ -13,7 +13,7 @@
# limitations under the License.
import numbers
import warnings
from typing import Any, cast, Union
from typing import Any, cast, Optional, Union

import torch
from torch import Tensor
@@ -52,11 +52,23 @@ class LightningParallelModule(_LightningModuleWrapperBase):
)
Args:
pl_module: the model to wrap
pl_module: The module to wrap. See description for `forward_module`.
.. deprecated:: v1.8.0
The argument ``pl_module`` is deprecated in v1.8.0 and will be removed in v1.10.0. Please use
``forward_module`` instead.
forward_module: The module to wrap. If it's not a ``LightningModule``, it must have an attribute ``.module``
pointing to a ``LightningModule`` reference.
"""

def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None:
super().__init__(pl_module)
def __init__(
self,
forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
) -> None:
self._validate_init_arguments(pl_module, forward_module)
super().__init__(forward_module=(pl_module or forward_module))
_ignore_scalar_return_in_dp()

def forward(self, *inputs: Any, **kwargs: Any) -> Any:
@@ -65,7 +77,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any:
output = super().forward(*inputs, **kwargs)

def output_transform(data: Any) -> Any:
device = cast(torch.device, self.module.device)
device = cast(torch.device, self.lightning_module.device)
data = python_scalar_to_tensor(data, device)
data = unsqueeze_scalar_tensor(data)
return data
@@ -95,7 +107,7 @@ def find_tensor_with_device(tensor: Tensor) -> Tensor:

if replica_device is not None:
# by calling .to() we force the update to the self.device property
self.module.to(device=replica_device)
self._forward_module.to(device=replica_device)
else:
rank_zero_warn(
"Could not determine on which device the inputs are."
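
A hedged sketch of the argument deprecation shown above for `LightningParallelModule`: passing the module as `pl_module` still works but emits a deprecation warning, while `forward_module` is the replacement. `TinyModule` is again a made-up placeholder, not part of the commit.

```python
import torch
import pytorch_lightning as pl
from pytorch_lightning.overrides.data_parallel import LightningParallelModule


class TinyModule(pl.LightningModule):
    """Minimal placeholder LightningModule for illustration."""

    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(2, 2)


model = TinyModule()

# Deprecated since v1.8.0: passing the module via `pl_module` emits a deprecation warning.
wrapper_old = LightningParallelModule(pl_module=model)

# Replacement: pass the module via `forward_module`.
wrapper_new = LightningParallelModule(forward_module=model)

# Both wrappers resolve to the same unwrapped LightningModule.
assert wrapper_old.lightning_module is wrapper_new.lightning_module is model
```
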
11 changes: 9 additions & 2 deletions src/pytorch_lightning/overrides/distributed.py
@@ -19,12 +19,19 @@
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import BatchSampler, Dataset, DistributedSampler, Sampler

from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
import pytorch_lightning as pl
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class LightningDistributedModule(_LightningModuleWrapperBase):
...
def __init__(
self,
forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
) -> None:
self._validate_init_arguments(pl_module, forward_module)
super().__init__(forward_module=(pl_module or forward_module))


def _find_tensors(
29 changes: 23 additions & 6 deletions src/pytorch_lightning/overrides/fairscale.py
@@ -11,27 +11,44 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union

import torch.nn as nn

import pytorch_lightning as pl
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module
from pytorch_lightning.utilities import _IS_WINDOWS, _module_available
from pytorch_lightning.overrides.base import (
_LightningModuleWrapperBase,
_LightningPrecisionModuleWrapperBase,
unwrap_lightning_module,
)
from pytorch_lightning.utilities import rank_zero_deprecation
from pytorch_lightning.utilities.imports import _IS_WINDOWS, _module_available

_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn")

if _FAIRSCALE_AVAILABLE:

if _FAIRSCALE_AVAILABLE: # pragma: no-cover
from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel

class LightningShardedDataParallel(_LightningModuleWrapperBase):
# Just do this for later docstrings
pass
def __init__(
self,
forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
) -> None:
self._validate_init_arguments(pl_module, forward_module)
super().__init__(forward_module=(pl_module or forward_module))

def unwrap_lightning_module_sharded(wrapped_model: nn.Module) -> "pl.LightningModule":
rank_zero_deprecation(
"The function `unwrap_lightning_module_sharded` is deprecated in v1.8.0 and will be removed in v1.10.0."
" Access the `LightningModule` directly through the strategy attribute `Strategy.lightning_module`."
)
model = wrapped_model
if isinstance(model, ShardedDataParallel):
model = model.module

return unwrap_lightning_module(model)
return unwrap_lightning_module(model, _suppress_warning=True)

else:
LightningShardedDataParallel = ... # type: ignore[assignment,misc]
@@ -15,9 +15,9 @@

import torch

from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE

if _FAIRSCALE_AVAILABLE:
from fairscale.optim import OSS
27 changes: 11 additions & 16 deletions src/pytorch_lightning/strategies/bagua.py
@@ -7,11 +7,7 @@
from torch.nn import Module

import pytorch_lightning as pl
from pytorch_lightning.overrides.base import (
_LightningModuleWrapperBase,
_LightningPrecisionModuleWrapperBase,
unwrap_lightning_module,
)
from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
@@ -54,10 +50,16 @@


class LightningBaguaModule(_LightningModuleWrapperBase):
def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None:
super().__init__(pl_module)
def __init__(
self,
forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None,
) -> None:
self._validate_init_arguments(pl_module, forward_module)
forward_module = pl_module or forward_module
super().__init__(forward_module=forward_module)
# Bagua use `bagua_module_name` to distinguish different modules
self._bagua_module_name = f"{pl_module.__class__.__name__}{id(pl_module)}"
self._bagua_module_name = f"{forward_module.__class__.__name__}{id(forward_module)}"


class BaguaStrategy(DDPStrategy):
@@ -109,13 +111,6 @@ def __init__(
self._bagua_flatten = flatten
self._bagua_kwargs = bagua_kwargs

@property
def lightning_module(self) -> Optional["pl.LightningModule"]:
model = self.model
if isinstance(model, BaguaDistributedDataParallel):
model = model.module
return unwrap_lightning_module(model) if model is not None else None

def setup_distributed(self) -> None:
reset_seed()

Expand Down Expand Up @@ -190,7 +185,7 @@ def _check_qadam_optimizer(self) -> None:

def _configure_bagua_model(self, trainer: "pl.Trainer") -> None:
model = LightningBaguaModule(self.model) # type: ignore[arg-type]
self._model = self._setup_model(model)
self.model = self._setup_model(model)

# start the background communication for async algorithm
if trainer.training and self._bagua_algorithm == "async":
8 changes: 6 additions & 2 deletions src/pytorch_lightning/strategies/ddp.py
@@ -34,7 +34,6 @@
from pytorch_lightning.overrides import LightningDistributedModule
from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase
from pytorch_lightning.overrides.distributed import prepare_for_backward
from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
@@ -55,7 +54,12 @@
sync_ddp_if_available,
)
from pytorch_lightning.utilities.exceptions import DeadlockDetectedException
from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11
from pytorch_lightning.utilities.imports import (
_FAIRSCALE_AVAILABLE,
_IS_WINDOWS,
_TORCH_GREATER_EQUAL_1_10,
_TORCH_GREATER_EQUAL_1_11,
)
from pytorch_lightning.utilities.optimizer import optimizers_to_device
from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.seed import reset_seed