Revert "Improving Hydra+DDP support (#11617)"

This reverts commit 45ca781.
Lightning-AI · Nov 18, 2022 · eaed66d · eaed66d
1 parent 609b258
commit eaed66d
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 171 deletions.
diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py
@@ -22,7 +22,6 @@
 import pytorch_lightning as pl
 from lightning_lite.plugins import ClusterEnvironment
 from lightning_lite.strategies.launchers.base import _Launcher
-from lightning_lite.strategies.launchers.subprocess_script import _basic_subprocess_cmd, _hydra_subprocess_cmd
 
 _HYDRA_AVAILABLE = RequirementCache("hydra-core")
 
@@ -100,6 +99,32 @@ def _call_children_scripts(self) -> None:
         # allow the user to pass the node rank
         os.environ["NODE_RANK"] = str(self.cluster_environment.node_rank())
         os.environ["LOCAL_RANK"] = str(self.cluster_environment.local_rank())
+
+        # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
+        # See https://docs.python.org/3/reference/import.html#main-spec
+        if __main__.__spec__ is None:  # pragma: no-cover
+            # Script called as `python a/b/c.py`
+            if _HYDRA_AVAILABLE:
+                # when user is using hydra find the absolute path
+                from hydra.utils import to_absolute_path
+
+                to_abs_path = to_absolute_path
+            else:
+                to_abs_path = os.path.abspath
+
+            # pull out the commands used to run the script and resolve the absolute file path
+            command = sys.argv
+            try:
+                full_path = to_abs_path(command[0])
+            except Exception:
+                full_path = os.path.abspath(command[0])
+
+            command[0] = full_path
+            # use the same python interpreter and actually running
+            command = [sys.executable] + command
+        else:  # Script called as `python -m a.b.c`
+            command = [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]
+
         os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"
 
         for local_rank in range(1, self.num_processes):
@@ -110,18 +135,18 @@ def _call_children_scripts(self) -> None:
             if os.environ.get("PL_GLOBAL_SEED") is None and "PL_GLOBAL_SEED" in env_copy:
                 del env_copy["PL_GLOBAL_SEED"]
 
-            hydra_in_use = False
+            # start process
+            # if hydra is available and initialized, make sure to set the cwd correctly
+            cwd: Optional[str] = None
             if _HYDRA_AVAILABLE:
                 from hydra.core.hydra_config import HydraConfig
+                from hydra.utils import get_original_cwd
 
-                hydra_in_use = HydraConfig.initialized()
-
-            if hydra_in_use:
-                command = _hydra_subprocess_cmd(local_rank)
-            else:
-                command = _basic_subprocess_cmd()
-
-            subprocess.Popen(command, env=env_copy)
+                if HydraConfig.initialized():
+                    cwd = get_original_cwd()
+                    os_cwd = f'"{os.getcwd()}"'
+                    command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
+            subprocess.Popen(command, env=env_copy, cwd=cwd)
 
             # starting all processes at once can cause issues
             # with dataloaders delay between 1-10 seconds

diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py