From 5c844a760ee509eb51189719b9f93cfc89a8cbd0 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 5 Oct 2024 19:36:02 -0600 Subject: [PATCH] Protect against the envar version of the Slurm custom args param Protect against the envar version of the Slurm custom args MCA param. This is an unfortunate hack that hopefully will eventually go away. See both of the following for detailed explanations and discussion: https://github.com/openpmix/prrte/issues/1974 https://github.com/open-mpi/ompi/issues/12471 Orgs/users wanting to add custom args to the internal "srun" command used to spawn the PRRTE daemons must do so via the default MCA param files (system or user), or via the prterun (or its proxy) cmd line Signed-off-by: Ralph Castain (from upstream commit 28432ed70f1dec5be4889af39f1a5c4947b9fc32) --- src/mca/plm/slurm/plm_slurm.h | 3 +- src/mca/plm/slurm/plm_slurm_component.c | 74 +++++++++++-------------- src/mca/plm/slurm/plm_slurm_module.c | 7 ++- src/runtime/prte_init.c | 18 ++++++ 4 files changed, 58 insertions(+), 44 deletions(-) diff --git a/src/mca/plm/slurm/plm_slurm.h b/src/mca/plm/slurm/plm_slurm.h index d654a48e4d..b357d4b7ef 100644 --- a/src/mca/plm/slurm/plm_slurm.h +++ b/src/mca/plm/slurm/plm_slurm.h @@ -33,9 +33,8 @@ BEGIN_C_DECLS struct prte_mca_plm_slurm_component_t { prte_plm_base_component_t super; - int custom_args_index; char *custom_args; - bool slurm_warning_msg; + bool early; }; typedef struct prte_mca_plm_slurm_component_t prte_mca_plm_slurm_component_t; diff --git a/src/mca/plm/slurm/plm_slurm_component.c b/src/mca/plm/slurm/plm_slurm_component.c index 936386bbb6..81cf74e59e 100644 --- a/src/mca/plm/slurm/plm_slurm_component.c +++ b/src/mca/plm/slurm/plm_slurm_component.c @@ -38,6 +38,7 @@ #include "src/util/name_fns.h" #include "src/util/pmix_environ.h" #include "src/util/pmix_show_help.h" +#include "src/util/pmix_string_copy.h" #include "plm_slurm.h" #include "src/mca/plm/base/plm_private.h" @@ -84,28 +85,15 @@ prte_mca_plm_slurm_component_t prte_mca_plm_slurm_component = { here; will be initialized in plm_slurm_open() */ }; -static char *custom_args = NULL; -static char *force_args = NULL; - static int plm_slurm_register(void) { pmix_mca_base_component_t *comp = &prte_mca_plm_slurm_component.super; - prte_mca_plm_slurm_component.custom_args_index = - pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &custom_args); - - force_args = NULL; - (void) pmix_mca_base_component_var_register(comp, "force_args", "Mandatory custom arguments to srun", - PMIX_MCA_BASE_VAR_TYPE_STRING, - &force_args); - - prte_mca_plm_slurm_component.slurm_warning_msg = false; - (void) pmix_mca_base_component_var_register(comp, "disable_warning", "Turn off warning message about custom args set in environment", - PMIX_MCA_BASE_VAR_TYPE_BOOL, - &prte_mca_plm_slurm_component.slurm_warning_msg); + prte_mca_plm_slurm_component.custom_args = NULL; + pmix_mca_base_component_var_register(comp, "args", "Custom arguments to srun", + PMIX_MCA_BASE_VAR_TYPE_STRING, + &prte_mca_plm_slurm_component.custom_args); return PRTE_SUCCESS; } @@ -117,11 +105,11 @@ static int plm_slurm_open(void) static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, int *priority) { - const pmix_mca_base_var_t *var; - pmix_status_t rc; + FILE *fp; + char version[1024], *ptr; + int major, minor; /* Are we running under a SLURM job? */ - if (NULL != getenv("SLURM_JOBID")) { *priority = 75; @@ -129,28 +117,32 @@ static int prte_mca_plm_slurm_component_query(pmix_mca_base_module_t **module, i "%s plm:slurm: available for selection", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - prte_mca_plm_slurm_component.custom_args = NULL; - - // if we were are warning about externally set custom args, then - // check to see if that was done - if (!prte_mca_plm_slurm_component.slurm_warning_msg && - NULL == force_args) { - // check for custom args - rc = pmix_mca_base_var_get(prte_mca_plm_slurm_component.custom_args_index, &var); - if (PMIX_SUCCESS == rc) { - // the variable was set - see who set it - if (PMIX_MCA_BASE_VAR_SOURCE_ENV == var->mbv_source) { - // set in the environment - warn - pmix_show_help("help-plm-slurm.txt", "custom-args-in-env", true, - custom_args); - } - } + // check the version + fp = popen("srun --version", "r"); + if (NULL == fp) { + // cannot run srun, so we cannot support this job + *module = NULL; + return PRTE_ERROR; } - - if (NULL != force_args) { - prte_mca_plm_slurm_component.custom_args = force_args; - } else if (NULL != custom_args) { - prte_mca_plm_slurm_component.custom_args = custom_args; + if (NULL == fgets(version, sizeof(version), fp)) { + pclose(fp); + *module = NULL; + return PRTE_ERROR; + } + pclose(fp); + // parse on the dots + major = strtol(&version[6], &ptr, 10); + ++ptr; + minor = strtol(ptr, NULL, 10); + + if (23 > major) { + prte_mca_plm_slurm_component.early = true; + } else if (23 < major) { + prte_mca_plm_slurm_component.early = false; + } else if (11 > minor) { + prte_mca_plm_slurm_component.early = true; + } else { + prte_mca_plm_slurm_component.early = false; } *module = (pmix_mca_base_module_t *) &prte_plm_slurm_module; diff --git a/src/mca/plm/slurm/plm_slurm_module.c b/src/mca/plm/slurm/plm_slurm_module.c index 2edae0312a..8c9a04ec27 100644 --- a/src/mca/plm/slurm/plm_slurm_module.c +++ b/src/mca/plm/slurm/plm_slurm_module.c @@ -15,7 +15,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -253,6 +253,11 @@ static void launch_daemons(int fd, short args, void *cbdata) /* add the srun command */ pmix_argv_append(&argc, &argv, "srun"); + // add the external launcher flag if necessary + if (!prte_mca_plm_slurm_component.early) { + pmix_argv_append(&argc, &argv, "--external-launcher"); + } + /* start one orted on each node */ pmix_argv_append(&argc, &argv, "--ntasks-per-node=1"); diff --git a/src/runtime/prte_init.c b/src/runtime/prte_init.c index db7022cfb1..5e7a234f63 100644 --- a/src/runtime/prte_init.c +++ b/src/runtime/prte_init.c @@ -179,6 +179,24 @@ int prte_init_minimum(void) return PRTE_ERR_SILENT; } + /* Protect against the envar version of the Slurm + * custom args MCA param. This is an unfortunate + * hack that hopefully will eventually go away. + * See both of the following for detailed + * explanations and discussion: + * + * https://github.com/openpmix/prrte/issues/1974 + * https://github.com/open-mpi/ompi/issues/12471 + * + * Orgs/users wanting to add custom args to the + * internal "srun" command used to spawn the + * PRRTE daemons must do so via the default MCA + * param files (system or user), or via the + * prterun (or its proxy) cmd line + */ + unsetenv("PRTE_MCA_plm_slurm_args"); + unsetenv("OMPI_MCA_plm_slurm_args"); + /* carry across the toolname */ pmix_tool_basename = prte_tool_basename;