From bb07914c55a79c9fdc183fffbd9d9896ad996b7a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 29 Sep 2022 16:06:33 -0700 Subject: [PATCH 1/3] Fix print statement Remove extra format argument Signed-off-by: Ralph Castain (cherry picked from commit 146441f0c57edbf877cf5e8cbe0bb7bf4a006536) --- src/runtime/data_type_support/prte_dt_print_fns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/data_type_support/prte_dt_print_fns.c b/src/runtime/data_type_support/prte_dt_print_fns.c index 2a2ec897a3..f6c293450a 100644 --- a/src/runtime/data_type_support/prte_dt_print_fns.c +++ b/src/runtime/data_type_support/prte_dt_print_fns.c @@ -53,7 +53,7 @@ void prte_job_print(char **output, prte_job_t *src) tmp2 = pmix_argv_join(src->personality, ','); pmix_asprintf(&tmp, - "\nData for job: %s\tPersonality: %s\tRecovery: %s(%s)\n\tNum apps: %ld\tStdin " + "\nData for job: %s\tPersonality: %s\tRecovery: %s\n\tNum apps: %ld\tStdin " "target: %s\tState: %s\tAbort: %s", PRTE_JOBID_PRINT(src->nspace), tmp2, (prte_get_attribute(&src->attributes, PRTE_JOB_RECOVERABLE, NULL, PMIX_BOOL)) ? "ENABLED" : "DISABLED", From 6b0c9c9caed4ec04c3111289fca9e1b72a3977b8 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 29 Sep 2022 17:10:03 -0700 Subject: [PATCH 2/3] Push launch-agent CLI into the env Push any provided launch-agent CLI into the env as an MCA param for pickup by the plm framework. 
Signed-off-by: Ralph Castain (cherry picked from commit c274b7b05f3a1a587d77d8b9b252f105462715ed) --- src/tools/prte/prte.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index aea0404388..1420af3fa6 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -464,6 +464,12 @@ int main(int argc, char *argv[]) prte_pmix_server_globals.report_uri = strdup(opt->values[0]); } + /* if we were given a launch agent, set the MCA param for it */ + opt = pmix_cmd_line_get_param(&results, PRTE_CLI_LAUNCH_AGENT); + if (NULL != opt) { + setenv("PRTE_MCA_prte_launch_agent", opt->values[0], true); // cmd line overrides all + } + /* if we are supporting a singleton, push its ID into the environ * so it can get picked up and registered by server init */ opt = pmix_cmd_line_get_param(&results, PRTE_CLI_SINGLETON); From 160c7b3255b0790f2114de5e838b1454a40abe77 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 29 Sep 2022 20:11:20 -0700 Subject: [PATCH 3/3] Fix mapping by pe-list when oversubscribed Do a second pass to complete placement of remaining procs. Ensure that we correctly bind the remainder to all CPUs in the list. Make the error message when unable to place all requested procs a little clearer. 
Signed-off-by: Ralph Castain (cherry picked from commit afe6a4570eda752399aaf9cfed040bcc575596ec) --- src/mca/rmaps/base/help-prte-rmaps-base.txt | 1 + src/mca/rmaps/base/rmaps_base_binding.c | 5 +- src/mca/rmaps/base/rmaps_base_map_job.c | 6 +- src/mca/rmaps/base/rmaps_base_support_fns.c | 10 ++- .../rmaps/round_robin/help-prte-rmaps-rr.txt | 39 +++++++++- src/mca/rmaps/round_robin/rmaps_rr_mappers.c | 74 +++++++++++++++++-- 6 files changed, 119 insertions(+), 16 deletions(-) diff --git a/src/mca/rmaps/base/help-prte-rmaps-base.txt b/src/mca/rmaps/base/help-prte-rmaps-base.txt index e6ebdefe35..694d9e54fa 100644 --- a/src/mca/rmaps/base/help-prte-rmaps-base.txt +++ b/src/mca/rmaps/base/help-prte-rmaps-base.txt @@ -75,6 +75,7 @@ no nodes were found or all the available nodes were already used. Note that since the -nolocal option was given no processes can be launched on the local node. +# [prte-rmaps-base:no-available-resources] No nodes are available for this job, either due to a failure to allocate nodes to the job, or allocated nodes being marked diff --git a/src/mca/rmaps/base/rmaps_base_binding.c b/src/mca/rmaps/base/rmaps_base_binding.c index 194516ca9d..2f10bf2053 100644 --- a/src/mca/rmaps/base/rmaps_base_binding.c +++ b/src/mca/rmaps/base/rmaps_base_binding.c @@ -192,6 +192,7 @@ static int bind_to_cpuset(prte_job_t *jdata, } else { type = HWLOC_OBJ_CORE; } + /* the CPU numbers would have been given to us based on the total * available CPUs on the machine. Thus, we cannot use the node->available * CPU set as we are removing CPUs for accounting purposes there. 
@@ -253,10 +254,6 @@ static int bind_to_cpuset(prte_job_t *jdata, hwloc_bitmap_andnot(node->available, node->available, obj->cpuset); #endif } - char *tmp; - hwloc_bitmap_list_asprintf(&tmp, node->available); - - free(tmp); return PRTE_SUCCESS; } diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index c725cec35e..18fd063982 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -488,7 +488,11 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) * override it */ if (!PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { did_map = false; - if (inherit) { + if (options.oversubscribe) { + /* if we are oversubscribing, then do not bind */ + jdata->map->binding = PRTE_BIND_TO_NONE; + did_map = true; + } else if (inherit) { if (NULL != parent) { jdata->map->binding = parent->map->binding; did_map = true; diff --git a/src/mca/rmaps/base/rmaps_base_support_fns.c b/src/mca/rmaps/base/rmaps_base_support_fns.c index 698123e237..8a2974a90f 100644 --- a/src/mca/rmaps/base/rmaps_base_support_fns.c +++ b/src/mca/rmaps/base/rmaps_base_support_fns.c @@ -739,7 +739,11 @@ bool prte_rmaps_base_check_avail(prte_job_t *jdata, } if (PRTE_BIND_TO_NONE == options->bind) { - options->target = NULL; + if (NULL != options->job_cpuset) { + options->target = hwloc_bitmap_dup(options->job_cpuset); + } else { + options->target = NULL; + } avail = true; goto done; } @@ -860,6 +864,10 @@ int prte_rmaps_base_check_oversubscribed(prte_job_t *jdata, */ PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); + if (options->oversubscribe) { + return PRTE_SUCCESS; + } + /* check for permission */ if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { /* if we weren't given a directive either way, then we will error out diff --git a/src/mca/rmaps/round_robin/help-prte-rmaps-rr.txt b/src/mca/rmaps/round_robin/help-prte-rmaps-rr.txt index 
f4a40a8adf..8da1604344 100644 --- a/src/mca/rmaps/round_robin/help-prte-rmaps-rr.txt +++ b/src/mca/rmaps/round_robin/help-prte-rmaps-rr.txt @@ -12,6 +12,7 @@ # All rights reserved. # Copyright (c) 2017-2020 Intel, Inc. All rights reserved. # Copyright (c) 2018-2020 Cisco Systems, Inc. All rights reserved +# Copyright (c) 2022 Nanook Consulting. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -25,12 +26,13 @@ RMAPS found multiple applications to be launched, with at least one that failed to specify the number of processes to execute. When specifying multiple applications, you must specify how many processes of each to launch via the -np argument. - +# [prte-rmaps-rr:per-node-and-too-many-procs] There are not enough nodes in your allocation to satisfy your request to launch %d processes on a per-node basis - only %d nodes were available. Either request fewer processes, or obtain a larger allocation. +# [prte-rmaps-rr:n-per-node-and-too-many-procs] There are not enough nodes in your allocation to satisfy your request to launch %d processes on a %d per-node basis - only %d nodes with a total of %d slots were available. @@ -40,7 +42,7 @@ Either request fewer processes, or obtain a larger allocation. There are not enough slots on the nodes in your allocation to satisfy your request to launch on a %d process-per-node basis - only %d slots/node were available. Either request fewer processes/node, or obtain a larger allocation. - +# [prte-rmaps-rr:no-np-and-user-map] You have specified a rank-to-node/slot mapping, but failed to provide the number of processes to be executed. For some reason, this information @@ -54,3 +56,36 @@ to meet the requested mapping. 
Application: %s Number of procs: %d Number of resources: %d +# +[prte-rmaps-rr:not-enough-cpus] +There are not enough slots available in the system or not enough +CPUs in the specified PE-LIST to map the number of processes requested +by the application: + + app: %s + Number of procs: %d + pe-list: %s + +Either request fewer procs for your application, make more slots +available for use, or expand the pe-list. + +A "slot" is the PRRTE term for an allocatable unit where we can +launch a process. The number of slots available is defined by the +environment in which PRRTE processes are run: + + 1. Hostfile, via "slots=N" clauses (N defaults to number of + processor cores if not provided) + 2. The --host command line parameter, via a ":N" suffix on the + hostname (N defaults to 1 if not provided) + 3. Resource manager (e.g., SLURM, PBS/Torque, LSF, etc.) + 4. If none of a hostfile, the --host command line parameter, or an + RM is present, PRRTE defaults to the number of processor cores + +In all the above cases, if you want PRRTE to default to the number +of hardware threads instead of the number of processor cores, use the +--use-hwthread-cpus option. + +Alternatively, you can use the --map-by :OVERSUBSCRIBE option to ignore +the number of available slots and size of the pe-list when placing the +processes. 
+# diff --git a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c index fc511a4e69..00bc4fbe9b 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -370,6 +370,10 @@ int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, prte_proc_t *proc; char **tmp; int ntomap; + bool second_pass = false; + int extra_procs_to_assign = 0, nxtra_nodes = 0; + float balance; + char *savecpuset = NULL; prte_binding_policy_t savebind = options->bind; pmix_output_verbose(2, prte_rmaps_base_framework.framework_output, @@ -397,7 +401,11 @@ int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, tmp = pmix_argv_split(options->cpuset, ','); ntomap = pmix_argv_count(tmp); pmix_argv_free(tmp); + if (NULL != options->cpuset) { + savecpuset = strdup(options->cpuset); + } +pass: PMIX_LIST_FOREACH_SAFE(node, nd, node_list, prte_node_t) { pmix_output_verbose(2, prte_rmaps_base_framework.framework_output, @@ -405,7 +413,15 @@ int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, prte_rmaps_base_get_cpuset(jdata, node, options); - if (options->ordered || !options->overload) { + if (second_pass) { + options->nprocs = extra_procs_to_assign; + if (0 < nxtra_nodes) { + --nxtra_nodes; + if (0 == nxtra_nodes) { + --extra_procs_to_assign; + } + } + } else if (options->ordered || !options->overload) { options->nprocs = ntomap; } else { /* assign a number of procs equal to the number of available slots */ @@ -467,25 +483,67 @@ int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, PMIX_RELEASE(proc); } if (nprocs_mapped == app->num_procs) { + if (NULL != options->target) { + hwloc_bitmap_free(options->target); + options->target = NULL; + } + if (NULL != options->job_cpuset) { + hwloc_bitmap_free(options->job_cpuset); + options->job_cpuset = NULL; + } + if (NULL != savecpuset) { + free(savecpuset); + } return PRTE_SUCCESS; } - if(NULL != options->target) - { 
+ if (NULL != options->target) { hwloc_bitmap_free(options->target); options->target = NULL; } + if (NULL != options->job_cpuset) { + hwloc_bitmap_free(options->job_cpuset); + options->job_cpuset = NULL; + } + } // next node + + /* second pass: if we haven't mapped everyone yet, it is + * because we are oversubscribed. All of the nodes that are + * at max_slots have been removed from the list as that specifies + * a hard boundary, so the nodes remaining are available for + * handling the oversubscription. Figure out how many procs + * to add to each of them. + */ + if (options->oversubscribe && !second_pass) { + balance = (float) ((int) app->num_procs - nprocs_mapped) + / (float) pmix_list_get_size(node_list); + extra_procs_to_assign = (int) balance; + if (0 < (balance - (float) extra_procs_to_assign)) { + /* compute how many nodes need an extra proc */ + nxtra_nodes = app->num_procs - nprocs_mapped + - (extra_procs_to_assign * pmix_list_get_size(node_list)); + /* add one so that we add an extra proc to the first nodes + * until all procs are mapped + */ + extra_procs_to_assign++; + } + /* restore the cpuset */ + options->cpuset = savecpuset; + // Rescan the nodes + second_pass = true; + goto pass; } errout: /* if we get here, then we were unable to map all the procs */ if (PRTE_ERR_SILENT != rc) { - pmix_show_help("help-prte-rmaps-base.txt", - "failed-map", true, - PRTE_ERROR_NAME(rc), + pmix_show_help("help-prte-rmaps-rr.txt", + "prte-rmaps-rr:not-enough-cpus", true, (NULL == app) ? "N/A" : app->app, (NULL == app) ? -1 : app->num_procs, - prte_rmaps_base_print_mapping(options->map), - prte_hwloc_base_print_binding(options->bind)); + savecpuset); + } + if (NULL != savecpuset) { + free(savecpuset); } return PRTE_ERR_SILENT; }