diff --git a/src/hwloc/help-prte-hwloc-base.txt b/src/hwloc/help-prte-hwloc-base.txt index c203d61eba..3122d356a4 100644 --- a/src/hwloc/help-prte-hwloc-base.txt +++ b/src/hwloc/help-prte-hwloc-base.txt @@ -2,6 +2,7 @@ # # Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved # Copyright (c) 2014-2020 Intel, Inc. All rights reserved. +# Copyright (c) 2022 Nanook Consulting. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -85,3 +86,86 @@ The binding request contains an unrecognized modifier: Request: %s Please check your request and try again. +# +[bind-to-option] +By default, processes are bound to individual CPUs (either COREs +or HWTHREADs, as defined by default or by user specification for +the job). On nodes that are OVERSUBSCRIBEd (i.e., where the number +of procs exceeds the number of assigned slots), the default is to +not bind the processes. + +NOTE: processes from prior jobs that are already executing on a node +are not "unbound" when a new job mapping results in the node +becoming oversubscribed. + +Binding is performed to the first available specified object type +within the object where the process was mapped. In other words, +binding can only be done to the mapped object or to a resource +located beneath that object. + +An object is considered completely consumed when the number of +processes bound to it equals the number of CPUs within it. Unbound +processes are not considered in this computation. Additional +processes cannot be mapped to consumed objects unless the +OVERLOAD qualifier is provided via the "--bind-to" command +line option. + +Note that directives and qualifiers are case-insensitive +and can be shortened to the minimum number of characters +to uniquely identify them. Thus, "L1CACHE" can be given +as "l1cache" or simply as "L1". 
+ +Supported binding directives include: + +- NONE does not bind the processes + +- HWTHREAD binds each process to a single hardware + thread/ This requires that hwthreads be treated + as independent CPUs (i.e., that either the HWTCPUS + qualifier be provided to the "map-by" option or + that hwthreads be designated as CPUs by default). + +- CORE binds each process to a single core. This + can be done whether hwthreads or cores are being + treated as independent CPUs provided that mapping + is performed at the core or higher level. + +- L1CACHE binds each process to all the CPUs in + an L1 cache. + +- L2CACHE binds each process to all the CPUs in + an L2 cache + +- L3CACHE binds each process to all the CPUs in + an L3 cache + +- NUMA binds each process to all the CPUs in a NUMA + region + +- PACKAGE binds each process to all the CPUs in a PACKAGE + +Any directive can include qualifiers by adding a colon (:) and any +combination of one or more of the following to the --bind-to option: + +- OVERLOAD indicates that objects can have more + processes bound to them than CPUs within them + +- IF-SUPPORTED indicates that the job should continue to + be launched and executed even if binding cannot be + performed as requested. + +- REPORT outputs a report on the bindings for the processes + to stderr +# +[bind-upwards] +Binding is performed to the first available specified object type +within the object where the process was mapped. In other words, +binding can only be done to the mapped object or to a resource +located beneath that object. + +The specified binding lies above the mapping object type: + + Mapping level: %s + Binding level: %s + +Please correct the map/bind directives and try again. 
diff --git a/src/hwloc/hwloc-internal.h b/src/hwloc/hwloc-internal.h index a25561e157..7ff3c7e896 100644 --- a/src/hwloc/hwloc-internal.h +++ b/src/hwloc/hwloc-internal.h @@ -113,35 +113,6 @@ typedef struct { size_t mbs_len; } prte_hwloc_base_memory_segment_t; -/* structs for storing info on objects */ -typedef struct { - pmix_object_t super; - hwloc_cpuset_t available; - bool npus_calculated; - unsigned int npus; - unsigned int idx; - unsigned int num_bound; -} prte_hwloc_obj_data_t; -PMIX_CLASS_DECLARATION(prte_hwloc_obj_data_t); - -typedef struct { - pmix_list_item_t super; - hwloc_obj_type_t type; - unsigned cache_level; - unsigned int num_objs; - pmix_list_t sorted_by_dist_list; -} prte_hwloc_summary_t; -PMIX_CLASS_DECLARATION(prte_hwloc_summary_t); - -typedef struct { - pmix_object_t super; - hwloc_cpuset_t available; - pmix_list_t summaries; - hwloc_obj_t* numas; - unsigned num_numas; -} prte_hwloc_topo_data_t; -PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_hwloc_topo_data_t); - /* define binding policies */ typedef uint16_t prte_binding_policy_t; #define PRTE_BINDING_POLICY PRTE_UINT16 @@ -150,9 +121,6 @@ typedef uint16_t prte_binding_policy_t; #define PRTE_BIND_IF_SUPPORTED 0x1000 #define PRTE_BIND_ALLOW_OVERLOAD 0x2000 #define PRTE_BIND_GIVEN 0x4000 -/* bind each rank to the cpu in the given - * cpu list based on its node-local-rank */ -#define PRTE_BIND_ORDERED 0x8000 // overload policy was given #define PRTE_BIND_OVERLOAD_GIVEN 0x0100 @@ -184,12 +152,10 @@ typedef uint16_t prte_binding_policy_t; /* macro to detect if binding is forced */ #define PRTE_BIND_OVERLOAD_ALLOWED(n) (PRTE_BIND_ALLOW_OVERLOAD & (n)) #define PRTE_BIND_OVERLOAD_SET(n) (PRTE_BIND_OVERLOAD_GIVEN & (n)) -#define PRTE_BIND_ORDERED_REQUESTED(n) (PRTE_BIND_ORDERED & (n)) /* some global values */ PRTE_EXPORT extern hwloc_topology_t prte_hwloc_topology; PRTE_EXPORT extern prte_binding_policy_t prte_hwloc_default_binding_policy; -PRTE_EXPORT extern hwloc_cpuset_t prte_hwloc_my_cpuset; 
PRTE_EXPORT extern hwloc_obj_type_t prte_hwloc_levels[]; PRTE_EXPORT extern char *prte_hwloc_default_cpu_list; PRTE_EXPORT extern bool prte_hwloc_default_use_hwthread_cpus; @@ -264,15 +230,6 @@ PRTE_EXPORT int prte_hwloc_base_set_default_binding(void *jdata, void *options); PRTE_EXPORT int prte_hwloc_base_set_binding_policy(void *jdata, char *spec); -/** - * Loads prte_hwloc_my_cpuset (global variable in - * src/hwloc/hwloc-internal.h) for this process. prte_hwloc_my_cpuset - * will be loaded with this process' binding, or, if the process is - * not bound, use the hwloc root object's (available and online) - * cpuset. - */ -PRTE_EXPORT void prte_hwloc_base_get_local_cpuset(void); - struct prte_rmaps_numa_node_t { pmix_list_item_t super; int index; @@ -322,16 +279,14 @@ PRTE_EXPORT int prte_hwloc_base_set_topology(char *topofile); PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, bool use_hwthread_cpus, char *cpulist); -PRTE_EXPORT int prte_hwloc_base_filter_cpus(hwloc_topology_t topo); +PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_filter_cpus(hwloc_topology_t topo); /** * Free the hwloc topology. */ -PRTE_EXPORT void prte_hwloc_base_free_topology(hwloc_topology_t topo); PRTE_EXPORT unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, unsigned cache_level); -PRTE_EXPORT void prte_hwloc_base_clear_usage(hwloc_topology_t topo); PRTE_EXPORT hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_type_t target, @@ -339,9 +294,6 @@ PRTE_EXPORT hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, unsigned int instance); PRTE_EXPORT unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj); -PRTE_EXPORT int prte_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char *device_name, - pmix_list_t *sorted_list); - /** * Get the number of pu's under a given hwloc object. 
*/ diff --git a/src/hwloc/hwloc.c b/src/hwloc/hwloc.c index 1d5cace414..6b25ecd2fa 100644 --- a/src/hwloc/hwloc.c +++ b/src/hwloc/hwloc.c @@ -282,19 +282,13 @@ void prte_hwloc_base_close(void) return; } - /* free memory */ - if (NULL != prte_hwloc_my_cpuset) { - hwloc_bitmap_free(prte_hwloc_my_cpuset); - prte_hwloc_my_cpuset = NULL; - } - if (NULL != prte_hwloc_default_cpu_list) { free(prte_hwloc_default_cpu_list); } /* destroy the topology */ if (NULL != prte_hwloc_topology) { - prte_hwloc_base_free_topology(prte_hwloc_topology); + hwloc_topology_destroy(prte_hwloc_topology); prte_hwloc_topology = NULL; } @@ -305,7 +299,7 @@ void prte_hwloc_base_close(void) int prte_hwloc_base_set_default_binding(void *jd, void *opt) { prte_job_t *jdata = (prte_job_t*)jd; - prte_schizo_options_t *options = (prte_schizo_options_t*)opt; + prte_rmaps_options_t *options = (prte_rmaps_options_t*)opt; prte_mapping_policy_t mpol; if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, NULL, PMIX_UINT16)) { @@ -358,41 +352,21 @@ int prte_hwloc_base_set_default_binding(void *jd, void *opt) PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE); } else { /* we are mapping by node or some other non-object method */ - if (options->nprocs <= 2) { - if (options->use_hwthreads) { - /* if we are using hwthread cpus, then bind to those */ - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using byhwthread", __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, - PRTE_BIND_TO_HWTHREAD); - } else { - /* for performance, bind to core */ - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using bycore", __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, - PRTE_BIND_TO_CORE); - } + if (options->use_hwthreads) { + /* if we are using hwthread cpus, then bind to those */ + prte_output_verbose(options->verbosity, options->stream, + 
"setdefaultbinding[%d] binding not given - using byhwthread", __LINE__); + PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, + PRTE_BIND_TO_HWTHREAD); } else { - /* bind to numa (if present), or by package (if numa isn't present and package is) */ - if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) { - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using bynuma", __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NUMA); - } else if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) { - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using bypackage", __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE); - } else { - /* if we have neither, then just don't bind */ - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given and no NUMA " - "or packages - not binding", - __LINE__); - PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NONE); - } + /* otherwise bind to core */ + prte_output_verbose(options->verbosity, options->stream, + "setdefaultbinding[%d] binding not given - using bycore", __LINE__); + PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, + PRTE_BIND_TO_CORE); } } - } else if (options->nprocs <= 2) { + } else { if (options->use_hwthreads) { /* if we are using hwthread cpus, then bind to those */ prte_output_verbose(options->verbosity, options->stream, @@ -400,31 +374,12 @@ int prte_hwloc_base_set_default_binding(void *jd, void *opt) __LINE__); PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_HWTHREAD); } else { - /* for performance, bind to core */ + /* otherwise bind to core */ prte_output_verbose(options->verbosity, options->stream, "setdefaultbinding[%d] binding not given - using bycore", __LINE__); PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, 
PRTE_BIND_TO_CORE); } - } else { - /* for performance, bind to numa, if available, else try package */ - if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) { - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using bynuma", - __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NUMA); - } else if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) { - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given - using bypackage", - __LINE__); - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE); - } else { - /* just don't bind */ - prte_output_verbose(options->verbosity, options->stream, - "setdefaultbinding[%d] binding not given and no packages - not binding", - __LINE__); - PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NONE); - } } } /* they might have set the overload-allowed flag while wanting PRRTE @@ -566,55 +521,6 @@ char *prte_hwloc_base_print_locality(prte_hwloc_locality_t locality) return ptr->buffers[ptr->cntr]; } -static void obj_data_const(prte_hwloc_obj_data_t *ptr) -{ - ptr->npus_calculated = false; - ptr->npus = 0; - ptr->idx = UINT_MAX; - ptr->num_bound = 0; -} -PMIX_CLASS_INSTANCE(prte_hwloc_obj_data_t, pmix_object_t, obj_data_const, NULL); - -static void sum_const(prte_hwloc_summary_t *ptr) -{ - ptr->num_objs = 0; - PMIX_CONSTRUCT(&ptr->sorted_by_dist_list, pmix_list_t); -} -static void sum_dest(prte_hwloc_summary_t *ptr) -{ - pmix_list_item_t *item; - while (NULL != (item = pmix_list_remove_first(&ptr->sorted_by_dist_list))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&ptr->sorted_by_dist_list); -} -PMIX_CLASS_INSTANCE(prte_hwloc_summary_t, pmix_list_item_t, sum_const, sum_dest); -static void topo_data_const(prte_hwloc_topo_data_t *ptr) -{ - ptr->available = NULL; - PMIX_CONSTRUCT(&ptr->summaries, pmix_list_t); - ptr->numas = NULL; - 
ptr->num_numas = 0; -} -static void topo_data_dest(prte_hwloc_topo_data_t *ptr) -{ - pmix_list_item_t *item; - - if (NULL != ptr->available) { - hwloc_bitmap_free(ptr->available); - } - while (NULL != (item = pmix_list_remove_first(&ptr->summaries))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&ptr->summaries); - if (NULL != ptr->numas) { - free(ptr->numas); - } -} -PMIX_CLASS_INSTANCE(prte_hwloc_topo_data_t, pmix_object_t, topo_data_const, topo_data_dest); - -PMIX_CLASS_INSTANCE(prte_rmaps_numa_node_t, pmix_list_item_t, NULL, NULL); - int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) { int i; @@ -647,8 +553,6 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) } else if (0 == strcasecmp(quals[i], "no-overload")) { tmp = (tmp & ~PRTE_BIND_ALLOW_OVERLOAD); tmp |= PRTE_BIND_OVERLOAD_GIVEN; - } else if (0 == strcasecmp(quals[i], "ordered")) { - tmp |= PRTE_BIND_ORDERED; } else if (0 == strcasecmp(quals[i], "REPORT")) { if (NULL == jdata) { pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true, diff --git a/src/hwloc/hwloc_base_util.c b/src/hwloc/hwloc_base_util.c index 25d1d4ac9b..0695c87759 100644 --- a/src/hwloc/hwloc_base_util.c +++ b/src/hwloc/hwloc_base_util.c @@ -103,13 +103,13 @@ hwloc_obj_t prte_hwloc_base_get_pu(hwloc_topology_t topo, bool use_hwthread_cpus return obj; } -hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, bool use_hwthread_cpus, +hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, + bool use_hwthread_cpus, char *cpulist) { hwloc_cpuset_t avail = NULL, pucpus, res; char **ranges = NULL, **range = NULL; int idx, cpu, start, end; - prte_hwloc_obj_data_t *data; hwloc_obj_t pu; /* find the specified logical cpus */ @@ -132,12 +132,6 @@ hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, bool use_h #endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); - data = (prte_hwloc_obj_data_t *) pu->userdata; - if (NULL == data) { 
- pu->userdata = (void *) PMIX_NEW(prte_hwloc_obj_data_t); - data = (prte_hwloc_obj_data_t *) pu->userdata; - } - data->npus++; } break; case 2: @@ -153,12 +147,6 @@ hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo, bool use_h #endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); - data = (prte_hwloc_obj_data_t *) pu->userdata; - if (NULL == data) { - pu->userdata = (void *) PMIX_NEW(prte_hwloc_obj_data_t); - data = (prte_hwloc_obj_data_t *) pu->userdata; - } - data->npus++; } } break; @@ -180,6 +168,12 @@ hwloc_cpuset_t prte_hwloc_base_setup_summary(hwloc_topology_t topo) { hwloc_cpuset_t avail = NULL; + avail = hwloc_bitmap_alloc(); + /* get the cpus we are bound to */ + if (0 <= hwloc_get_cpubind(topo, avail, HWLOC_CPUBIND_PROCESS)) { + return avail; + } + /* get the root available cpuset */ #if HWLOC_API_VERSION < 0x20000 hwloc_obj_t root; @@ -190,15 +184,14 @@ hwloc_cpuset_t prte_hwloc_base_setup_summary(hwloc_topology_t topo) return NULL; } if (NULL == root->online_cpuset) { - avail = hwloc_bitmap_dup(root->allowed_cpuset); + hwloc_bitmap_copy(avail, root->allowed_cpuset); } else if (NULL == root->allowed_cpuset) { - avail = hwloc_bitmap_dup(root->online_cpuset); + hwloc_bitmap_copy(avail, root->online_cpuset); } else { - avail = hwloc_bitmap_alloc(); hwloc_bitmap_and(avail, root->online_cpuset, root->allowed_cpuset); } #else - avail = hwloc_bitmap_dup(hwloc_topology_get_allowed_cpuset(topo)); + hwloc_bitmap_copy(avail, hwloc_topology_get_allowed_cpuset(topo)); #endif return avail; @@ -207,25 +200,9 @@ hwloc_cpuset_t prte_hwloc_base_setup_summary(hwloc_topology_t topo) /* determine the node-level available cpuset based on * online vs allowed vs user-specified cpus */ -int prte_hwloc_base_filter_cpus(hwloc_topology_t topo) +hwloc_cpuset_t prte_hwloc_base_filter_cpus(hwloc_topology_t topo) { - hwloc_obj_t root; hwloc_cpuset_t avail = NULL; - prte_hwloc_topo_data_t *sum; - unsigned width, w, m, N, last; - hwloc_obj_t obj; 
- - root = hwloc_get_root_obj(topo); - - if (NULL == root->userdata) { - root->userdata = (void *) PMIX_NEW(prte_hwloc_topo_data_t); - } - sum = (prte_hwloc_topo_data_t *) root->userdata; - - /* should only ever enter here once, but check anyway */ - if (NULL != sum->available) { - return PRTE_SUCCESS; - } /* process any specified default cpu set against this topology */ if (NULL == prte_hwloc_default_cpu_list) { @@ -237,63 +214,7 @@ int prte_hwloc_base_filter_cpus(hwloc_topology_t topo) avail = prte_hwloc_base_generate_cpuset(topo, prte_hwloc_default_use_hwthread_cpus, prte_hwloc_default_cpu_list); } - if (NULL == avail) { - return PRTE_ERR_NOT_SUPPORTED; - } - - /* cache this info */ - sum->available = avail; - - /* Historically, CPU packages contained a single cpu die - * and nothing else. NUMA was therefore determined by simply - * looking at the memory bus attached to the socket where - * the package resided - all cpus in the package were - * exclusively "under" that NUMA. Since each socket had a - * unique NUMA, you could easily map by them. - - * More recently, packages have started to contain multiple - * cpu dies as well as memory and sometimes even fabric die. - * In these cases, the memory bus of the cpu dies in the - * package generally share any included memory die. This - * complicates the memory situation, leaving NUMA domains - * no longer cleanly delineated by processor (i.e.., the - * NUMA domains overlap each other). - * - * Fortunately, the OS index of non-CPU NUMA domains starts - * at 255 and counts downward (at least for GPUs) - while - * the index of CPU NUMA domains starts at 0 and counts - * upward. 
We can therefore separate the two by excluding - * NUMA domains with an OS index above the level where - * they first begin to intersect - */ - - /* compute the CPU NUMA cutoff for this node */ - width = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE); - if (0 == width) { - sum->num_numas = 0; - return PRTE_SUCCESS; - } - sum->numas = (hwloc_obj_t*)malloc(width * sizeof(hwloc_obj_t)); - sum->num_numas = 0; - for (w=0; w < UINT_MAX && sum->num_numas < width; w++) { - /* get the object at this index */ - obj = hwloc_get_numanode_obj_by_os_index(topo, w); - if (NULL == obj) { - continue; - } - /* check for overlap with all preceding numas */ - for (m=0; m < sum->num_numas; m++) { - if (hwloc_bitmap_intersects(obj->cpuset, sum->numas[m]->cpuset)) { - // if it intersects anyone, then we are done - return PRTE_SUCCESS; - } - } - /* cache this objet */ - sum->numas[sum->num_numas] = obj; - sum->num_numas++; - } - - return PRTE_SUCCESS; + return avail; } static void fill_cache_line_size(void) @@ -337,7 +258,8 @@ int prte_hwloc_base_get_topology(void) { int rc; - prte_output_verbose(2, prte_hwloc_base_output, "hwloc:base:get_topology"); + prte_output_verbose(2, prte_hwloc_base_output, + "hwloc:base:get_topology"); /* see if we already have it */ if (NULL != prte_hwloc_topology) { @@ -345,7 +267,8 @@ int prte_hwloc_base_get_topology(void) } if (NULL == prte_hwloc_base_topo_file) { - prte_output_verbose(1, prte_hwloc_base_output, "hwloc:base discovering topology"); + prte_output_verbose(1, prte_hwloc_base_output, + "hwloc:base discovering topology"); if (0 != hwloc_topology_init(&prte_hwloc_topology) || 0 != prte_hwloc_base_topology_set_flags(prte_hwloc_topology, 0, true) || 0 != hwloc_topology_load(prte_hwloc_topology)) { @@ -361,21 +284,10 @@ int prte_hwloc_base_get_topology(void) } } - /* filter the cpus thru any default cpu set */ - if (PRTE_SUCCESS != (rc = prte_hwloc_base_filter_cpus(prte_hwloc_topology))) { - hwloc_topology_destroy(prte_hwloc_topology); - return rc; 
- } - /* fill prte_cache_line_size global with the smallest L1 cache line size */ fill_cache_line_size(); - /* get or update our local cpuset - it will get used multiple - * times, so it's more efficient to keep a global copy - */ - prte_hwloc_base_get_local_cpuset(); - return PRTE_SUCCESS; } @@ -455,84 +367,6 @@ int prte_hwloc_base_set_topology(char *topofile) return PRTE_SUCCESS; } -static void free_object(hwloc_obj_t obj) -{ - prte_hwloc_obj_data_t *data; - unsigned k; - - /* free any data hanging on this object */ - if (NULL != obj->userdata) { - data = (prte_hwloc_obj_data_t *) obj->userdata; - PMIX_RELEASE(data); - obj->userdata = NULL; - } - - /* loop thru our children */ - for (k = 0; k < obj->arity; k++) { - free_object(obj->children[k]); - } -} - -void prte_hwloc_base_free_topology(hwloc_topology_t topo) -{ - hwloc_obj_t obj; - prte_hwloc_topo_data_t *rdata; - unsigned k; - - if (!topo_in_shmem) { - obj = hwloc_get_root_obj(topo); - /* release the root-level userdata */ - if (NULL != obj->userdata) { - rdata = (prte_hwloc_topo_data_t *) obj->userdata; - PMIX_RELEASE(rdata); - obj->userdata = NULL; - } - /* now recursively descend and release userdata - * in the rest of the objects - */ - for (k = 0; k < obj->arity; k++) { - free_object(obj->children[k]); - } - } - hwloc_topology_destroy(topo); -} - -void prte_hwloc_base_get_local_cpuset(void) -{ -#if HWLOC_API_VERSION < 0x20000 - hwloc_obj_t root; -#endif - - if (NULL != prte_hwloc_topology) { - if (NULL == prte_hwloc_my_cpuset) { - prte_hwloc_my_cpuset = hwloc_bitmap_alloc(); - } - - /* get the cpus we are bound to */ - if (hwloc_get_cpubind(prte_hwloc_topology, prte_hwloc_my_cpuset, HWLOC_CPUBIND_PROCESS) - < 0) { -/* we are not bound - use the root's available cpuset */ -#if HWLOC_API_VERSION < 0x20000 - root = hwloc_get_root_obj(prte_hwloc_topology); - if (NULL == root->online_cpuset && NULL == root->allowed_cpuset) { - /* we are hosed */ - PRTE_ERROR_LOG(PRTE_ERR_NOT_SUPPORTED); - } - if (NULL == 
root->online_cpuset) { - hwloc_bitmap_copy(prte_hwloc_my_cpuset, root->allowed_cpuset); - } else if (NULL == root->allowed_cpuset) { - hwloc_bitmap_copy(prte_hwloc_my_cpuset, root->online_cpuset); - } else { - hwloc_bitmap_and(prte_hwloc_my_cpuset, root->online_cpuset, root->allowed_cpuset); - } -#else - hwloc_bitmap_copy(prte_hwloc_my_cpuset, - hwloc_topology_get_allowed_cpuset(prte_hwloc_topology)); -#endif - } - } -} - int prte_hwloc_base_report_bind_failure(const char *file, int line, const char *msg, int rc) { static int already_reported = 0; @@ -623,26 +457,11 @@ unsigned int prte_hwloc_base_get_npus(hwloc_topology_t topo, bool use_hwthread_c unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj) { unsigned cache_level = 0; - prte_hwloc_obj_data_t *data; hwloc_obj_t ptr; unsigned int nobjs, i; PRTE_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:get_idx")); - /* see if we already have the info */ - data = (prte_hwloc_obj_data_t *) obj->userdata; - - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - obj->userdata = (void *) data; - } - - if (data->idx < UINT_MAX) { - PRTE_OUTPUT_VERBOSE( - (5, prte_hwloc_base_output, "hwloc:base:get_idx already have data: %u", data->idx)); - return data->idx; - } - #if HWLOC_API_VERSION < 0x20000 /* determine the number of objects of this type */ if (HWLOC_OBJ_CACHE == obj->type) { @@ -660,7 +479,6 @@ unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj) for (i = 0; i < nobjs; i++) { ptr = prte_hwloc_base_get_obj_by_type(topo, obj->type, cache_level, i); if (ptr == obj) { - data->idx = i; return i; } } @@ -712,19 +530,6 @@ unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, hwloc_obj return 0; } - /* if the type is NUMA, then we just return the cached number */ - if (HWLOC_OBJ_NUMANODE == target) { - hwloc_obj_t root; - prte_hwloc_topo_data_t *sum; - - root = hwloc_get_root_obj(topo); - sum = (prte_hwloc_topo_data_t *) root->userdata; 
- if (NULL == sum) { - return 0; - } - return sum->num_numas; - } - #if HWLOC_API_VERSION >= 0x20000 if (0 > (rc = hwloc_get_nbobjs_by_type(topo, target))) { prte_output(0, "UNKNOWN HWLOC ERROR"); @@ -734,8 +539,6 @@ unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, hwloc_obj #else unsigned int num_objs; hwloc_obj_t obj; - prte_hwloc_summary_t *sum; - prte_hwloc_topo_data_t *data; /* we can just use the hwloc accessor to get it, * unless it is a CACHE as these are treated as special cases @@ -753,34 +556,8 @@ unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo, hwloc_obj num_objs = 0; obj = hwloc_get_root_obj(topo); - /* first see if the topology already has this summary */ - data = (prte_hwloc_topo_data_t *) obj->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_topo_data_t); - obj->userdata = (void *) data; - } else { - PMIX_LIST_FOREACH(sum, &data->summaries, prte_hwloc_summary_t) - { - if (target == sum->type && cache_level == sum->cache_level) { - /* yep - return the value */ - PRTE_OUTPUT_VERBOSE((5, prte_hwloc_base_output, - "hwloc:base:get_nbojbs pre-existing data %u of %s:%u", - sum->num_objs, hwloc_obj_type_string(target), cache_level)); - return sum->num_objs; - } - } - } - - /* don't already know it - go get it */ df_search(topo, obj, target, cache_level, 0, &num_objs); - /* cache the results for later */ - sum = PMIX_NEW(prte_hwloc_summary_t); - sum->type = target; - sum->cache_level = cache_level; - sum->num_objs = num_objs; - pmix_list_append(&data->summaries, &sum->super); - PRTE_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:get_nbojbs computed data %u of %s:%u", num_objs, hwloc_obj_type_string(target), cache_level)); @@ -800,19 +577,6 @@ hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_typ return NULL; } - /* if we are looking for NUMA, then just return the cached object */ - if (HWLOC_OBJ_NUMANODE == target) { - hwloc_obj_t obj, root; - prte_hwloc_topo_data_t 
*sum; - - root = hwloc_get_root_obj(topo); - sum = (prte_hwloc_topo_data_t *) root->userdata; - if (NULL == sum || sum->num_numas <= instance) { - return NULL; - } - return sum->numas[instance]; - } - #if HWLOC_API_VERSION >= 0x20000 return hwloc_get_obj_by_type(topo, target, instance); #else @@ -831,42 +595,6 @@ hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo, hwloc_obj_typ #endif } -static void df_clear(hwloc_topology_t topo, hwloc_obj_t start) -{ - unsigned k; - prte_hwloc_obj_data_t *data; - - /* see how many procs are bound to us */ - data = (prte_hwloc_obj_data_t *) start->userdata; - if (NULL != data) { - data->num_bound = 0; - } - - for (k = 0; k < start->arity; k++) { - df_clear(topo, start->children[k]); - } -} - -void prte_hwloc_base_clear_usage(hwloc_topology_t topo) -{ - hwloc_obj_t root; - unsigned k; - - /* bozo check */ - if (NULL == topo) { - PRTE_OUTPUT_VERBOSE((5, prte_hwloc_base_output, "hwloc:base:clear_usage: NULL topology")); - return; - } - - root = hwloc_get_root_obj(topo); - /* must not start at root as the root object has - * a different userdata attached to it - */ - for (k = 0; k < root->arity; k++) { - df_clear(topo, root->children[k]); - } -} - /* The current slot_list notation only goes to the core level - i.e., the location * is specified as package:core. Thus, the code below assumes that all locations * are to be parsed under that notation. 
@@ -1575,218 +1303,6 @@ char *prte_hwloc_base_cset2str(hwloc_const_cpuset_t cpuset, return result; } -static int dist_cmp_fn(pmix_list_item_t **a, pmix_list_item_t **b) -{ - prte_rmaps_numa_node_t *aitem = *((prte_rmaps_numa_node_t **) a); - prte_rmaps_numa_node_t *bitem = *((prte_rmaps_numa_node_t **) b); - - if (aitem->dist_from_closed > bitem->dist_from_closed) { - return 1; - } else if (aitem->dist_from_closed == bitem->dist_from_closed) { - return 0; - } else { - return -1; - } -} - -static void sort_by_dist(hwloc_topology_t topo, char *device_name, pmix_list_t *sorted_list) -{ - hwloc_obj_t device_obj = NULL; - hwloc_obj_t obj = NULL; - struct hwloc_distances_s *distances; - prte_rmaps_numa_node_t *numa_node; - int close_node_index; - float latency; - unsigned int j; -#if HWLOC_API_VERSION < 0x20000 - hwloc_obj_t root = NULL; - int depth; - unsigned i; -#else - unsigned distances_nr = 0; -#endif - - for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; - device_obj = hwloc_get_next_osdev(topo, device_obj)) { - if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS - || device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { - if (!strcmp(device_obj->name, device_name)) { - /* find numa node containing this device */ - obj = device_obj->parent; -#if HWLOC_API_VERSION < 0x20000 - while ((obj != NULL) && (obj->type != HWLOC_OBJ_NUMANODE)) { - obj = obj->parent; - } -#else - while (obj && !obj->memory_arity) { - obj = obj->parent; /* no memory child, walk up */ - } - if (obj != NULL) { - obj = obj->memory_first_child; - } -#endif - if (obj == NULL) { - prte_output_verbose( - 5, prte_hwloc_base_output, - "hwloc:base:get_sorted_numa_list: NUMA node closest to %s wasn't found.", - device_name); - return; - } else { - close_node_index = obj->logical_index; - } - - /* find distance matrix for all numa nodes */ -#if HWLOC_API_VERSION < 0x20000 - distances = (struct hwloc_distances_s *) - 
hwloc_get_whole_distance_matrix_by_type(topo, HWLOC_OBJ_NUMANODE); - if (NULL == distances) { - /* we can try to find distances under group object. This info can be there. */ - depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE); - if (HWLOC_TYPE_DEPTH_UNKNOWN == depth) { - prte_output_verbose(5, prte_hwloc_base_output, - "hwloc:base:get_sorted_numa_list: There is no " - "information about distances on the node."); - return; - } - root = hwloc_get_root_obj(topo); - for (i = 0; i < root->arity; i++) { - obj = root->children[i]; - if (obj->distances_count > 0) { - for (j = 0; j < obj->distances_count; j++) { - if (obj->distances[j]->relative_depth + 1 == (unsigned) depth) { - distances = obj->distances[j]; - break; - } - } - } - } - } - /* find all distances for our close node with logical index = close_node_index as - * close_node_index + nbobjs*j */ - if ((NULL == distances) || (0 == distances->nbobjs)) { - prte_output_verbose(5, prte_hwloc_base_output, - "hwloc:base:get_sorted_numa_list: There is no information " - "about distances on the node."); - return; - } - /* fill list of numa nodes */ - for (j = 0; j < distances->nbobjs; j++) { - latency = distances->latency[close_node_index + distances->nbobjs * j]; - numa_node = PMIX_NEW(prte_rmaps_numa_node_t); - numa_node->index = j; - numa_node->dist_from_closed = latency; - pmix_list_append(sorted_list, &numa_node->super); - } -#else - distances_nr = 1; - if (0 != hwloc_distances_get_by_type(topo, HWLOC_OBJ_NUMANODE, &distances_nr, - &distances, HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || - 0 == distances_nr) { - prte_output_verbose(5, prte_hwloc_base_output, - "hwloc:base:get_sorted_numa_list: There is no information " - "about distances on the node."); - return; - } - /* fill list of numa nodes */ - for (j = 0; j < distances->nbobjs; j++) { - latency = distances->values[close_node_index + distances->nbobjs * j]; - numa_node = PMIX_NEW(prte_rmaps_numa_node_t); - numa_node->index = j; - numa_node->dist_from_closed = 
latency; - pmix_list_append(sorted_list, &numa_node->super); - } - hwloc_distances_release(topo, distances); -#endif - /* sort numa nodes by distance from the closest one to PCI */ - pmix_list_sort(sorted_list, dist_cmp_fn); - return; - } - } - } -} - -static int find_devices(hwloc_topology_t topo, char **device_name) -{ - hwloc_obj_t device_obj = NULL; - int count = 0; - for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; - device_obj = hwloc_get_next_osdev(topo, device_obj)) { - if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { - count++; - free(*device_name); - *device_name = strdup(device_obj->name); - } - } - return count; -} - -int prte_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char *device_name, - pmix_list_t *sorted_list) -{ - hwloc_obj_t obj; - prte_hwloc_summary_t *sum; - prte_hwloc_topo_data_t *data; - prte_rmaps_numa_node_t *numa, *copy_numa; - int count; - - obj = hwloc_get_root_obj(topo); - - /* first see if the topology already has this info */ - /* we call prte_hwloc_base_get_nbobjs_by_type() before it to fill summary object so it should - * exist*/ - data = (prte_hwloc_topo_data_t *) obj->userdata; - if (NULL != data) { - PMIX_LIST_FOREACH(sum, &data->summaries, prte_hwloc_summary_t) - { - if (HWLOC_OBJ_NUMANODE == sum->type) { - if (pmix_list_get_size(&sum->sorted_by_dist_list) > 0) { - PMIX_LIST_FOREACH(numa, &(sum->sorted_by_dist_list), prte_rmaps_numa_node_t) - { - copy_numa = PMIX_NEW(prte_rmaps_numa_node_t); - copy_numa->index = numa->index; - copy_numa->dist_from_closed = numa->dist_from_closed; - pmix_list_append(sorted_list, &copy_numa->super); - } - return PRTE_SUCCESS; - } else { - /* don't already know it - go get it */ - /* firstly we check if we need to autodetect OpenFabrics devices or we have the - * specified one */ - bool free_device_name = false; - if (!strcmp(device_name, "auto")) { - count = find_devices(topo, &device_name); - if (count > 1) { - free(device_name); - return 
count; - } - free_device_name = true; - } - if (!device_name) { - return PRTE_ERR_NOT_FOUND; - } else if (free_device_name && (0 == strlen(device_name))) { - free(device_name); - return PRTE_ERR_NOT_FOUND; - } - sort_by_dist(topo, device_name, sorted_list); - if (free_device_name) { - free(device_name); - } - /* store this info in summary object for later usage */ - PMIX_LIST_FOREACH(numa, sorted_list, prte_rmaps_numa_node_t) - { - copy_numa = PMIX_NEW(prte_rmaps_numa_node_t); - copy_numa->index = numa->index; - copy_numa->dist_from_closed = numa->dist_from_closed; - pmix_list_append(&(sum->sorted_by_dist_list), &copy_numa->super); - } - return PRTE_SUCCESS; - } - } - } - } - return PRTE_ERR_NOT_FOUND; -} - char *prte_hwloc_base_get_topo_signature(hwloc_topology_t topo) { int nnuma, npackage, nl3, nl2, nl1, ncore, nhwt; diff --git a/src/mca/ess/hnp/ess_hnp_module.c b/src/mca/ess/hnp/ess_hnp_module.c index 00aa4cedb0..86ee05fe26 100644 --- a/src/mca/ess/hnp/ess_hnp_module.c +++ b/src/mca/ess/hnp/ess_hnp_module.c @@ -416,6 +416,7 @@ static int rte_init(int argc, char **argv) t->sig = strdup(prte_topo_signature); pmix_pointer_array_add(prte_node_topologies, t); node->topology = t; + node->available = prte_hwloc_base_filter_cpus(prte_hwloc_topology); if (15 < prte_output_get_verbosity(prte_ess_base_framework.framework_output)) { char *output = NULL; prte_output(0, "%s Topology Info:", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); diff --git a/src/mca/grpcomm/direct/grpcomm_direct.c b/src/mca/grpcomm/direct/grpcomm_direct.c index b30c36ee1e..8ebbe47ef8 100644 --- a/src/mca/grpcomm/direct/grpcomm_direct.c +++ b/src/mca/grpcomm/direct/grpcomm_direct.c @@ -519,15 +519,6 @@ static void xcast_recv(int status, pmix_proc_t *sender, PMIX_DATA_BUFFER_RELEASE(relay); return; } - if (PRTE_SUCCESS != (ret = prte_util_parse_node_info(data))) { - PRTE_ERROR_LOG(ret); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - PMIX_DATA_BUFFER_DESTRUCT(&datbuf); - PMIX_DESTRUCT(&coll); - 
PMIX_DATA_BUFFER_RELEASE(rly); - PMIX_DATA_BUFFER_RELEASE(relay); - return; - } /* unpack the wireup info */ cnt = 1; while (PMIX_SUCCESS == (ret = PMIx_Data_unpack(NULL, data, &dmn, &cnt, PMIX_PROC))) { diff --git a/src/mca/odls/base/odls_base_default_fns.c b/src/mca/odls/base/odls_base_default_fns.c index 37be70a90d..bce419f60b 100644 --- a/src/mca/odls/base/odls_base_default_fns.c +++ b/src/mca/odls/base/odls_base_default_fns.c @@ -276,14 +276,6 @@ int prte_odls_base_default_get_add_procs_data(pmix_data_buffer_t *buffer, pmix_n return rc; } - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - /* compute and pack the ppn */ - if (PRTE_SUCCESS != (rc = prte_util_generate_ppn(jdata, buffer))) { - PRTE_ERROR_LOG(rc); - return rc; - } - } - /* assemble the node and proc map info */ list = NULL; procs = NULL; @@ -600,40 +592,6 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix free(tmp); } - /* if the job is fully described, then mpirun will have computed - * and sent us the complete array of procs in the prte_job_t, so we - * don't need to do anything more here */ - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - /* load the ppn info into the job and node arrays - the - * function will ignore the data on the HNP as it already - * has the info */ - if (PRTE_SUCCESS != (rc = prte_util_decode_ppn(jdata, buffer))) { - PRTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - - if (!PRTE_PROC_IS_MASTER) { - /* assign locations to the procs */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_assign_locations(jdata))) { - PRTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - - /* compute the ranks and add the proc objects - * to the jdata->procs array */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_compute_vpids(jdata))) { - PRTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - } - - /* and finally, compute the local and node ranks */ - if (PRTE_SUCCESS != (rc = 
prte_rmaps_base_compute_local_ranks(jdata))) { - PRTE_ERROR_LOG(rc); - goto REPORT_ERROR; - } - } - /* unpack the byte object containing any application setup info - there * might not be any, so it isn't an error if we don't find things */ cnt = 1; @@ -711,10 +669,8 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix /* not ready for use yet */ continue; } - if (!PRTE_PROC_IS_MASTER - && prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - /* the parser will have already made the connection, but the fully described - * case won't have done it, so connect the proc to its node here */ + if (!PRTE_PROC_IS_MASTER) { + /* connect the proc to its node here */ prte_output_verbose(5, prte_odls_base_framework.framework_output, "%s GETTING DAEMON FOR PROC %s WITH PARENT %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&pptr->name), @@ -779,21 +735,11 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix } } - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - /* reset the mapped flags */ - for (n = 0; n < jdata->map->nodes->size; n++) { - if (NULL - != (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, n))) { - PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); - } - } - } - - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - /* compute and save bindings of local children */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_compute_bindings(jdata))) { - PRTE_ERROR_LOG(rc); - goto REPORT_ERROR; + /* reset the mapped flags */ + for (n = 0; n < jdata->map->nodes->size; n++) { + if (NULL + != (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, n))) { + PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); } } diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index 406652b037..bf4bd3f22d 100644 --- 
a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -759,9 +759,8 @@ void prte_plm_base_complete_setup(int fd, short args, void *cbdata) */ PRTE_HASH_STR(serial_number, h); free(serial_number); - if (PRTE_SUCCESS - != (rc = pmix_hash_table_get_value_uint32(prte_coprocessors, h, - (void **) &vptr))) { + rc = pmix_hash_table_get_value_uint32(prte_coprocessors, h, (void **) &vptr); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); break; } @@ -868,8 +867,14 @@ void prte_plm_base_send_launch_msg(int fd, short args, void *cbdata) if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } - prte_never_launched = true; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* if we are persistent, then we remain alive - otherwise, declare + * all jobs complete and terminate */ + if (prte_persistent) { + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_TERMINATED); + } else { + prte_never_launched = true; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_ALL_JOBS_COMPLETE); + } PMIX_RELEASE(caddy); if (NULL != cmpdata) { free(cmpdata); @@ -1129,7 +1134,6 @@ void prte_plm_base_daemon_topology(int status, pmix_proc_t *sender, pmix_data_bu { hwloc_topology_t topo; hwloc_obj_t root; - prte_hwloc_topo_data_t *sum; int rc, idx; char *sig, *coprocessors, **sns; prte_proc_t *daemon = NULL; @@ -1151,8 +1155,8 @@ void prte_plm_base_daemon_topology(int status, pmix_proc_t *sender, pmix_data_bu if (NULL == jdatorted) { jdatorted = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); } - if (NULL - == (daemon = (prte_proc_t *) pmix_pointer_array_get_item(jdatorted->procs, sender->rank))) { + daemon = (prte_proc_t *) pmix_pointer_array_get_item(jdatorted->procs, sender->rank); + if (NULL == daemon) { PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); prted_failed_launch = true; goto CLEANUP; @@ -1206,8 +1210,8 @@ void prte_plm_base_daemon_topology(int status, pmix_proc_t *sender, pmix_data_bu /* find it in the array */ t = NULL; for (i = 0; i < 
prte_node_topologies->size; i++) { - if (NULL - == (t2 = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, i))) { + t2 = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, i); + if (NULL == t2) { continue; } /* just check the signature */ @@ -1234,16 +1238,14 @@ void prte_plm_base_daemon_topology(int status, pmix_proc_t *sender, pmix_data_bu topo = ptopo.topology; ptopo.topology = NULL; PMIX_TOPOLOGY_DESTRUCT(&ptopo); - /* Apply any CPU filters (not preserved by the XML) */ - prte_hwloc_base_filter_cpus(topo); /* record the final topology */ t->topo = topo; - /* setup the summary data for this topology as we will need - * it when we go to map/bind procs to it */ - root = hwloc_get_root_obj(topo); - root->userdata = (void *) PMIX_NEW(prte_hwloc_topo_data_t); - sum = (prte_hwloc_topo_data_t *) root->userdata; - sum->available = prte_hwloc_base_setup_summary(topo); + /* update the node's available processors */ + if (NULL != daemon->node->available) { + hwloc_bitmap_free(daemon->node->available); + } + /* Apply any CPU filters (not preserved by the XML) */ + daemon->node->available = prte_hwloc_base_filter_cpus(topo); /* unpack any coprocessors */ idx = 1; @@ -1364,7 +1366,6 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu char *alias; uint8_t naliases, ni; hwloc_obj_t root; - prte_hwloc_topo_data_t *sum; char *nodename = NULL; pmix_info_t *info; size_t n, ninfo; @@ -1522,8 +1523,8 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu prte_hetero_nodes = true; } } else if (!prte_hetero_nodes) { - if (0 != strcmp(sig, prte_base_compute_node_sig) - || (prte_hnp_is_allocated && 0 != strcmp(sig, mytopo->sig))) { + if (0 != strcmp(sig, prte_base_compute_node_sig) || + (prte_hnp_is_allocated && 0 != strcmp(sig, mytopo->sig))) { prte_hetero_nodes = true; } } @@ -1596,12 +1597,11 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu topo = 
ptopo.topology; ptopo.topology = NULL; PMIX_TOPOLOGY_DESTRUCT(&ptopo); - /* setup the summary data for this topology as we will need - * it when we go to map/bind procs to it */ - root = hwloc_get_root_obj(topo); - root->userdata = (void *) PMIX_NEW(prte_hwloc_topo_data_t); - sum = (prte_hwloc_topo_data_t *) root->userdata; - sum->available = prte_hwloc_base_setup_summary(topo); + /* update the node's available processors */ + if (NULL != daemon->node->available) { + hwloc_bitmap_free(daemon->node->available); + } + daemon->node->available = prte_hwloc_base_filter_cpus(topo); /* cleanup */ PMIX_DATA_BUFFER_DESTRUCT(data); } @@ -1680,6 +1680,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); found = true; daemon->node->topology = t; + daemon->node->available = prte_hwloc_base_filter_cpus(t->topo); if (NULL != topo) { hwloc_topology_destroy(topo); } @@ -1698,7 +1699,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu daemon->node->topology = t; if (NULL != topo) { /* Apply any CPU filters (not preserved by the XML) */ - prte_hwloc_base_filter_cpus(topo); + daemon->node->available = prte_hwloc_base_filter_cpus(topo); t->topo = topo; } else { PRTE_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, diff --git a/src/mca/plm/base/plm_base_receive.c b/src/mca/plm/base/plm_base_receive.c index 54f5ea8230..13cbef5321 100644 --- a/src/mca/plm/base/plm_base_receive.c +++ b/src/mca/plm/base/plm_base_receive.c @@ -307,8 +307,6 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf } else { jdata->bookmark = parent->bookmark; } - /* provide the parent's last object */ - jdata->bkmark_obj = parent->bkmark_obj; } if (!prte_dvm_ready) { diff --git a/src/mca/ras/simulator/ras_sim_module.c b/src/mca/ras/simulator/ras_sim_module.c index 6b9ab97cc3..6b9e02ed19 100644 --- a/src/mca/ras/simulator/ras_sim_module.c +++ 
b/src/mca/ras/simulator/ras_sim_module.c @@ -55,7 +55,6 @@ static int allocate(prte_job_t *jdata, pmix_list_t *nodes) char prefix[6]; bool use_hwthread_cpus = false; hwloc_obj_t root; - prte_hwloc_topo_data_t *rdata; hwloc_cpuset_t available, mycpus; node_cnt = pmix_argv_split(prte_ras_simulator_component.num_nodes, ','); @@ -98,6 +97,11 @@ static int allocate(prte_job_t *jdata, pmix_list_t *nodes) return PRTE_ERR_NOT_FOUND; } topo = t->topo; + if (NULL != job_cpuset) { + available = prte_hwloc_base_generate_cpuset(topo, use_hwthread_cpus, job_cpuset); + } else { + available = prte_hwloc_base_filter_cpus(topo); + } /* process the request */ for (n = 0; NULL != node_cnt[n]; n++) { @@ -112,22 +116,6 @@ static int allocate(prte_job_t *jdata, pmix_list_t *nodes) /* set the prefix for this group of nodes */ prefix[4] += n; - /* get the available processors on this node */ - root = hwloc_get_root_obj(topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - - if (NULL != job_cpuset) { - mycpus = prte_hwloc_base_generate_cpuset(topo, use_hwthread_cpus, job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } - for (i = 0; i < num_nodes; i++) { node = PMIX_NEW(prte_node_t); pmix_asprintf(&node->name, "%s%0*d", prefix, dig, i); @@ -151,10 +139,11 @@ static int allocate(prte_job_t *jdata, pmix_list_t *nodes) prte_output_verbose(1, prte_ras_base_framework.framework_output, "Created Node <%10s> [%3d : %3d]", node->name, node->slots, node->slots_max); + node->available = hwloc_bitmap_dup(available); pmix_list_append(nodes, &node->super); } - hwloc_bitmap_free(available); } + hwloc_bitmap_free(available); /* record the number of allocated nodes */ prte_num_allocated_nodes = pmix_list_get_size(nodes); diff --git a/src/mca/rmaps/base/Makefile.am 
b/src/mca/rmaps/base/Makefile.am index f9eeae8846..1c728170c7 100644 --- a/src/mca/rmaps/base/Makefile.am +++ b/src/mca/rmaps/base/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. # Copyright (c) 2015-2019 Intel, Inc. All rights reserved. +# Copyright (c) 2022 Nanook Consulting. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -31,8 +32,7 @@ libmca_rmaps_la_SOURCES += \ base/rmaps_base_support_fns.c \ base/rmaps_base_ranking.c \ base/rmaps_base_print_fns.c \ - base/rmaps_base_binding.c \ - base/rmaps_base_assign_locations.c + base/rmaps_base_binding.c dist_prtedata_DATA = base/help-prte-rmaps-base.txt diff --git a/src/mca/rmaps/base/base.h b/src/mca/rmaps/base/base.h index d68153268f..adef9d7111 100644 --- a/src/mca/rmaps/base/base.h +++ b/src/mca/rmaps/base/base.h @@ -75,6 +75,7 @@ typedef struct { /* default file for use in sequential and rankfile mapping * when the directive comes thru MCA param */ char *file; + hwloc_cpuset_t available, baseset; // scratch for binding calculation } prte_rmaps_base_t; /** @@ -97,7 +98,6 @@ PMIX_CLASS_DECLARATION(prte_rmaps_base_selected_module_t); * Map a job */ PRTE_EXPORT void prte_rmaps_base_map_job(int sd, short args, void *cbdata); -PRTE_EXPORT int prte_rmaps_base_assign_locations(prte_job_t *jdata); /** * Utility routines to get/set vpid mapping for the job @@ -118,15 +118,34 @@ PRTE_EXPORT int prte_rmaps_base_filter_nodes(prte_app_context_t *app, pmix_list_ bool remove); PRTE_EXPORT int prte_rmaps_base_set_default_mapping(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); PRTE_EXPORT int prte_rmaps_base_set_mapping_policy(prte_job_t *jdata, char *spec); PRTE_EXPORT int prte_rmaps_base_set_default_ranking(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); PRTE_EXPORT int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec); PRTE_EXPORT void 
prte_rmaps_base_display_map(prte_job_t *jdata); +PRTE_EXPORT bool prte_rmaps_base_check_avail(prte_job_t *jdata, + prte_app_context_t *app, + prte_node_t *node, + pmix_list_t *node_list, + hwloc_obj_t obj, + prte_rmaps_options_t *options); + +PRTE_EXPORT int prte_rmaps_base_check_oversubscribed(prte_job_t *jdata, + prte_app_context_t *app, + prte_node_t *node); + +PRTE_EXPORT void prte_rmaps_base_get_cpuset(prte_job_t *jdata, + prte_node_t *node, + prte_rmaps_options_t *options); + +PRTE_EXPORT int prte_rmaps_base_check_support(prte_job_t *jdata, + prte_node_t *node, + prte_rmaps_options_t *options); + END_C_DECLS #endif diff --git a/src/mca/rmaps/base/help-prte-rmaps-base.txt b/src/mca/rmaps/base/help-prte-rmaps-base.txt index fc2ebb2a77..244fbb54a1 100644 --- a/src/mca/rmaps/base/help-prte-rmaps-base.txt +++ b/src/mca/rmaps/base/help-prte-rmaps-base.txt @@ -21,7 +21,7 @@ # # $HEADER$ # -# This is the US/English general help file for PRTE's prun. +# This is the US/English general help file for PRRTE's prun. # [prte-rmaps-base:alloc-error] There are not enough slots available in the system to satisfy the %d @@ -32,9 +32,9 @@ slots that were requested by the application: Either request fewer slots for your application, or make more slots available for use. -A "slot" is the PRTE term for an allocatable unit where we can +A "slot" is the PRRTE term for an allocatable unit where we can launch a process. The number of slots available are defined by the -environment in which PRTE processes are run: +environment in which PRRTE processes are run: 1. Hostfile, via "slots=N" clauses (N defaults to number of processor cores if not provided) @@ -42,9 +42,9 @@ environment in which PRTE processes are run: hostname (N defaults to 1 if not provided) 3. Resource manager (e.g., SLURM, PBS/Torque, LSF, etc.) 4. 
If none of a hostfile, the --host command line parameter, or an - RM is present, PRTE defaults to the number of processor cores + RM is present, PRRTE defaults to the number of processor cores -In all the above cases, if you want PRTE to default to the number +In all the above cases, if you want PRRTE to default to the number of hardware threads instead of the number of processor cores, use the --use-hwthread-cpus option. @@ -180,7 +180,7 @@ support binding processes to cpus. Node: %s -PRTE uses the "hwloc" library to perform process and memory +PRRTE uses the "hwloc" library to perform process and memory binding. This error message means that hwloc has indicated that processor binding support is not available on this machine. @@ -190,8 +190,8 @@ the OS does not expose this functionality). On Linux, lack of the functionality can mean that you are on a platform where processor and memory affinity is not supported in Linux itself, or that hwloc was built without NUMA and/or processor affinity -support. When building hwloc (which, depending on your PRTE -installation, may be embedded in PRTE itself), it is important to +support. When building hwloc (which, depending on your PRRTE +installation, may be embedded in PRRTE itself), it is important to have the libnuma header and library files available. Different linux distributions package these files under different names; look for packages with the word "numa" in them. You may also need a developer @@ -211,7 +211,7 @@ support binding memory to the process location. Node: %s -PRTE uses the "hwloc" library to perform process and memory +PRRTE uses the "hwloc" library to perform process and memory binding. This error message means that hwloc has indicated that processor binding support is not available on this machine. @@ -221,8 +221,8 @@ the OS does not expose this functionality). 
On Linux, lack of the functionality can mean that you are on a platform where processor and memory affinity is not supported in Linux itself, or that hwloc was built without NUMA and/or processor affinity -support. When building hwloc (which, depending on your PRTE -installation, may be embedded in PRTE itself), it is important to +support. When building hwloc (which, depending on your PRRTE +installation, may be embedded in PRRTE itself), it is important to have the libnuma header and library files available. Different linux distributions package these files under different names; look for packages with the word "numa" in them. You may also need a developer @@ -245,7 +245,7 @@ support binding memory to the process location. Node: %s -PRTE uses the "hwloc" library to perform process and memory +PRRTE uses the "hwloc" library to perform process and memory binding. This error message means that hwloc has indicated that processor binding support is not available on this machine. @@ -255,8 +255,8 @@ the OS does not expose this functionality). On Linux, lack of the functionality can mean that you are on a platform where processor and memory affinity is not supported in Linux itself, or that hwloc was built without NUMA and/or processor affinity -support. When building hwloc (which, depending on your PRTE -installation, may be embedded in PRTE itself), it is important to +support. When building hwloc (which, depending on your PRRTE +installation, may be embedded in PRRTE itself), it is important to have the libnuma header and library files available. Different linux distributions package these files under different names; look for packages with the word "numa" in them. You may also need a developer @@ -269,7 +269,7 @@ platform. If the OS/platform does actually support processor / memory affinity, then you should contact the hwloc maintainers: https://github.com/open-mpi/hwloc. 
-The provided memory binding policy requires that PRTE abort the +The provided memory binding policy requires that PRRTE abort the job at this time. # [rmaps:no-bindable-objects] @@ -326,7 +326,7 @@ been deprecated and replaced as follows: Deprecated: %s Replacement: %s -The deprecated forms *will* disappear in a future version of PRTE. +The deprecated forms *will* disappear in a future version of PRRTE. Please update to the new syntax. # [mismatch-binding] @@ -557,3 +557,222 @@ attempting to map a job that lacks an assigned personality. Job: %s Please report this to the PRRTE developers (https://github.com/openpmix/prrte/issues) +# +[unsupported-combination] +A %s policy was provided that is not supported in combination +with the "PE=N" option: + + Policy: %s + +When specifying the number of CPUs to use for each process in a job, +the processes must be bound at the CPU level - either HWThread if +"HWTCPUS" was specified, or CORE. Please remove the bind policy +specification and try again. +# +[unsupported-mapping-combo] +A mapping policy was provided that is not supported in combination +with the "PE-LIST=x,y,z" option: + + Policy: %s + +When specifying the CPUs to be used for the job, the mapper can +only look at those CPUs. It is therefore not possible to map +the job according to some other target object. Please either +remove the mapping policy or change it to "slot" and try again. 
+# +[map-by-option] +Processes are mapped in a round-robin fashion based on +one of the following directives as applied at the job level: + +- SLOT assigns procs to each node up to the number of available + slots on that node before moving to the next node in the + allocation + +- HWTHREAD assigns a proc to each hardware thread on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- CORE (default) assigns a proc to each core on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- L1CACHE assigns a proc to each L1 cache on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- L2CACHE assigns a proc to each L2 cache on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- L3CACHE assigns a proc to each L3 cache on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- NUMA assigns a proc to each NUMA region on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- PACKAGE assigns a proc to each package on a node in a + round-robin manner up to the number of available slots on that + node before moving to the next node in the allocation + +- NODE assigns processes in a round-robin fashion to all nodes + in the allocation, with the number assigned to each node capped + by the number of available slots on that node + +- SEQ (often accompanied by the file= qualifier) assigns + one process to each node specified in the file. The sequential + file is to contain an entry for each desired process, one per + line of the file. 
+ +- PPR:N:resource maps N procs to each instance of the specified + resource type in the allocation + +- RANKFILE (often accompanied by the file= qualifier) assigns + one process to the node/resource specified in each entry of the + file, one per line of the file. + +- PE-LIST=a,b assigns procs to each node in the allocation based on + the ORDERED qualifier. The list is comprised of comma-delimited + ranges of CPUs to use for this job. If the ORDERED qualifier is not + provided, then each node will be assigned procs up to the number of + available slots, capped by the availability of the specified CPUs. + If ORDERED is given, then one proc will be assigned to each of the + specified CPUs, if available, capped by the number of slots on each + node and the total number of specified processes. Providing the + OVERLOAD qualifier to the "bind-to" option removes the check on + availability of the CPU in both cases. + +Any directive can include qualifiers by adding a colon (:) and any +combination of one or more of the following to the --map-by option +(except where noted): + +- PE=n bind n processing elements to each process (can not be + used in combination with rankfile or pe-list directives) + +- SPAN load balance the processes across the allocation by treating + the allocation as a single "super-node" (can not be used in + combination with slot, node, seq, ppr, rankfile, or + pe-list directives) + +- OVERSUBSCRIBE allow more processes on a node than processing elements +- NOOVERSUBSCRIBE means !OVERSUBSCRIBE + +- NOLOCAL do not launch processes on the same node as prun + +- HWTCPUS use hardware threads as CPU slots + +- CORECPUS use cores as CPU slots (default) + +- INHERIT +- NOINHERIT means !INHERIT + +- FILE= (path to file containing sequential or rankfile entries). 
+ +- ORDERED only applies to the PE-LIST option to indicate that procs + are to be bound to each of the specified CPUs in the order + in which they are assigned (i.e., the first proc on a node shall + be bound to the first CPU in the list, the second proc shall be + bound to the second CPU, etc.) + +- DISPLAY outputs a table showing the mapped location of each process + prior to launch. + +- DISPLAYALLOC outputs the detected allocation of resources (e.g., nodes, + slots) for the job + +- DONOTLAUNCH directs PRRTE to map but not launch the specified job. + This is provided to help explore possible process placement patterns + before actually starting execution. + +Note that directives and qualifiers are case-insensitive +and can be shortened to the minimum number of characters +to uniquely identify them. Thus, "L1CACHE" can be given +as "l1cache" or simply as "L1". + +A more detailed description of the mapping, ranking, and binding procedure +can be obtained via the "--help placement" option. +# +[rank-by-option] +By default, process ranks are assigned in accordance with the mapping +directive - e.g., jobs that are mapped by-node will have the process +ranks assigned round-robin on a per-node basis. However, users can override +the default by specifying any of the following directives using the +--rank-by command line option: + +- SLOT assigns ranks to each process on a node in the order in + which the mapper assigned them. This is the default behavior, + but is provided as an explicit option to allow users to override + any alternative default specified in the environment. When mapping + to a specific resource type, procs assigned to a given instance + of that resource on a node will be ranked on a per-resource basis + on that node before moving to the next node. 
+ +- NODE assigns ranks round-robin on a per-node basis + +- FILL assigns ranks to procs mapped to a particular resource type + on each node, filling all ranks on that resource before moving to + the next resource on that node. For example, procs mapped by-L1cache + would have all procs on the first L1cache ranked sequentially before + moving to the second L1cache on the node. Once all procs on the + node have been ranked, ranking would continue on the next node. + +- SPAN assigns ranks round-robin to procs mapped to a particular + resource type, treating the collection of resource instances + spanning the entire allocation as a single "super node" before + looping around for the next pass. Thus, ranking would begin with + the first proc on the first L1cache on the first node, then the + next rank would be assigned to the first proc on the second L1cache + on that node, proceeding across until the first proc had been + ranked on all L1caches used by the job before circling around to + rank the second proc on each object. + +The rank-by command line option has no qualifiers. Note that directives +are case-insensitive. + +A more detailed description of the mapping, ranking, and binding procedure +can be obtained via the "--help placement" option. +# +[must-map-by-obj] +When ranking by FILL or by SPAN, you must map by an object +type (e.g., HWTHREAD, NUMA, or cache level): + + Map policy: %s + Rank policy: %s + +Please specify a supported combination and try again. +# +[out-of-resource] +Either there are not enough slots available in the system to launch +the %d processes that were requested by the application, or there are +not enough CPUs to bind them as requested: + + App: %s + Mapping: %s + Binding: %s + +Either request fewer processes for your application, make more slots +available for use by expanding the allocation, or do not bind the +processes so that the number of CPUs is no longer a limiting factor. 
+ +A "slot" is the PRRTE term for an allocatable unit where we can +launch a process. The number of slots available is defined by the +environment in which PRRTE processes are run: + + 1. Hostfile, via "slots=N" clauses (N defaults to number of + processor cores if not provided) + 2. The --host command line parameter, via a ":N" suffix on the + hostname (N defaults to 1 if not provided) + 3. Resource manager (e.g., SLURM, PBS/Torque, LSF, etc.) + 4. If none of a hostfile, the --host command line parameter, or an + RM is present, PRRTE defaults to the number of processor cores + +In all the above cases, if you want PRRTE to default to the number +of hardware threads instead of the number of processor cores, use the +--use-hwthread-cpus option. + +Alternatively, you can use the --map-by :OVERSUBSCRIBE option to ignore the +number of available slots when deciding the number of processes to +launch. Similarly, you can use the --bind-to :OVERLOAD option to bind +more than one process to a CPU, if desired, or --bind-to NONE to avoid +binding altogether. diff --git a/src/mca/rmaps/base/rmaps_base_assign_locations.c b/src/mca/rmaps/base/rmaps_base_assign_locations.c deleted file mode 100644 index 3552c91708..0000000000 --- a/src/mca/rmaps/base/rmaps_base_assign_locations.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. - * All rights reserved. 
- * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2016 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include - -#include "src/mca/base/base.h" -#include "src/mca/mca.h" -#include "src/util/output.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/runtime/prte_globals.h" -#include "src/util/pmix_show_help.h" - -#include "src/mca/rmaps/base/base.h" -#include "src/mca/rmaps/base/rmaps_private.h" - -int prte_rmaps_base_assign_locations(prte_job_t *jdata) -{ - int rc; - prte_rmaps_base_selected_module_t *mod; - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: assigning locations for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - - /* cycle thru the available mappers until one agrees to assign - * locations for the job - */ - if (1 == pmix_list_get_size(&prte_rmaps_base.selected_modules)) { - /* forced selection */ - mod = (prte_rmaps_base_selected_module_t *) pmix_list_get_first( - &prte_rmaps_base.selected_modules); - jdata->map->req_mapper = strdup(mod->component->mca_component_name); - } - PMIX_LIST_FOREACH(mod, &prte_rmaps_base.selected_modules, prte_rmaps_base_selected_module_t) - { - if (NULL == mod->module->assign_locations) { - continue; - } - if (PRTE_SUCCESS == (rc = mod->module->assign_locations(jdata))) { - return rc; - } - /* mappers return "next option" if they didn't attempt to - * process the job. anything else is a true error. 
- */ - if (PRTE_ERR_TAKE_NEXT_OPTION != rc) { - PRTE_ERROR_LOG(rc); - return rc; - } - } - - /* if we get here without doing the assignments, then that's an error */ - pmix_show_help("help-prte-rmaps-base.txt", "failed-assignments", true, - prte_process_info.nodename, prte_rmaps_base_print_mapping(jdata->map->mapping)); - return PRTE_ERROR; -} diff --git a/src/mca/rmaps/base/rmaps_base_binding.c b/src/mca/rmaps/base/rmaps_base_binding.c index 798b0cfa27..ef4c82aa04 100644 --- a/src/mca/rmaps/base/rmaps_base_binding.c +++ b/src/mca/rmaps/base/rmaps_base_binding.c @@ -52,912 +52,279 @@ #include "src/mca/rmaps/base/base.h" #include "src/mca/rmaps/base/rmaps_private.h" -static bool membind_warned = false; - -/* CRITICAL NOTE: the hwloc topology tree is in a shared memory - * region that is passed to the applications for their use. HWLOC - * does NOT provide any locking support in this shmem region. Thus, - * it is critical that the topology tree information itself remain - * unmodified. - * - * We can, however, fiddle with the userdata attached to an object - * in the topology tree because the applications that might also - * be attached to the shared memory region don't have visibility - * into the userdata. They also cannot conflict with us as they - * cannot write into the shared memory region. So we leave the - * topology itself untouched (critical!) 
and confine ourselves - * to recording usage etc in the userdata object */ - -static void reset_usage(prte_node_t *node, pmix_nspace_t jobid) -{ - int j; - prte_proc_t *proc; - prte_hwloc_obj_data_t *data = NULL; - hwloc_obj_t bound; - - prte_output_verbose(10, prte_rmaps_base_framework.framework_output, - "%s reset_usage: node %s has %d procs on it", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), node->name, node->num_procs); - - /* start by clearing any existing proc binding - * records from the userdata in this topo */ - prte_hwloc_base_clear_usage(node->topology->topo); - - /* cycle thru the procs on the node and record - * their usage in the topology - */ - for (j = 0; j < node->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from this job */ - if (PMIX_CHECK_NSPACE(proc->name.nspace, jobid)) { - prte_output_verbose(10, prte_rmaps_base_framework.framework_output, - "%s reset_usage: ignoring proc %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name)); - continue; - } - bound = NULL; - /* get the object to which this proc is bound */ - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_BOUND, (void **) &bound, - PMIX_POINTER) - || NULL == bound) { - /* this proc isn't bound - ignore it */ - prte_output_verbose(10, prte_rmaps_base_framework.framework_output, - "%s reset_usage: proc %s has no bind location", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name)); - continue; - } - /* get the userdata struct for this object - create it if necessary */ - data = (prte_hwloc_obj_data_t *) bound->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - bound->userdata = data; - } - /* count that this proc is bound to this object */ - data->num_bound++; - prte_output_verbose(10, prte_rmaps_base_framework.framework_output, - "%s reset_usage: proc %s is bound - total %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), 
PRTE_NAME_PRINT(&proc->name), - data->num_bound); - } -} - -static void unbind_procs(prte_job_t *jdata) -{ - int j; - prte_proc_t *proc; - - for (j = 0; j < jdata->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, j))) { - continue; - } - prte_remove_attribute(&proc->attributes, PRTE_PROC_HWLOC_BOUND); - prte_remove_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP); - } -} - -static int bind_generic(prte_job_t *jdata, prte_node_t *node, int target_depth) +static int bind_generic(prte_job_t *jdata, prte_proc_t *proc, + prte_node_t *node, hwloc_obj_t obj, + prte_rmaps_options_t *options) { - int j; - prte_job_map_t *map; - prte_proc_t *proc; - hwloc_obj_t trg_obj, tmp_obj, nxt_obj; - unsigned int ncpus; - prte_hwloc_obj_data_t *data; - int total_cpus, cpus_per_rank; - hwloc_cpuset_t totalcpuset, available, mycpus; - hwloc_obj_t locale; - char *cpu_bitmap, *job_cpuset; - unsigned min_bound; - bool dobind, use_hwthread_cpus; - struct hwloc_topology_support *support; - hwloc_obj_t root; - prte_hwloc_topo_data_t *rdata; - uint16_t u16, *u16ptr = &u16; + hwloc_obj_t trg_obj, tmp_obj; + unsigned ncpus; + uint16_t n; + unsigned minload; + hwloc_obj_type_t type; + hwloc_obj_t target, nxt; + hwloc_cpuset_t tgtcpus, tmpcpus; + char *t1, *t2; prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: bind downward for job %s with bindings %s", - PRTE_JOBID_PRINT(jdata->nspace), + "mca:rmaps: bind %s with policy %s", + PRTE_NAME_PRINT(&proc->name), prte_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ - map = jdata->map; - totalcpuset = hwloc_bitmap_alloc(); - - dobind = false; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { - dobind = true; - } - /* reset usage */ - 
reset_usage(node, jdata->nspace); - - /* get the available processors on this node */ - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - - /* see if they want multiple cpus/rank */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, - PMIX_UINT16)) { - cpus_per_rank = u16; + if (NULL == obj) { + target = hwloc_get_root_obj(node->topology->topo); } else { - cpus_per_rank = 1; + target = obj; } - - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; + if (NULL == options->target) { + return PRTE_ERROR; } +#if HWLOC_API_VERSION < 0x20000 + tgtcpus = target->allowed_cpuset; +#else + tgtcpus = target->cpuset; +#endif + hwloc_bitmap_and(prte_rmaps_base.baseset, options->target, tgtcpus); - /* see if this job has a "soft" cgroup assignment */ - job_cpuset = NULL; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING) - && NULL != job_cpuset) { - mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } + hwloc_bitmap_list_asprintf(&t1, prte_rmaps_base.baseset); + hwloc_bitmap_list_asprintf(&t2, node->available); - /* cycle thru the procs */ - for (j = 0; j < node->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - if ((int) PRTE_PROC_MY_NAME->rank != node->index && !dobind) { - continue; - } - - if 
(!prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { - /* if we don't want to launch, then we are just testing the system, - * so ignore questions about support capabilities - */ - support = (struct hwloc_topology_support *) hwloc_topology_get_support( - node->topology->topo); - /* check if topology supports cpubind - have to be careful here - * as Linux doesn't currently support thread-level binding. This - * may change in the future, though, and it isn't clear how hwloc - * interprets the current behavior. So check both flags to be sure. - */ - if (!support->cpubind->set_thisproc_cpubind - && !support->cpubind->set_thisthread_cpubind) { - if (!PRTE_BINDING_REQUIRED(map->binding) - || !PRTE_BINDING_POLICY_IS_SET(map->binding)) { - /* we are not required to bind, so ignore this */ - continue; - } - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, - node->name); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* check if topology supports membind - have to be careful here - * as hwloc treats this differently than I (at least) would have - * expected. Per hwloc, Linux memory binding is at the thread, - * and not process, level. 
Thus, hwloc sets the "thisproc" flag - * to "false" on all Linux systems, and uses the "thisthread" flag - * to indicate binding capability - don't warn if the user didn't - * specifically request binding + trg_obj = NULL; + /* find the first object of that type in the target that has at least one available CPU */ + tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo, + prte_rmaps_base.baseset, + options->hwb, NULL); + while (NULL != tmp_obj) { +#if HWLOC_API_VERSION < 0x20000 + tmpcpus = tmp_obj->allowed_cpuset; +#else + tmpcpus = tmp_obj->cpuset; +#endif + hwloc_bitmap_and(prte_rmaps_base.available, node->available, tmpcpus); + if (options->use_hwthreads) { + ncpus = hwloc_bitmap_weight(prte_rmaps_base.available); + } else { + /* if we are treating cores as cpus, then we really + * want to know how many cores are in this object. + * hwloc sets a bit for each "pu", so we can't just + * count bits in this case as there may be more than + * one hwthread/core. Instead, find the number of cores + * under the object */ - if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind - && PRTE_BINDING_POLICY_IS_SET(map->binding)) { - if (PRTE_HWLOC_BASE_MBFA_WARN == prte_hwloc_base_mbfa && !membind_warned) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported", true, - node->name); - membind_warned = true; - } else if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", - true, node->name); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - } + ncpus = hwloc_get_nbobjs_inside_cpuset_by_type(node->topology->topo, + prte_rmaps_base.available, + HWLOC_OBJ_CORE); } - - /* bozo check */ - locale = NULL; - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, (void **) &locale, PMIX_POINTER) || - NULL == locale) { - 
pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-locale", true, - PRTE_NAME_PRINT(&proc->name)); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; + if (0 < ncpus) { + trg_obj = tmp_obj; + break; } + tmp_obj = hwloc_get_next_obj_inside_cpuset_by_type(node->topology->topo, + prte_rmaps_base.baseset, + options->hwb, tmp_obj); + } + if (NULL == trg_obj) { + /* there aren't any appropriate targets under this object */ + pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); + return PRTE_ERR_SILENT; + } - /* use the min_bound object that intersects locale->cpuset at target_depth */ - tmp_obj = NULL; - trg_obj = NULL; - min_bound = UINT_MAX; - while (NULL - != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, - tmp_obj))) { - if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset)) - continue; - - /* if there are no available cpus under this object, then ignore it */ - if (!hwloc_bitmap_intersects(available, tmp_obj->cpuset)) - continue; - - data = (prte_hwloc_obj_data_t *) tmp_obj->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - tmp_obj->userdata = data; - } - if (data->num_bound < min_bound) { - min_bound = data->num_bound; - trg_obj = tmp_obj; - } - } - if (NULL == trg_obj) { - /* there aren't any such targets under this object */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* record the location */ - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_BOUND, PRTE_ATTR_LOCAL, trg_obj, - PMIX_POINTER); - - /* start with a clean slate */ - hwloc_bitmap_zero(totalcpuset); - total_cpus = 0; - nxt_obj = trg_obj; - do { - if (NULL == nxt_obj) { - /* could not find enough cpus to meet 
request */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-available-cpus", true, - node->name); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - trg_obj = nxt_obj; - /* get the number of available cpus under this location */ - ncpus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, available, - trg_obj); - /* track the number bound */ - if (NULL == (data = (prte_hwloc_obj_data_t *) trg_obj->userdata)) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - trg_obj->userdata = data; - } - data->num_bound++; - /* error out if adding a proc would cause overload and that wasn't allowed, - * and it wasn't a default binding policy (i.e., the user requested it) - */ - if (ncpus < data->num_bound && !PRTE_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { - if (PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { - /* if the user specified a binding policy, then we cannot meet - * it since overload isn't allowed, so error out - have the - * message indicate that setting overload allowed will remove - * this restriction */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:binding-overload", true, - prte_hwloc_base_print_binding(map->binding), node->name, - data->num_bound, ncpus); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } else if (1 < cpus_per_rank) { - /* if the user specified cpus/proc, then we weren't able - * to meet that request - this constitutes an error that - * must be reported */ - pmix_show_help("help-prte-rmaps-base.txt", "insufficient-cpus-per-proc", true, - prte_hwloc_base_print_binding(map->binding), node->name, - (NULL != job_cpuset) ? job_cpuset - : (NULL == prte_hwloc_default_cpu_list) - ? 
"FULL" - : prte_hwloc_default_cpu_list, - cpus_per_rank); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } else { - /* if we have the default binding policy, then just don't bind */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s NOT ENOUGH CPUS TO COMPLETE BINDING - BINDING NOT " - "REQUIRED, REVERTING TO NOT BINDING", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - PRTE_SET_BINDING_POLICY(map->binding, PRTE_BIND_TO_NONE); - unbind_procs(jdata); - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_SUCCESS; - } - } - /* bind the proc here */ - hwloc_bitmap_or(totalcpuset, totalcpuset, trg_obj->cpuset); - /* track total #cpus */ - total_cpus += ncpus; - /* move to the next location, in case we need it */ - nxt_obj = trg_obj->next_cousin; - } while (total_cpus < cpus_per_rank); - hwloc_bitmap_list_asprintf(&cpu_bitmap, totalcpuset); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, "%s PROC %s BITMAP %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name), - cpu_bitmap); - prte_set_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP, PRTE_ATTR_GLOBAL, cpu_bitmap, - PMIX_STRING); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - if (4 < prte_output_get_verbosity(prte_rmaps_base_framework.framework_output)) { - char *tmp1; - tmp1 = prte_hwloc_base_cset2str(totalcpuset, use_hwthread_cpus, node->topology->topo); - prte_output(prte_rmaps_base_framework.framework_output, "%s BOUND PROC %s[%s] TO %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name), - node->name, tmp1); - free(tmp1); - } +#if HWLOC_API_VERSION < 0x20000 + tgtcpus = trg_obj->allowed_cpuset; +#else + tgtcpus = trg_obj->cpuset; +#endif + hwloc_bitmap_list_asprintf(&proc->cpuset, tgtcpus); // bind to the entire target object + if (4 < 
prte_output_get_verbosity(prte_rmaps_base_framework.framework_output)) { + char *tmp1; + tmp1 = prte_hwloc_base_cset2str(trg_obj->cpuset, options->use_hwthreads, node->topology->topo); + prte_output(prte_rmaps_base_framework.framework_output, "%s BOUND PROC %s[%s] TO %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name), + node->name, tmp1); + free(tmp1); } - hwloc_bitmap_free(totalcpuset); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); + + /* mark the assigned cpus as having been consumed. If we are binding to + * an object that has multiple CPUs, then we only mark one here to indicate + * that a process was bound to that object. This provides an accounting + * mechanism that lets us know when we become overloaded - i.e., more procs + * are bound to the object than there are CPUs. We must also mark the bits + * in the options->target cpuset so that the next proc we assign doesn't + * attempt to take the same location. */ + if (options->use_hwthreads) { + type = HWLOC_OBJ_PU; + } else { + type = HWLOC_OBJ_CORE; } + tmp_obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, + prte_rmaps_base.available, + type, 0); +#if HWLOC_API_VERSION < 0x20000 + hwloc_bitmap_andnot(node->available, node->available, tmp_obj->allowed_cpuset); + hwloc_bitmap_andnot(options->target, options->target, tmp_obj->allowed_cpuset); +#else + hwloc_bitmap_list_asprintf(&t2, tmp_obj->cpuset); + hwloc_bitmap_andnot(node->available, node->available, tmp_obj->cpuset); +// hwloc_bitmap_andnot(options->target, options->target, tmp_obj->cpuset); +#endif + hwloc_bitmap_list_asprintf(&t2, node->available); return PRTE_SUCCESS; } -static int bind_in_place(prte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) +static int bind_to_cpuset(prte_job_t *jdata, + prte_proc_t *proc, + prte_node_t *node, + prte_rmaps_options_t *options) { - /* traverse the hwloc topology tree on each node downwards - * until we find an unused object of type 
target - and then bind - * the process to that target - */ - int i, j; - prte_job_map_t *map; - prte_node_t *node; - prte_proc_t *proc; - unsigned int idx, ncpus; - struct hwloc_topology_support *support; - prte_hwloc_obj_data_t *data; - hwloc_obj_t locale, sib; - char *cpu_bitmap, *job_cpuset; - bool found, use_hwthread_cpus; - bool dobind; - int cpus_per_rank; - hwloc_cpuset_t available, mycpus; - hwloc_obj_t root; - prte_hwloc_topo_data_t *rdata; - uint16_t u16, *u16ptr = &u16; - PRTE_HIDE_UNUSED_PARAMS(target, cache_level); + /* bind each process to prte_hwloc_base_cpu_list */ + unsigned idx; + hwloc_bitmap_t tset; + hwloc_obj_t obj, root; + char **cpus; + hwloc_obj_type_t type; prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: bind in place for job %s with bindings %s", + "mca:rmaps: bind job %s to cpus %s %s", PRTE_JOBID_PRINT(jdata->nspace), - prte_hwloc_base_print_binding(jdata->map->binding)); - /* initialize */ - map = jdata->map; + options->cpuset, + options->ordered ? "ordered" : "not-ordered"); - dobind = false; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { - dobind = true; + if (NULL == options->cpuset) { + /* not enough cpus were specified */ + return PRTE_ERR_OUT_OF_RESOURCE; } - - /* see if this job has a "soft" cgroup assignment */ - job_cpuset = NULL; - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING)) { - job_cpuset = NULL; + cpus = pmix_argv_split(options->cpuset, ','); + /* take the first one */ + idx = strtoul(cpus[0], NULL, 10); + if (options->use_hwthreads) { + type = HWLOC_OBJ_PU; + } else { + type = HWLOC_OBJ_CORE; } + /* the CPU numbers would have been given to us based on the total + * available CPUs on the machine. 
Thus, we cannot use the node->available + * CPU set as we are removing CPUs for accounting purposes there. + * Instead, refer back to the root-level information */ + root = hwloc_get_root_obj(node->topology->topo); - /* see if they want multiple cpus/rank */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, PMIX_UINT16)) { - cpus_per_rank = u16; + if (options->ordered) { + /* assign each proc to the next available + * cpu in the list. Since we are assigning + * procs as they are mapped, this ensures they + * will be assigned in order */ +#if HWLOC_API_VERSION < 0x20000 + tset = root->allowed_cpuset; +#else + tset = root->cpuset; +#endif + obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, tset, type, idx); + if (NULL == obj) { + pmix_argv_free(cpus); + return PRTE_ERR_OUT_OF_RESOURCE; + } +#if HWLOC_API_VERSION < 0x20000 + tset = obj->allowed_cpuset; +#else + tset = obj->cpuset; +#endif } else { - cpus_per_rank = 1; + /* bind the proc to all assigned cpus */ + tset = options->target; } - - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; + /* bind to the specified cpuset */ + hwloc_bitmap_list_asprintf(&proc->cpuset, tset); + + /* remove one of the CPUs from the cpuset to indicate that + * we assigned a proc to this range */ + free(options->cpuset); + if (NULL == cpus[1]) { + options->cpuset = NULL; } else { - use_hwthread_cpus = false; + options->cpuset = pmix_argv_join(&cpus[1], ','); } + pmix_argv_free(cpus); - for (i = 0; i < map->nodes->size; i++) { - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(map->nodes, i))) { - continue; - } - if ((int) PRTE_PROC_MY_NAME->rank != node->index && !dobind) { - continue; - } - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { - /* if we don't want to launch, then we are just testing the system, - * so ignore questions 
about support capabilities - */ - support = (struct hwloc_topology_support *) hwloc_topology_get_support( - node->topology->topo); - /* check if topology supports cpubind - have to be careful here - * as Linux doesn't currently support thread-level binding. This - * may change in the future, though, and it isn't clear how hwloc - * interprets the current behavior. So check both flags to be sure. - */ - if (!support->cpubind->set_thisproc_cpubind - && !support->cpubind->set_thisthread_cpubind) { - if (!PRTE_BINDING_REQUIRED(map->binding) - || !PRTE_BINDING_POLICY_IS_SET(map->binding)) { - /* we are not required to bind, so ignore this */ - continue; - } - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, - node->name); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* check if topology supports membind - have to be careful here - * as hwloc treats this differently than I (at least) would have - * expected. Per hwloc, Linux memory binding is at the thread, - * and not process, level. 
Thus, hwloc sets the "thisproc" flag - * to "false" on all Linux systems, and uses the "thisthread" flag - * to indicate binding capability - don't warn if the user didn't - * specifically request binding - */ - if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind - && PRTE_BINDING_POLICY_IS_SET(map->binding)) { - if (PRTE_HWLOC_BASE_MBFA_WARN == prte_hwloc_base_mbfa && !membind_warned) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported", true, - node->name); - membind_warned = true; - } else if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", - true, node->name); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - } - } - - /* some systems do not report cores, and so we can get a situation where our - * default binding policy will fail for no necessary reason. So if we are - * computing a binding due to our default policy, and no cores are found - * on this node, just silently skip it - we will not bind - */ - if (!PRTE_BINDING_POLICY_IS_SET(map->binding) - && HWLOC_TYPE_DEPTH_UNKNOWN - == hwloc_get_type_depth(node->topology->topo, HWLOC_OBJ_CORE)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "Unable to bind-to core by default on node %s as no cores detected", - node->name); - continue; - } - - /* we share topologies in order - * to save space, so we need to reset the usage info to reflect - * our own current state - */ - reset_usage(node, jdata->nspace); - /* get the available processors on this node */ - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - if (NULL != job_cpuset) { - 
mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } - /* cycle thru the procs */ - for (j = 0; j < node->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - /* bozo check */ - locale = NULL; - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, (void **) &locale, PMIX_POINTER)) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-locale", true, - PRTE_NAME_PRINT(&proc->name)); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* get the index of this location */ - if (UINT_MAX == (idx = prte_hwloc_base_get_obj_idx(node->topology->topo, locale))) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* get the number of cpus under this location */ - if (0 == (ncpus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, - available, locale))) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-available-cpus", true, - node->name); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - data = (prte_hwloc_obj_data_t *) locale->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - locale->userdata = data; - } - /* if we don't have enough cpus to support this additional proc, try - * shifting the location to a cousin that can support it - the important - * thing is that we maintain the same level in the topology */ - if (ncpus < (data->num_bound + cpus_per_rank)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s bind_in_place: searching right", - 
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - sib = locale; - found = false; - while (NULL != (sib = sib->next_cousin)) { - ncpus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, - available, sib); - data = (prte_hwloc_obj_data_t *) sib->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - sib->userdata = data; - } - if ((data->num_bound + cpus_per_rank) <= ncpus) { - found = true; - locale = sib; - break; - } - } - if (!found) { - /* try the other direction */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s bind_in_place: searching left", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - sib = locale; - while (NULL != (sib = sib->prev_cousin)) { - ncpus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, - available, sib); - data = (prte_hwloc_obj_data_t *) sib->userdata; - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - sib->userdata = data; - } - if ((data->num_bound + cpus_per_rank) <= ncpus) { - found = true; - locale = sib; - break; - } - } - } - if (!found) { - /* no place to put this - see if overload is allowed */ - if (!PRTE_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { - if (PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { - /* if the user specified a binding policy, then we cannot meet - * it since overload isn't allowed, so error out - have the - * message indicate that setting overload allowed will remove - * this restriction */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:binding-overload", - true, prte_hwloc_base_print_binding(map->binding), - node->name, data->num_bound, ncpus); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } else if (1 < cpus_per_rank) { - /* if the user specified cpus/proc, then we weren't able - * to meet that request - this constitutes an error that - * must be reported */ - pmix_show_help("help-prte-rmaps-base.txt", "insufficient-cpus-per-proc", - true, 
prte_hwloc_base_print_binding(map->binding), - node->name, - (NULL != job_cpuset) ? job_cpuset - : (NULL == prte_hwloc_default_cpu_list) - ? "FULL" - : prte_hwloc_default_cpu_list, - cpus_per_rank); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } else { - /* if we have the default binding policy, then just don't bind */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s NOT ENOUGH CPUS TO COMPLETE BINDING - BINDING " - "NOT REQUIRED, REVERTING TO NOT BINDING", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)); - unbind_procs(jdata); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_SUCCESS; - } - } - } - } - /* track the number bound */ - data = (prte_hwloc_obj_data_t *) locale->userdata; // just in case it changed - if (NULL == data) { - data = PMIX_NEW(prte_hwloc_obj_data_t); - locale->userdata = data; - } - data->num_bound++; - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "BINDING PROC %s TO %s NUMBER %u", PRTE_NAME_PRINT(&proc->name), - hwloc_obj_type_string(locale->type), idx); - /* bind the proc here, masking it to any "soft" cgroup the user provided */ - mycpus = hwloc_bitmap_alloc(); - hwloc_bitmap_and(mycpus, available, locale->cpuset); - hwloc_bitmap_list_asprintf(&cpu_bitmap, mycpus); - hwloc_bitmap_free(mycpus); - prte_set_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP, PRTE_ATTR_GLOBAL, - cpu_bitmap, PMIX_STRING); - /* update the location, in case it changed */ - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_BOUND, PRTE_ATTR_LOCAL, locale, - PMIX_POINTER); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s BOUND PROC %s TO %s[%s:%u] on node %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name), - cpu_bitmap, hwloc_obj_type_string(locale->type), idx, node->name); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - } - hwloc_bitmap_free(available); - } 
- if (NULL != job_cpuset) { - free(job_cpuset); + /* mark that we used ONE of these cpus - we do this each time + * the cpuset is assigned to a proc. When all the cpus in the + * set have been removed, we know that the set will be overloaded + * if any more procs are assigned to it. */ +#if HWLOC_API_VERSION < 0x20000 + tset = root->allowed_cpuset; +#else + tset = root->cpuset; +#endif + obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, tset, type, idx); + if (NULL == obj) { + } else { +#if HWLOC_API_VERSION < 0x20000 + hwloc_bitmap_andnot(node->available, node->available, obj->allowed_cpuset); +#else + hwloc_bitmap_andnot(node->available, node->available, obj->cpuset); +#endif } + char *tmp; + hwloc_bitmap_list_asprintf(&tmp, node->available); + free(tmp); return PRTE_SUCCESS; } -static int bind_to_cpuset(prte_job_t *jdata) +static int bind_multiple(prte_job_t *jdata, prte_proc_t *proc, + prte_node_t *node, hwloc_obj_t obj, + prte_rmaps_options_t *options) { - /* bind each process to prte_hwloc_base_cpu_list */ - int i, j; - prte_job_map_t *map; - prte_node_t *node; - prte_proc_t *proc; - struct hwloc_topology_support *support; - prte_hwloc_topo_data_t *sum; - hwloc_obj_t root; - char *cpu_bitmap, *job_cpuset; - unsigned id; - prte_local_rank_t lrank; - hwloc_bitmap_t mycpuset, tset, mycpus; - bool dobind, use_hwthread_cpus; - uint16_t u16, *u16ptr = &u16, ncpus, cpus_per_rank; - - /* see if this job has a "soft" cgroup assignment */ - job_cpuset = NULL; - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING) - || NULL == job_cpuset) { - return PRTE_ERR_BAD_PARAM; - } - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: bind job %s to cpus %s", PRTE_JOBID_PRINT(jdata->nspace), - job_cpuset); + hwloc_obj_type_t type; + hwloc_cpuset_t available, result, tgtcpus; + hwloc_obj_t target, tmp_obj; + uint16_t n; /* initialize */ - map = jdata->map; - mycpuset = 
hwloc_bitmap_alloc(); - - dobind = false; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) - || prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { - dobind = true; - } - - /* see if they want multiple cpus/rank */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, - PMIX_UINT16)) { - cpus_per_rank = u16; + available = hwloc_bitmap_alloc(); + result = hwloc_bitmap_alloc(); + if (NULL == obj) { + target = hwloc_get_root_obj(node->topology->topo); } else { - cpus_per_rank = 1; + target = obj; } - - /* see if they want are using hwthreads as cpus */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; +#if HWLOC_API_VERSION < 0x20000 + tgtcpus = target->allowed_cpuset; +#else + tgtcpus = target->cpuset; +#endif + hwloc_bitmap_and(available, options->target, tgtcpus); + if (options->use_hwthreads) { + type = HWLOC_OBJ_PU; } else { - use_hwthread_cpus = false; + type = HWLOC_OBJ_CORE; } - for (i = 0; i < map->nodes->size; i++) { - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(map->nodes, i))) { - continue; - } - if ((int) PRTE_PROC_MY_NAME->rank != node->index && !dobind) { - continue; - } - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { - /* if we don't want to launch, then we are just testing the system, - * so ignore questions about support capabilities - */ - support = (struct hwloc_topology_support *) hwloc_topology_get_support( - node->topology->topo); - /* check if topology supports cpubind - have to be careful here - * as Linux doesn't currently support thread-level binding. This - * may change in the future, though, and it isn't clear how hwloc - * interprets the current behavior. So check both flags to be sure. 
- */ - if (!support->cpubind->set_thisproc_cpubind - && !support->cpubind->set_thisthread_cpubind) { - if (!PRTE_BINDING_REQUIRED(jdata->map->binding)) { - /* we are not required to bind, so ignore this */ - continue; - } - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, - node->name); - free(job_cpuset); - hwloc_bitmap_free(mycpuset); - return PRTE_ERR_SILENT; - } - /* check if topology supports membind - have to be careful here - * as hwloc treats this differently than I (at least) would have - * expected. Per hwloc, Linux memory binding is at the thread, - * and not process, level. Thus, hwloc sets the "thisproc" flag - * to "false" on all Linux systems, and uses the "thisthread" flag - * to indicate binding capability - */ - if (!support->membind->set_thisproc_membind - && !support->membind->set_thisthread_membind) { - if (PRTE_HWLOC_BASE_MBFA_WARN == prte_hwloc_base_mbfa && !membind_warned) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported", true, - node->name); - membind_warned = true; - } else if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", - true, node->name); - free(job_cpuset); - hwloc_bitmap_free(mycpuset); - return PRTE_ERR_SILENT; - } - } - } - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* something went wrong */ - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - free(job_cpuset); - hwloc_bitmap_free(mycpuset); - return PRTE_ERR_NOT_FOUND; - } - sum = (prte_hwloc_topo_data_t *) root->userdata; - if (NULL == sum->available) { - /* another error */ - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - free(job_cpuset); - hwloc_bitmap_free(mycpuset); - return PRTE_ERR_NOT_FOUND; - } - reset_usage(node, jdata->nspace); - hwloc_bitmap_zero(mycpuset); - - /* filter the node-available cpus against the specified "soft" cgroup */ - mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, 
use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(mycpus, mycpus, sum->available); - - for (j = 0; j < node->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - if (PRTE_BIND_ORDERED_REQUESTED(jdata->map->binding)) { - /* assign each proc, in local rank order, to - * the corresponding cpu in the list */ - id = hwloc_bitmap_first(mycpus); - lrank = 0; - while (lrank != proc->local_rank) { - ncpus = 0; - while ((unsigned) -1 != id && ncpus < cpus_per_rank) { - id = hwloc_bitmap_next(mycpus, id); - /* set the bit of interest */ - hwloc_bitmap_only(mycpuset, id); - ++ncpus; - } - if ((unsigned) -1 == id) { - break; - } - ++lrank; - } - if ((unsigned) -1 == id) { - /* ran out of cpus - that's an error */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:insufficient-cpus", true, - node->name, (int) proc->local_rank, job_cpuset); - free(job_cpuset); - hwloc_bitmap_free(mycpuset); - hwloc_bitmap_free(mycpus); - return PRTE_ERR_OUT_OF_RESOURCE; - } - tset = mycpuset; - } else { - /* bind the proc to all assigned cpus */ - tset = mycpus; - } - hwloc_bitmap_list_asprintf(&cpu_bitmap, tset); - prte_set_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP, PRTE_ATTR_GLOBAL, - cpu_bitmap, PMIX_STRING); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - hwloc_bitmap_free(mycpus); + /* we bind-to-cpu for the number of cpus that was specified, + * restricting ourselves to the available cpus in the object */ + for (n=0; n < options->cpus_per_rank; n++) { + tmp_obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, available, type, n); + if (NULL != tmp_obj) { +#if HWLOC_API_VERSION < 0x20000 + hwloc_bitmap_or(result, result, tmp_obj->allowed_cpuset); + hwloc_bitmap_andnot(node->available, node->available, tmp_obj->allowed_cpuset); + hwloc_bitmap_andnot(options->target, 
options->target, tmp_obj->allowed_cpuset); +#else + hwloc_bitmap_or(result, result, tmp_obj->cpuset); + hwloc_bitmap_andnot(node->available, node->available, tmp_obj->cpuset); + hwloc_bitmap_andnot(options->target, options->target, tmp_obj->cpuset); +#endif } } - hwloc_bitmap_free(mycpuset); - free(job_cpuset); + hwloc_bitmap_list_asprintf(&proc->cpuset, result); return PRTE_SUCCESS; } -int prte_rmaps_base_compute_bindings(prte_job_t *jdata) +int prte_rmaps_base_bind_proc(prte_job_t *jdata, + prte_proc_t *proc, + prte_node_t *node, + hwloc_obj_t obj, + prte_rmaps_options_t *options) { - hwloc_obj_type_t hwb; - unsigned clvl = 0; - prte_binding_policy_t bind; - prte_mapping_policy_t map; - prte_node_t *node; int i, rc; - struct hwloc_topology_support *support; - int bind_depth; bool dobind; prte_output_verbose(5, prte_rmaps_base_framework.framework_output, @@ -965,195 +332,57 @@ int prte_rmaps_base_compute_bindings(prte_job_t *jdata) PRTE_JOBID_PRINT(jdata->nspace), prte_hwloc_base_print_binding(jdata->map->binding), jdata->map->binding); - map = PRTE_GET_MAPPING_POLICY(jdata->map->mapping); - bind = PRTE_GET_BINDING_POLICY(jdata->map->binding); - - if (PRTE_MAPPING_BYUSER == map) { + if (PRTE_MAPPING_BYUSER == options->map) { /* user specified binding by rankfile - nothing for us to do */ return PRTE_SUCCESS; } - if (PRTE_BIND_TO_NONE == bind) { + if (PRTE_BIND_TO_NONE == options->bind) { rc = PRTE_SUCCESS; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, NULL, PMIX_STRING)) { + if (NULL != options->job_cpuset) { /* "soft" cgroup was given but no other * binding directive was provided, so bind * to those specific cpus */ - if (PRTE_SUCCESS != (rc = bind_to_cpuset(jdata))) { + if (PRTE_SUCCESS != (rc = bind_to_cpuset(jdata, proc, node, options))) { PRTE_ERROR_LOG(rc); } } return rc; } - /* binding requested - convert the binding level to the hwloc obj type */ - switch (bind) { - case PRTE_BIND_TO_PACKAGE: - hwb = HWLOC_OBJ_PACKAGE; - break; - case 
PRTE_BIND_TO_NUMA: - hwb = HWLOC_OBJ_NUMANODE; - break; - case PRTE_BIND_TO_L3CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(3, hwb, clvl); - break; - case PRTE_BIND_TO_L2CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(2, hwb, clvl); - break; - case PRTE_BIND_TO_L1CACHE: - PRTE_HWLOC_MAKE_OBJ_CACHE(1, hwb, clvl); - break; - case PRTE_BIND_TO_CORE: - hwb = HWLOC_OBJ_CORE; - break; - case PRTE_BIND_TO_HWTHREAD: - hwb = HWLOC_OBJ_PU; - break; - default: - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - - /* if the job was mapped by the corresponding target, then - * we bind in place - * - * otherwise, we have to bind either up or down the hwloc - * tree. If we are binding upwards (e.g., mapped to hwthread - * but binding to core), then we just climb the tree to find - * the first matching object. - * - * if we are binding downwards (e.g., mapped to node and bind - * to core), then we have to do a round-robin assigment of - * procs to the resources below. + /* some systems do not report cores, and so we can get a situation where our + * default binding policy will fail for no necessary reason. 
So if we are + * computing a binding due to our default policy, and no cores are found + * on this node, just silently skip it - we will not bind */ - - if (PRTE_MAPPING_BYDIST == map) { - /* bind every proc downwards */ - goto execute; + if (!PRTE_BINDING_POLICY_IS_SET(jdata->map->binding) && + HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology->topo, HWLOC_OBJ_CORE)) { + prte_output_verbose(5, prte_rmaps_base_framework.framework_output, + "Unable to bind-to core by default on node %s as no cores detected", + node->name); + return PRTE_SUCCESS; } - /* now deal with the remaining binding policies based on hardware */ - if (bind == map) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: bindings for job %s - bind in place", - PRTE_JOBID_PRINT(jdata->nspace)); - if (PRTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) { + if (PRTE_MAPPING_PELIST == options->map) { + rc = bind_to_cpuset(jdata, proc, node, options); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } return rc; } - /* we need to handle the remaining binding options on a per-node - * basis because different nodes could potentially have different - * topologies, with different relative depths for the two levels - */ -execute: - /* initialize */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing bindings for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - - dobind = false; - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) || - prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) || - prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { - dobind = true; - } - - for (i = 0; i < jdata->map->nodes->size; i++) { - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, i))) { - continue; - } - if ((int) PRTE_PROC_MY_NAME->rank != node->index && !dobind) { - continue; - } - if 
(!prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { - /* if we don't want to launch, then we are just testing the system, - * so ignore questions about support capabilities - */ - support = (struct hwloc_topology_support *) hwloc_topology_get_support( - node->topology->topo); - /* check if topology supports cpubind - have to be careful here - * as Linux doesn't currently support thread-level binding. This - * may change in the future, though, and it isn't clear how hwloc - * interprets the current behavior. So check both flags to be sure. - */ - if (!support->cpubind->set_thisproc_cpubind - && !support->cpubind->set_thisthread_cpubind) { - if (!PRTE_BINDING_REQUIRED(jdata->map->binding) - || !PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { - /* we are not required to bind, so ignore this */ - continue; - } - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, - node->name); - return PRTE_ERR_SILENT; - } - /* check if topology supports membind - have to be careful here - * as hwloc treats this differently than I (at least) would have - * expected. Per hwloc, Linux memory binding is at the thread, - * and not process, level. 
Thus, hwloc sets the "thisproc" flag - * to "false" on all Linux systems, and uses the "thisthread" flag - * to indicate binding capability - don't warn if the user didn't - * specifically request binding - */ - if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind - && PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { - if (PRTE_HWLOC_BASE_MBFA_WARN == prte_hwloc_base_mbfa && !membind_warned) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported", true, - node->name); - membind_warned = true; - } else if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", - true, node->name); - return PRTE_ERR_SILENT; - } - } - } - - /* some systems do not report cores, and so we can get a situation where our - * default binding policy will fail for no necessary reason. So if we are - * computing a binding due to our default policy, and no cores are found - * on this node, just silently skip it - we will not bind - */ - if (!PRTE_BINDING_POLICY_IS_SET(jdata->map->binding) - && HWLOC_TYPE_DEPTH_UNKNOWN - == hwloc_get_type_depth(node->topology->topo, HWLOC_OBJ_CORE)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "Unable to bind-to core by default on node %s as no cores detected", - node->name); - continue; - } - - /* determine the relative depth on this node */ -#if HWLOC_API_VERSION < 0x20000 - if (HWLOC_OBJ_CACHE == hwb) { - /* must use a unique function because blasted hwloc - * just doesn't deal with caches very well...sigh - */ - bind_depth = hwloc_get_cache_type_depth(node->topology->topo, clvl, - (hwloc_obj_cache_type_t) -1); - } else -#endif - bind_depth = hwloc_get_type_depth(node->topology->topo, hwb); -#if HWLOC_API_VERSION < 0x20000 - if (0 > bind_depth) -#else - if (0 > bind_depth && HWLOC_TYPE_DEPTH_NUMANODE != bind_depth) -#endif - { - /* didn't find such an object */ - 
pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:no-objects", true, - hwloc_obj_type_string(hwb), node->name); - return PRTE_ERR_SILENT; - } - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, "%s bind_depth: %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), bind_depth); - if (PRTE_SUCCESS != (rc = bind_generic(jdata, node, bind_depth))) { + if (1 < options->cpus_per_rank) { + rc = bind_multiple(jdata, proc, node, obj, options); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); - return rc; } + return rc; } - return PRTE_SUCCESS; + rc = bind_generic(jdata, proc, node, obj, options); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + } + + return rc; } diff --git a/src/mca/rmaps/base/rmaps_base_frame.c b/src/mca/rmaps/base/rmaps_base_frame.c index 6633d17c9a..b5e9e7ba6a 100644 --- a/src/mca/rmaps/base/rmaps_base_frame.c +++ b/src/mca/rmaps/base/rmaps_base_frame.c @@ -58,7 +58,9 @@ prte_rmaps_base_t prte_rmaps_base = { .device = NULL, .inherit = false, .hwthread_cpus = false, - .file = NULL + .file = NULL, + .available = NULL, + .baseset = NULL }; /* @@ -77,11 +79,11 @@ static int prte_rmaps_base_register(prte_mca_base_register_flag_t flags) (void) prte_mca_base_var_register( "prte", "rmaps", "default", "mapping_policy", "Default mapping Policy [slot | hwthread | core (default:np<=2) | l1cache | " - "l2cache | l3cache | numa (default:np>2) | package | node | seq | dist | ppr | rankfile]," + "l2cache | l3cache | numa (default:np>2) | package | node | seq | dist | ppr | " + "rankfile | pe-list=a,b (comma-delimited ranges of cpus to use for this job)]," " with supported colon-delimited modifiers: PE=y (for multiple cpus/proc), " "SPAN, OVERSUBSCRIBE, NOOVERSUBSCRIBE, NOLOCAL, HWTCPUS, CORECPUS, " - "DEVICE=dev (for dist policy), INHERIT, NOINHERIT, PE-LIST=a,b (comma-delimited " - "ranges of cpus to use for this job), FILE=%s (path to file containing sequential " + "DEVICE=dev (for dist policy), INHERIT, NOINHERIT, ORDERED, FILE=%s (path to file 
containing sequential " "or rankfile entries)", PRTE_MCA_BASE_VAR_TYPE_STRING, NULL, 0, PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9, PRTE_MCA_BASE_VAR_SCOPE_READONLY, &rmaps_base_mapping_policy); @@ -90,8 +92,7 @@ static int prte_rmaps_base_register(prte_mca_base_register_flag_t flags) rmaps_base_ranking_policy = NULL; (void) prte_mca_base_var_register( "prte", "rmaps", "default", "ranking_policy", - "Default ranking Policy [slot (default:np<=2) | hwthread | core | l1cache " - "| l2cache | l3cache | numa (default:np>2) | package | node], with modifier :SPAN or :FILL", + "Default ranking Policy [slot | node | span | fill]", PRTE_MCA_BASE_VAR_TYPE_STRING, NULL, 0, PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9, PRTE_MCA_BASE_VAR_SCOPE_READONLY, &rmaps_base_ranking_policy); @@ -115,6 +116,8 @@ static int prte_rmaps_base_close(void) PMIX_RELEASE(item); } PMIX_DESTRUCT(&prte_rmaps_base.selected_modules); + hwloc_bitmap_free(prte_rmaps_base.available); + hwloc_bitmap_free(prte_rmaps_base.baseset); return prte_mca_base_framework_components_close(&prte_rmaps_base_framework, NULL); } @@ -136,6 +139,8 @@ static int prte_rmaps_base_open(prte_mca_base_open_flag_t flags) if (NULL == prte_set_slots) { prte_set_slots = strdup("core"); } + prte_rmaps_base.available = hwloc_bitmap_alloc(); + prte_rmaps_base.baseset = hwloc_bitmap_alloc(); /* set the default mapping and ranking policies */ if (NULL != rmaps_base_mapping_policy) { @@ -165,7 +170,7 @@ PMIX_CLASS_INSTANCE(prte_rmaps_base_selected_module_t, pmix_list_item_t, NULL, N static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *tmp) { - char **ck2, *ptr, *temp_parm, *temp_token, *parm_delimiter; + char **ck2, *ptr; int i; uint16_t u16; bool inherit_given = false; @@ -220,8 +225,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t "mapping policy", ck2[i]); return PRTE_ERR_SILENT; } - prte_set_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, PRTE_ATTR_GLOBAL, NULL, - 
PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); } else if (0 == strcasecmp(ck2[i], "DISPLAYDEVEL")) { if (NULL == jdata) { @@ -293,41 +298,16 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t "mapping policy", ck2[i]); return PRTE_ERR_SILENT; } - prte_set_attribute(&jdata->attributes, PRTE_JOB_XML_OUTPUT, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_XML_OUTPUT, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); - } else if (0 == strncasecmp(ck2[i], "PE-LIST=", 8)) { + } else if (0 == strcasecmp(ck2[i], "ORDERED")) { if (NULL == jdata) { pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true, "mapping policy", ck2[i]); return PRTE_ERR_SILENT; } - ptr = &ck2[i][8]; - /* Verify the option parmeter is a list of numeric tokens */ - temp_parm = strdup(ptr); - temp_token = strtok(temp_parm, ","); - while (NULL != temp_token) { - u16 = strtol(temp_token, &parm_delimiter, 10); - if ('\0' != *parm_delimiter) { - pmix_show_help("help-prte-rmaps-base.txt", "invalid-value", true, - "mapping policy", "PE", ck2[i]); - pmix_argv_free(ck2); - free(temp_parm); - return PRTE_ERR_SILENT; - } - temp_token = strtok(NULL, ","); - } - free(temp_parm); - /* quick check - if it matches the default, then don't set it */ - if (NULL != prte_hwloc_default_cpu_list) { - if (0 != strcmp(prte_hwloc_default_cpu_list, ptr)) { - prte_set_attribute(&jdata->attributes, PRTE_JOB_CPUSET, PRTE_ATTR_GLOBAL, ptr, - PMIX_STRING); - } - } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_CPUSET, PRTE_ATTR_GLOBAL, ptr, - PMIX_STRING); - } + PRTE_SET_MAPPING_DIRECTIVE(*tmp, PRTE_MAPPING_ORDERED); } else if (0 == strncasecmp(ck2[i], "PE=", 3)) { if (NULL == jdata) { @@ -344,8 +324,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t pmix_argv_free(ck2); return PRTE_ERR_SILENT; } - prte_set_attribute(&jdata->attributes, 
PRTE_JOB_PES_PER_PROC, PRTE_ATTR_GLOBAL, &u16, - PMIX_UINT16); + prte_set_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, PRTE_ATTR_GLOBAL, + &u16, PMIX_UINT16); } else if (0 == strcasecmp(ck2[i], "INHERIT")) { if (noinherit_given) { @@ -358,8 +338,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t if (NULL == jdata) { prte_rmaps_base.inherit = true; } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_INHERIT, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_INHERIT, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); } inherit_given = true; @@ -374,8 +354,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t if (NULL == jdata) { prte_rmaps_base.inherit = false; } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_NOINHERIT, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_NOINHERIT, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); } noinherit_given = true; @@ -404,8 +384,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t if (NULL == jdata) { prte_rmaps_base.hwthread_cpus = true; } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); } hwthread_cpus_given = true; @@ -419,8 +399,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t if (NULL == jdata) { prte_rmaps_base.hwthread_cpus = false; } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, + PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); } core_cpus_given = true; @@ -435,8 +415,8 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t if (NULL == jdata) { prte_rmaps_base.file = strdup(&ck2[i][5]); } else { - 
prte_set_attribute(&jdata->attributes, PRTE_JOB_FILE, PRTE_ATTR_GLOBAL, &ck2[i][5], - PMIX_STRING); + prte_set_attribute(&jdata->attributes, PRTE_JOB_FILE, PRTE_ATTR_GLOBAL, + &ck2[i][5], PMIX_STRING); } } else { @@ -450,7 +430,7 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t } int prte_rmaps_base_set_default_mapping(prte_job_t *jdata, - prte_schizo_options_t *options) + prte_rmaps_options_t *options) { /* default based on number of procs */ if (options->nprocs <= 2) { @@ -503,12 +483,15 @@ int prte_rmaps_base_set_mapping_policy(prte_job_t *jdata, char *inspec) size_t len; char *spec = NULL; bool ppr = false; + char *temp_parm, *temp_token, *parm_delimiter; + uint16_t u16; /* set defaults */ tmp = 0; prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "%s rmaps:base set policy with %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + "%s rmaps:base set policy with %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (NULL == inspec) ? "NULL" : inspec); if (NULL == inspec) { @@ -665,6 +648,34 @@ int prte_rmaps_base_set_mapping_policy(prte_job_t *jdata, char *inspec) return PRTE_ERR_SILENT; } PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYDIST); + } else if (0 == strncasecmp(spec, "PE-LIST=", 8)) { + if (NULL == jdata) { + pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-policy", true, + "mapping policy", spec); + free(spec); + return PRTE_ERR_SILENT; + } + ptr = strchr(spec, '='); // cannot be NULL as we checked for it + ptr++; // move past the equal sign + /* Verify the list is composed of numeric tokens */ + temp_parm = strdup(ptr); + temp_token = strtok(temp_parm, ","); + while (NULL != temp_token) { + u16 = strtol(temp_token, &parm_delimiter, 10); + if ('\0' != *parm_delimiter) { + pmix_show_help("help-prte-rmaps-base.txt", "invalid-value", true, + "mapping policy", "PE-LIST", ptr); + free(spec); + free(temp_parm); + return PRTE_ERR_SILENT; + } + temp_token = strtok(NULL, ","); + } + free(temp_parm); + 
prte_set_attribute(&jdata->attributes, PRTE_JOB_CPUSET, PRTE_ATTR_GLOBAL, + ptr, PMIX_STRING); + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_PELIST); + PRTE_SET_MAPPING_DIRECTIVE(tmp, PRTE_MAPPING_GIVEN); } else { pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "mapping", spec); @@ -692,7 +703,7 @@ int prte_rmaps_base_set_mapping_policy(prte_job_t *jdata, char *inspec) } int prte_rmaps_base_set_default_ranking(prte_job_t *jdata, - prte_schizo_options_t *options) + prte_rmaps_options_t *options) { int rc; rc = prte_rmaps_base_set_ranking_policy(jdata, NULL); @@ -701,117 +712,35 @@ int prte_rmaps_base_set_default_ranking(prte_job_t *jdata, int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec) { - prte_mapping_policy_t map, mapping; prte_ranking_policy_t tmp; - char **ck; size_t len; /* set default */ tmp = 0; if (NULL == spec) { - if (NULL != jdata) { - if (NULL == jdata->map) { - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - mapping = jdata->map->mapping; - } else { - mapping = prte_rmaps_base.mapping; - } - /* check for map-by object directives - we set the - * ranking to match if one was given - */ - if (PRTE_MAPPING_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - map = PRTE_GET_MAPPING_POLICY(mapping); - switch (map) { - case PRTE_MAPPING_BYSLOT: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); - break; - case PRTE_MAPPING_BYNODE: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NODE); - break; - case PRTE_MAPPING_BYCORE: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_CORE); - break; - case PRTE_MAPPING_BYL1CACHE: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L1CACHE); - break; - case PRTE_MAPPING_BYL2CACHE: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L2CACHE); - break; - case PRTE_MAPPING_BYL3CACHE: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L3CACHE); - break; - case PRTE_MAPPING_BYNUMA: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NUMA); - break; - case PRTE_MAPPING_BYPACKAGE: - 
PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_PACKAGE); - break; - case PRTE_MAPPING_BYHWTHREAD: - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_HWTHREAD); - break; - case PRTE_MAPPING_PPR: - // do not set the policy for PPR - we will set it in - // the ppr mapper - break; - default: - /* anything not tied to a specific hw obj can rank by slot */ - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); - break; - } - } else { - /* if no map-by was given, default to by-slot */ + /* if mapping by-node, then default to rank-by node */ + if (PRTE_MAPPING_BYNODE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NODE); + } else if (PRTE_MAPPING_PPR != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + /* default to by-slot */ PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); } } else { - ck = pmix_argv_split(spec, ':'); - if (2 < pmix_argv_count(ck)) { - /* incorrect format */ - pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "ranking", - spec); - pmix_argv_free(ck); - return PRTE_ERR_SILENT; - } - if (2 == pmix_argv_count(ck)) { - if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) { - PRTE_SET_RANKING_DIRECTIVE(tmp, PRTE_RANKING_SPAN); - } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) { - PRTE_SET_RANKING_DIRECTIVE(tmp, PRTE_RANKING_FILL); - } else { - /* unrecognized modifier */ - pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-modifier", true, ck[1]); - pmix_argv_free(ck); - return PRTE_ERR_SILENT; - } - } - len = strlen(ck[0]); - if (0 == strncasecmp(ck[0], "slot", len)) { + len = strlen(spec); + if (0 == strncasecmp(spec, "slot", len)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); - } else if (0 == strncasecmp(ck[0], "node", len)) { + } else if (0 == strncasecmp(spec, "node", len)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NODE); - } else if (0 == strncasecmp(ck[0], "hwthread", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_HWTHREAD); - } else if (0 == strncasecmp(ck[0], "core", 
len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_CORE); - } else if (0 == strncasecmp(ck[0], "l1cache", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L1CACHE); - } else if (0 == strncasecmp(ck[0], "l2cache", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L2CACHE); - } else if (0 == strncasecmp(ck[0], "l3cache", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_L3CACHE); - } else if (0 == strncasecmp(ck[0], "numa", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NUMA); - } else if (0 == strncasecmp(ck[0], "package", len)) { - PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_PACKAGE); + } else if (0 == strncasecmp(spec, "fill", len)) { + PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_FILL); + } else if (0 == strncasecmp(spec, "span", len)) { + PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SPAN); } else { pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, - "ranking", ck[0]); - pmix_argv_free(ck); + "ranking", spec); return PRTE_ERR_SILENT; } - pmix_argv_free(ck); PRTE_SET_RANKING_DIRECTIVE(tmp, PRTE_RANKING_GIVEN); } diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index a5642a58f8..16b6164141 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -31,6 +31,7 @@ #include #include "src/hwloc/hwloc-internal.h" +#include "src/pmix/pmix-internal.h" #include "src/mca/base/base.h" #include "src/mca/mca.h" #include "src/util/pmix_argv.h" @@ -50,7 +51,8 @@ static int map_colocate(prte_job_t *jdata, bool daemons, bool pernode, pmix_data_array_t *darray, - uint16_t procs_per_target); + uint16_t procs_per_target, + prte_rmaps_options_t *options); void prte_rmaps_base_map_job(int fd, short args, void *cbdata) { @@ -62,24 +64,25 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) pmix_proc_t *pptr; int rc = PRTE_SUCCESS; bool did_map, pernode = false, perpackage = false, pernuma = false; + bool pelist = false; prte_rmaps_base_selected_module_t *mod; 
prte_job_t *parent = NULL, *target_jdata; - pmix_rank_t nprocs; prte_app_context_t *app, *daemon_app; bool inherit = false; pmix_proc_t *nptr, *target_proc; - char *tmp, *p; + char *tmp, *p, **t2; uint16_t u16 = 0, procs_per_target = 0; - uint16_t *u16ptr = &u16, cpus_per_rank; - bool use_hwthreads = false, colocate_daemons = false; + uint16_t *u16ptr = &u16; + bool colocate_daemons = false; bool colocate = false; size_t ncolocated; bool sequential = false; - int32_t slots; + int32_t slots, npelist; hwloc_obj_t obj = NULL; prte_schizo_base_module_t *schizo; - prte_schizo_options_t options; + prte_rmaps_options_t options; pmix_data_array_t *darray = NULL; + prte_binding_policy_t bind; PRTE_HIDE_UNUSED_PARAMS(fd, args); @@ -92,7 +95,21 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); goto cleanup; } + if (NULL == jdata->map) { + jdata->map = PMIX_NEW(prte_job_map_t); + } jdata->state = PRTE_JOB_STATE_MAP; + memset(&options, 0, sizeof(prte_rmaps_options_t)); + options.stream = prte_rmaps_base_framework.framework_output; + options.verbosity = 5; // usual value for base-level functions + if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { + options.donotlaunch = true; + } + if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) || + prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) || + prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { + options.dobind = true; + } prte_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps: mapping job %s", @@ -196,21 +213,6 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } if (colocate || colocate_daemons) { - if (procs_per_target == 0) { - prte_output(0, "Error: COLOCATION REQUESTED WITH ZERO PROCS/TARGET\n"); - jdata->exit_code = PRTE_ERR_BAD_PARAM; - PRTE_ERROR_LOG(jdata->exit_code); - 
PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - rc = map_colocate(jdata, colocate_daemons, pernode, darray, procs_per_target); - PMIX_DATA_ARRAY_FREE(darray); - if (PRTE_SUCCESS != rc) { - jdata->exit_code = PRTE_ERR_BAD_PARAM; - PRTE_ERROR_LOG(jdata->exit_code); - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } goto ranking; } @@ -245,10 +247,6 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) inherit = true; } - if (NULL == jdata->map) { - jdata->map = PMIX_NEW(prte_job_map_t); - } - if (inherit) { if (NULL != parent) { /* if not already assigned, inherit the parent's ppr */ @@ -284,11 +282,14 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } } } else { - /* inherit the base defaults */ - if (prte_rmaps_base.hwthread_cpus) { - prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); - } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); + if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL) + && !prte_get_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, NULL, PMIX_BOOL)) { + /* inherit the base defaults */ + if (prte_rmaps_base.hwthread_cpus) { + prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); + } else { + prte_set_attribute(&jdata->attributes, PRTE_JOB_CORE_CPUS, PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); + } } } } @@ -304,111 +305,25 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } } - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PPR, (void **) &tmp, PMIX_STRING)) { - if (NULL != strcasestr(tmp, "node")) { - pernode = true; - /* get the ppn */ - if (NULL == (p = strchr(tmp, ':'))) { - /* should never happen */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - jdata->exit_code = PRTE_ERR_BAD_PARAM; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - ++p; // step 
over the colon - u16 = strtoul(p, NULL, 10); - } else if (NULL != strcasestr(tmp, "package")) { - perpackage = true; - /* get the ppn */ - if (NULL == (p = strchr(tmp, ':'))) { - /* should never happen */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - jdata->exit_code = PRTE_ERR_BAD_PARAM; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - ++p; // step over the colon - u16 = strtoul(p, NULL, 10); - } else if (NULL != strcasestr(tmp, "numa")) { - pernuma = true; - /* get the ppn */ - if (NULL == (p = strchr(tmp, ':'))) { - /* should never happen */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - jdata->exit_code = PRTE_ERR_BAD_PARAM; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - ++p; // step over the colon - u16 = strtoul(p, NULL, 10); - } - free(tmp); - } else if (PRTE_MAPPING_SEQ == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - sequential = true; - } else if (PRTE_MAPPING_BYUSER == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - /* defer */ - nprocs = 10; // number doesn't matter as long as it is > 2 - goto compute; - } - - /* estimate the number of procs for assigning default mapping/ranking policies */ - nprocs = 0; - for (int i = 0; i < jdata->apps->size; i++) { - if (NULL != (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { - if (0 == app->num_procs) { - pmix_list_t nodes; - PMIX_CONSTRUCT(&nodes, pmix_list_t); - prte_rmaps_base_get_target_nodes(&nodes, &slots, app, PRTE_MAPPING_BYNODE, true, - true); - if (pernode) { - slots = u16 * pmix_list_get_size(&nodes); - } else if (perpackage) { - /* add in #packages for each node */ - PMIX_LIST_FOREACH(node, &nodes, prte_node_t) - { - slots += u16 * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - HWLOC_OBJ_PACKAGE, 0); - } - } else if (pernuma) { - /* add in #NUMA for each node */ - PMIX_LIST_FOREACH(node, &nodes, prte_node_t) - { - slots += u16 * prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, - 
HWLOC_OBJ_NUMANODE, 0); - } - } else if (sequential) { - slots = pmix_list_get_size(&nodes); - } - PMIX_LIST_DESTRUCT(&nodes); - } else { - slots = app->num_procs; - } - nprocs += slots; - } - } - -compute: /* set some convenience params */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, - PMIX_UINT16)) { - cpus_per_rank = u16; + prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void**)&options.cpuset, PMIX_STRING); + if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, PMIX_UINT16)) { + options.cpus_per_rank = u16; } else { - cpus_per_rank = 1; + options.cpus_per_rank = 1; } if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthreads = true; + options.use_hwthreads = true; + } + if (!(PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { + options.oversubscribe = true; } prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: setting mapping policies for job %s nprocs %d inherit %s hwtcpus %s", - PRTE_JOBID_PRINT(jdata->nspace), (int) nprocs, - inherit ? "TRUE" : "FALSE", use_hwthreads ? "TRUE" : "FALSE"); - - options.nprocs = nprocs; - options.cpus_per_rank = cpus_per_rank; - options.use_hwthreads = use_hwthreads; - options.stream = prte_rmaps_base_framework.framework_output; - options.verbosity = 5; // usual value for base-level functions + "mca:rmaps: setting mapping policies for job %s inherit %s hwtcpus %s", + PRTE_JOBID_PRINT(jdata->nspace), + inherit ? "TRUE" : "FALSE", + options.use_hwthreads ? 
"TRUE" : "FALSE"); /* set the default mapping policy IFF it wasn't provided */ if (!PRTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { @@ -465,36 +380,113 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } ranking: - /* set the default ranking policy IFF it wasn't provided */ - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - did_map = false; - if (inherit) { - if (NULL != parent) { - jdata->map->ranking = parent->map->ranking; - did_map = true; - } else if (PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(prte_rmaps_base.ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps ranking given by MCA param"); - jdata->map->ranking = prte_rmaps_base.ranking; - did_map = true; - } - } - if (!did_map) { - // let the job's personality set the default ranking behavior - if (NULL != schizo->set_default_ranking) { - rc = schizo->set_default_ranking(jdata, &options); - } else { - rc = prte_rmaps_base_set_default_ranking(jdata, &options); + options.map = PRTE_GET_MAPPING_POLICY(jdata->map->mapping); + if (PRTE_MAPPING_SPAN & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + options.mapspan = true; + } + if (PRTE_MAPPING_ORDERED & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + options.ordered = true; + } + + switch (options.map) { + case PRTE_MAPPING_BYNODE: + case PRTE_MAPPING_BYSLOT: + case PRTE_MAPPING_BYDIST: + case PRTE_MAPPING_PPR: + case PRTE_MAPPING_PELIST: + options.mapdepth = PRTE_BIND_TO_NONE; + options.maptype = HWLOC_OBJ_MACHINE; + break; + case PRTE_MAPPING_BYUSER: + case PRTE_MAPPING_SEQ: + options.mapdepth = PRTE_BIND_TO_NONE; + options.userranked = true; + options.maptype = HWLOC_OBJ_MACHINE; + break; + case PRTE_MAPPING_BYNUMA: + options.mapdepth = PRTE_BIND_TO_NUMA; + options.maptype = HWLOC_OBJ_NUMANODE; + break; + case PRTE_MAPPING_BYPACKAGE: + options.mapdepth = PRTE_BIND_TO_PACKAGE; + options.maptype = HWLOC_OBJ_PACKAGE; + break; + case PRTE_MAPPING_BYL3CACHE: + options.mapdepth = 
PRTE_BIND_TO_L3CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(3, options.maptype, options.cmaplvl); + break; + case PRTE_MAPPING_BYL2CACHE: + options.mapdepth = PRTE_BIND_TO_L2CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(2, options.maptype, options.cmaplvl); + break; + case PRTE_MAPPING_BYL1CACHE: + options.mapdepth = PRTE_BIND_TO_L1CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(1, options.maptype, options.cmaplvl); + break; + case PRTE_MAPPING_BYCORE: + options.mapdepth = PRTE_BIND_TO_CORE; + options.maptype = HWLOC_OBJ_CORE; + break; + case PRTE_MAPPING_BYHWTHREAD: + options.mapdepth = PRTE_BIND_TO_HWTHREAD; + options.maptype = HWLOC_OBJ_PU; + break; + default: + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + jdata->exit_code = PRTE_ERR_BAD_PARAM; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + + if (options.userranked) { + /* must rank by user */ + PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANKING_BYUSER); + } else { + /* set the default ranking policy IFF it wasn't provided */ + if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { + did_map = false; + if (inherit) { + if (NULL != parent) { + jdata->map->ranking = parent->map->ranking; + did_map = true; + } else if (PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(prte_rmaps_base.ranking)) { + prte_output_verbose(5, prte_rmaps_base_framework.framework_output, + "mca:rmaps ranking given by MCA param"); + jdata->map->ranking = prte_rmaps_base.ranking; + did_map = true; + } } - if (PRTE_SUCCESS != rc) { - // the error message should have been printed - jdata->exit_code = rc; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; + if (!did_map) { + // let the job's personality set the default ranking behavior + if (NULL != schizo->set_default_ranking) { + rc = schizo->set_default_ranking(jdata, &options); + } else { + rc = prte_rmaps_base_set_default_ranking(jdata, &options); + } + if (PRTE_SUCCESS != rc) { + // the error message should have been printed + jdata->exit_code = rc; + 
PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } } } } - + options.rank = PRTE_GET_RANKING_POLICY(jdata->map->ranking); + /* if we are ranking by FILL or SPAN, then we must map by an object */ + if (PRTE_RANK_BY_SPAN == options.rank || + PRTE_RANK_BY_FILL == options.rank && + PRTE_MAPPING_PPR != options.map) { + if (options.map < PRTE_MAPPING_BYNUMA || + options.map > PRTE_MAPPING_BYHWTHREAD) { + pmix_show_help("help-prte-rmaps-base.txt", "must-map-by-obj", + true, prte_rmaps_base_print_mapping(options.map), + prte_rmaps_base_print_ranking(options.rank)); + jdata->exit_code = PRTE_ERR_SILENT; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + } /* define the binding policy for this job - if the user specified one * already (e.g., during the call to comm_spawn), then we don't * override it */ @@ -529,12 +521,86 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } } } + options.overload = PRTE_BIND_OVERLOAD_ALLOWED(jdata->map->binding); + options.bind = PRTE_GET_BINDING_POLICY(jdata->map->binding); + /* sanity check */ + if (options.mapdepth > options.bind) { + /* we cannot bind to objects higher in the + * topology than where we mapped */ + pmix_show_help("help-prte-hwloc-base.txt", "bind-upwards", true, + prte_rmaps_base_print_mapping(options.map), + prte_hwloc_base_print_binding(options.bind)); + jdata->exit_code = rc; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + switch (options.bind) { + case PRTE_BIND_TO_NONE: + options.hwb = HWLOC_OBJ_MACHINE; + break; + case PRTE_BIND_TO_PACKAGE: + options.hwb = HWLOC_OBJ_PACKAGE; + break; + case PRTE_BIND_TO_NUMA: + options.hwb = HWLOC_OBJ_NUMANODE; + break; + case PRTE_BIND_TO_L3CACHE: + PRTE_HWLOC_MAKE_OBJ_CACHE(3, options.hwb, options.clvl); + break; + case PRTE_BIND_TO_L2CACHE: + PRTE_HWLOC_MAKE_OBJ_CACHE(2, options.hwb, options.clvl); + break; + case PRTE_BIND_TO_L1CACHE: + PRTE_HWLOC_MAKE_OBJ_CACHE(1, 
options.hwb, options.clvl); + break; + case PRTE_BIND_TO_CORE: + options.hwb = HWLOC_OBJ_CORE; + break; + case PRTE_BIND_TO_HWTHREAD: + options.hwb = HWLOC_OBJ_PU; + break; + default: + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + jdata->exit_code = PRTE_ERR_BAD_PARAM; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + if (1 < options.cpus_per_rank || + NULL != options.job_cpuset || + options.ordered) { + /* REQUIRES binding to cpu */ + if (PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { + if (PRTE_BIND_TO_CORE != options.bind && + PRTE_BIND_TO_HWTHREAD != options.bind) { + pmix_show_help("help-prte-rmaps-base.txt", "unsupported-combination", true, + "binding", prte_hwloc_base_print_binding(options.bind)); + PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); + jdata->exit_code = PRTE_ERR_BAD_PARAM; + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + /* ensure the cpu usage setting matches the provided bind directive */ + if (PRTE_BIND_TO_HWTHREAD == options.bind) { + options.use_hwthreads = true; + } else { + options.use_hwthreads = false; + } + } else { + if (options.use_hwthreads) { + PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_HWTHREAD); + options.bind = PRTE_BIND_TO_HWTHREAD; + } else { + PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_CORE); + options.bind = PRTE_BIND_TO_CORE; + } + } + } /* if we are not going to launch, then we need to set any * undefined topologies to match our own so the mapper * can operate */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { + if (options.donotlaunch) { prte_topology_t *t0; if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, 0))) { PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); @@ -555,10 +621,22 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } if (colocate_daemons || colocate) { - /* This is a mapping request for a tool daemon which is handled above - * so don't run any mapping 
modules */ - did_map = true; - rc = PMIX_SUCCESS; + /* This is a colocation request, so we don't run any mapping modules */ + if (procs_per_target == 0) { + prte_output(0, "Error: COLOCATION REQUESTED WITH ZERO PROCS/TARGET\n"); + jdata->exit_code = PRTE_ERR_BAD_PARAM; + PRTE_ERROR_LOG(jdata->exit_code); + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } + rc = map_colocate(jdata, colocate_daemons, pernode, darray, procs_per_target, &options); + PMIX_DATA_ARRAY_FREE(darray); + if (PRTE_SUCCESS != rc) { + jdata->exit_code = PRTE_ERR_BAD_PARAM; + PRTE_ERROR_LOG(jdata->exit_code); + PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); + goto cleanup; + } } else { /* cycle thru the available mappers until one agrees to map * the job @@ -572,8 +650,8 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) } PMIX_LIST_FOREACH(mod, &prte_rmaps_base.selected_modules, prte_rmaps_base_selected_module_t) { - if (PRTE_SUCCESS == (rc = mod->module->map_job(jdata)) - || PRTE_ERR_RESOURCE_BUSY == rc) { + if (PRTE_SUCCESS == (rc = mod->module->map_job(jdata, &options)) || + PRTE_ERR_RESOURCE_BUSY == rc) { did_map = true; break; } @@ -609,45 +687,7 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) goto cleanup; } - /* if any node is oversubscribed, then check to see if a binding - * directive was given - if not, then we want to clear the default - * binding policy so we don't attempt to bind */ - if (PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED)) { - if (!PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { - /* clear any default binding policy we might have set */ - PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NONE); - } - } - - /* compute the ranks and add the proc objects - * to the jdata->procs array */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_compute_vpids(jdata))) { - PRTE_ERROR_LOG(rc); - jdata->exit_code = rc; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; 
- } - moveon: - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL) || - prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, NULL, PMIX_BOOL) || - prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL) || - prte_get_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { /* compute and save local ranks */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_compute_local_ranks(jdata))) { - PRTE_ERROR_LOG(rc); - jdata->exit_code = rc; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - /* compute and save bindings */ - if (PRTE_SUCCESS != (rc = prte_rmaps_base_compute_bindings(jdata))) { - PRTE_ERROR_LOG(rc); - jdata->exit_code = rc; - PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_MAP_FAILED); - goto cleanup; - } - } - /* set the offset so shared memory components can potentially * connect to any spawned jobs */ @@ -678,13 +718,30 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); } } - + if (NULL != options.job_cpuset) { + free(options.job_cpuset); + } /* cleanup */ PMIX_RELEASE(caddy); } +static void lkcbfunc(pmix_status_t status, void *cbdata) +{ + pmix_byte_object_t *bo = (pmix_byte_object_t*)cbdata; + + /* nothing to do here - we use this solely to + * ensure that IOF_deliver doesn't block */ + if (PMIX_SUCCESS != status) { + PMIX_ERROR_LOG(status); + } + PMIX_BYTE_OBJECT_FREE(bo, 1); +} + void prte_rmaps_base_display_map(prte_job_t *jdata) { + pmix_byte_object_t *bo; + pmix_status_t rc; + /* ignore daemon job */ char *output = NULL; @@ -693,15 +750,21 @@ void prte_rmaps_base_display_map(prte_job_t *jdata) return; } - prte_map_print(&output, jdata); - prte_output(prte_clean_output, "%s\n", output); - free(output); + PMIX_BYTE_OBJECT_CREATE(bo, 1); + prte_map_print(&bo->bytes, jdata); + bo->size = strlen(bo->bytes); + rc = PMIx_server_IOF_deliver(&prte_process_info.myproc, 
PMIX_FWD_STDOUT_CHANNEL, bo, NULL, 0, lkcbfunc, (void*)bo); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_BYTE_OBJECT_FREE(bo, 1); + } } static int map_colocate(prte_job_t *jdata, bool daemons, bool pernode, pmix_data_array_t *darray, - uint16_t procs_per_target) + uint16_t procs_per_target, + prte_rmaps_options_t *options) { char *tmp; pmix_status_t rc; @@ -712,9 +775,8 @@ static int map_colocate(prte_job_t *jdata, prte_app_context_t *target_app, *app; int i, j, ret, cnt; pmix_list_t targets; - hwloc_obj_t obj = NULL; prte_proc_t *proc; - prte_node_t *node, *nptr; + prte_node_t *node, *nptr, *n2; if (4 < prte_output_get_verbosity(prte_rmaps_base_framework.framework_output)) { rc = PMIx_Data_print(&tmp, NULL, darray, PMIX_DATA_ARRAY); @@ -732,10 +794,6 @@ static int map_colocate(prte_job_t *jdata, } procs = (pmix_proc_t*)darray->array; nprocs = darray->size; - - if (NULL == jdata->map) { // Just in case - jdata->map = PMIX_NEW(prte_job_map_t); - } map = jdata->map; if (daemons) { /* daemons are never bound and always rank by-slot */ @@ -743,10 +801,6 @@ static int map_colocate(prte_job_t *jdata, PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); } jdata->num_procs = 0; - /* mark that this job is to be fully - * described in the launch msg */ - prte_set_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, - PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); /* create a list of the target nodes */ PMIX_CONSTRUCT(&targets, pmix_list_t); @@ -797,12 +851,11 @@ static int map_colocate(prte_job_t *jdata, if (pernode) { /* cycle across the target nodes and place the specified * number of procs on each one */ - PMIX_LIST_FOREACH(nptr, &targets, prte_node_t) { + PMIX_LIST_FOREACH_SAFE(nptr, n2, &targets, prte_node_t) { // Map the node to this job - note we already set the "mapped" flag PMIX_RETAIN(nptr); pmix_pointer_array_add(map->nodes, nptr); map->num_nodes += 1; - obj = hwloc_get_root_obj(nptr->topology->topo); // Assign N procs per node for each app_context for 
(i=0; i < jdata->apps->size; i++) { app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, i); @@ -810,7 +863,7 @@ static int map_colocate(prte_job_t *jdata, continue; } // is there room on this node? daemons don't count - if (!daemons && nptr->slots < (nptr->slots_inuse + procs_per_target)) { + if (!daemons && !prte_rmaps_base_check_avail(jdata, app, nptr, &targets, NULL, options)) { if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(map->mapping)) { pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, app->num_procs, app->app, prte_process_info.nodename); @@ -822,15 +875,12 @@ static int map_colocate(prte_job_t *jdata, PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); } for (j = 0; j < procs_per_target; ++j) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, nptr, app->idx))) { + if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, app->idx, nptr, NULL, options))) { ret = PRTE_ERR_OUT_OF_RESOURCE; goto done; } jdata->num_procs += 1; app->num_procs += 1; - // we are REQUIRED to set the locale - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - PRTE_ATTR_LOCAL, obj, PMIX_POINTER); } } } @@ -839,7 +889,7 @@ static int map_colocate(prte_job_t *jdata, } /* handle the case of colocate by process */ - PMIX_LIST_FOREACH(nptr, &targets, prte_node_t) { + PMIX_LIST_FOREACH_SAFE(nptr, n2, &targets, prte_node_t) { // count the number of target procs on this node cnt = 0; for (i=0; i < nptr->procs->size; i++) { @@ -862,13 +912,12 @@ static int map_colocate(prte_job_t *jdata, PMIX_RETAIN(nptr); pmix_pointer_array_add(map->nodes, nptr); map->num_nodes += 1; - obj = hwloc_get_root_obj(nptr->topology->topo); cnt = cnt * procs_per_target; // total number of procs to place on this node // Assign cnt procs for each app_context for (i=0; i < jdata->apps->size; i++) { app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, i); // is there room on this node? 
daemons don't count - if (!daemons && nptr->slots < (nptr->slots_inuse + cnt)) { + if (!daemons && !prte_rmaps_base_check_avail(jdata, app, nptr, &targets, NULL, options)) { if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(map->mapping)) { pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, app->num_procs, app->app, prte_process_info.nodename); @@ -880,15 +929,12 @@ static int map_colocate(prte_job_t *jdata, PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); } for (j = 0; j < cnt; ++j) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, nptr, i))) { + if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, i, nptr, NULL, options))) { ret = PRTE_ERR_OUT_OF_RESOURCE; goto done; } jdata->num_procs += 1; app->num_procs += 1; - // we are REQUIRED to set the locale - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - PRTE_ATTR_LOCAL, obj, PMIX_POINTER); } } } diff --git a/src/mca/rmaps/base/rmaps_base_print_fns.c b/src/mca/rmaps/base/rmaps_base_print_fns.c index 8c8f200198..71eb254894 100644 --- a/src/mca/rmaps/base/rmaps_base_print_fns.c +++ b/src/mca/rmaps/base/rmaps_base_print_fns.c @@ -104,12 +104,9 @@ static prte_rmaps_print_buffers_t *get_print_buffer(void) char *prte_rmaps_base_print_mapping(prte_mapping_policy_t mapping) { char *ret, *map, *mymap, *tmp; + char **qls = NULL; prte_rmaps_print_buffers_t *ptr; - if (PRTE_MAPPING_CONFLICTED & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - return "CONFLICTED"; - } - ptr = get_print_buffer(); if (NULL == ptr) { PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); @@ -157,40 +154,39 @@ char *prte_rmaps_base_print_mapping(prte_mapping_policy_t mapping) case PRTE_MAPPING_BYDIST: map = "MINDIST"; break; + case PRTE_MAPPING_PELIST: + map = "PE-LIST"; + break; + case PRTE_MAPPING_PPR: + map = "PPR"; + break; default: - if (PRTE_MAPPING_PPR == PRTE_GET_MAPPING_POLICY(mapping)) { - map = "PPR"; - } else { - map = "UNKNOWN"; - } - } - if (0 != strcmp(map, "PPR") && (PRTE_MAPPING_PPR == 
PRTE_GET_MAPPING_POLICY(mapping))) { - pmix_asprintf(&mymap, "%s[PPR]:", map); - } else { - pmix_asprintf(&mymap, "%s:", map); + map = "UNKNOWN"; } + pmix_asprintf(&mymap, "%s:", map); if (PRTE_MAPPING_NO_USE_LOCAL & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - pmix_asprintf(&tmp, "%sNO_USE_LOCAL,", mymap); - free(mymap); - mymap = tmp; + pmix_argv_append_nosize(&qls, "NO_USE_LOCAL"); } if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - pmix_asprintf(&tmp, "%sNOOVERSUBSCRIBE,", mymap); - free(mymap); - mymap = tmp; + pmix_argv_append_nosize(&qls, "NOOVERSUBSCRIBE"); } else if (PRTE_MAPPING_SUBSCRIBE_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - pmix_asprintf(&tmp, "%sOVERSUBSCRIBE,", mymap); - free(mymap); - mymap = tmp; + pmix_argv_append_nosize(&qls, "OVERSUBSCRIBE"); } if (PRTE_MAPPING_SPAN & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { - pmix_asprintf(&tmp, "%sSPAN,", mymap); - free(mymap); - mymap = tmp; + pmix_argv_append_nosize(&qls, "SPAN"); + } + if (PRTE_MAPPING_ORDERED & PRTE_GET_MAPPING_DIRECTIVE(mapping)) { + pmix_argv_append_nosize(&qls, "ORDERED"); } - /* remove the trailing mark */ - mymap[strlen(mymap) - 1] = '\0'; + if (NULL != qls) { + tmp = pmix_argv_join(qls, ':'); + pmix_argv_free(qls); + pmix_asprintf(&mymap, "%s:%s", map, tmp); + free(tmp); + } else { + mymap = strdup(map); + } snprintf(ptr->buffers[ptr->cntr], PRTE_RMAPS_PRINT_MAX_SIZE, "%s", mymap); free(mymap); @@ -202,20 +198,28 @@ char *prte_rmaps_base_print_mapping(prte_mapping_policy_t mapping) char *prte_rmaps_base_print_ranking(prte_ranking_policy_t ranking) { + char *ret; + switch (PRTE_GET_RANKING_POLICY(ranking)) { case PRTE_RANK_BY_NODE: - return "NODE"; - case PRTE_RANK_BY_PACKAGE: - return "PACKAGE"; - case PRTE_RANK_BY_NUMA: - return "NUMA"; - case PRTE_RANK_BY_CORE: - return "CORE"; - case PRTE_RANK_BY_HWTHREAD: - return "HWTHREAD"; + ret = "NODE"; + break; case PRTE_RANK_BY_SLOT: - return "SLOT"; + ret = "SLOT"; + break; + case PRTE_RANK_BY_FILL: + ret = 
"FILL"; + break; + case PRTE_RANK_BY_SPAN: + ret = "SPAN"; + break; + case PRTE_RANKING_BYUSER: + ret = "BYUSER"; + break; default: - return "UNKNOWN"; + ret = "UNKNOWN"; + break; } + + return ret; } diff --git a/src/mca/rmaps/base/rmaps_base_ranking.c b/src/mca/rmaps/base/rmaps_base_ranking.c index 492a5fc654..9d89ec331b 100644 --- a/src/mca/rmaps/base/rmaps_base_ranking.c +++ b/src/mca/rmaps/base/rmaps_base_ranking.c @@ -33,17 +33,10 @@ #include "src/class/pmix_pointer_array.h" #include "src/hwloc/hwloc-internal.h" -#include "src/mca/base/base.h" -#include "src/mca/mca.h" -#include "src/threads/pmix_tsd.h" -#include "src/util/pmix_if.h" #include "src/util/output.h" #include "src/mca/errmgr/errmgr.h" -#include "src/mca/ess/ess.h" #include "src/runtime/prte_globals.h" -#include "src/util/dash_host/dash_host.h" -#include "src/util/hostfile/hostfile.h" #include "src/util/name_fns.h" #include "src/util/pmix_show_help.h" #include "types.h" @@ -51,931 +44,231 @@ #include "src/mca/rmaps/base/base.h" #include "src/mca/rmaps/base/rmaps_private.h" -static int assign_proc(prte_job_t *jdata, - prte_proc_t *proc, - pmix_rank_t vpid) +int prte_rmaps_base_compute_vpids(prte_job_t *jdata, + prte_app_context_t *app, + prte_rmaps_options_t *options) { + int m, n; + unsigned k, nobjs, pass; + prte_node_t *node; + prte_proc_t *proc; int rc; - prte_proc_t *pptr; - - /* tie proc to its job */ - proc->job = jdata; - proc->name.rank = vpid; - proc->rank = vpid; - /* insert the proc into the jdata array */ - pptr = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, proc->name.rank); - if (NULL != pptr) { - PMIX_RELEASE(pptr); - } - PMIX_RETAIN(proc); - rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - } - return rc; -} - -static int rank_span(prte_job_t *jdata, hwloc_obj_type_t target, - unsigned cache_level, bool matched) -{ - prte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, m, n, rc; - 
pmix_rank_t num_ranked = 0; - prte_node_t *node; - prte_proc_t *proc, *pptr; - pmix_rank_t vpid; - int delta; - hwloc_obj_t locale; - bool first; - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: for job %s", PRTE_JOBID_PRINT(jdata->nspace)); + pmix_rank_t rank, lrank, apprank; - /* if the ranking is spanned, then we perform the - * ranking as if it was one big node - i.e., we - * rank one proc on each object, step to the next object - * moving across all the nodes, then wrap around to the - * first object on the first node. - * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 4 1 5 2 6 3 7 - * 8 12 9 13 10 14 11 15 - */ - - if (matched) { - /* if mapping and ranking are a matched pair, then we know that - * the procs were entered in order in their respective node array. - * We can use that to simplify the ranking procedure */ + if (options->userranked) { + /* ranking has already been done */ + return PRTE_SUCCESS; + } - /* compute the total number of objects in the mapped nodes */ - delta = 0; - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); + /* if we are ranking by SLOT, then we simply go thru + * each node and rank all thr procs from this app + * in the order in which they are in the node's + * proc array - this is the order in which they + * were assigned */ + if (PRTE_RANK_BY_SLOT == options->rank) { + rank = options->last_rank; + apprank = 0; + for (n=0; n < jdata->map->nodes->size; n++) { + node = (prte_node_t*)pmix_pointer_array_get_item(jdata->map->nodes, n); if (NULL == node) { continue; } - /* get the number of objects - only consider those we can actually use */ - delta += prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - } - - /* cycle across the apps as they were mapped in order */ - for (n = 0; n < jdata->apps->size; n++) { - if (NULL == (app = (prte_app_context_t *) 
pmix_pointer_array_get_item(jdata->apps, n))) { - continue; - } - first = true; - i = 0; - /* cycle across the nodes looking for procs from that app */ - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL == node) { + lrank = 0; + for (m=0; m < node->procs->size; m++) { + proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); + if (NULL == proc) { continue; } - vpid = i; - /* cycle thru the procs on this node */ - for (j = 0; j < node->procs->size; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span skipping proc %s - from " - "another job, num_ranked %d", - PRTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* tie proc to its job */ - proc->job = jdata; - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: assigning vpid %s", - PRTE_VPID_PRINT(vpid)); - rc = assign_proc(jdata, proc, vpid); - vpid += delta; - if (first) { - app->first_rank = proc->name.rank; - first = false; - } + if (!PMIX_CHECK_NSPACE(jdata->nspace, proc->name.nspace)) { + continue; } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - ++i; + if (app->idx != proc->app_idx) { + continue; + } + proc->name.rank = rank; + proc->local_rank = lrank; + proc->app_rank = apprank; + PMIX_RETAIN(proc); + rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(proc); + return rc; + } + ++rank; + ++lrank; + ++apprank; } } + /* save the starting place for the next app */ + options->last_rank = rank; return PRTE_SUCCESS; } - 
/* If mapping and ranking are NOT matched, then things get more complex. - * In the interest of getting this committed in finite time, - * just loop across the nodes and objects until all procs - * are mapped. Fortunately, this case is RARE. - */ - - vpid = 0; - for (n = 0; n < jdata->apps->size; n++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n))) { - continue; - } - - first = true; - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); + /* if we are ranking by NODE, then we use the number of nodes + * used by this app (which is stored in the "options" struct) + * and increment the rank for each proc on each node by that */ + if (PRTE_RANK_BY_NODE == options->rank) { + apprank = 0; + for (n=0; n < jdata->map->nodes->size; n++) { + node = (prte_node_t*)pmix_pointer_array_get_item(jdata->map->nodes, n); if (NULL == node) { continue; } - /* get the number of objects - only consider those we can actually use */ - num_objs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: found %d objects on node %s with %d procs", num_objs, - node->name, (int) node->num_procs); - if (0 == num_objs) { - return PRTE_ERR_NOT_SUPPORTED; - } - - /* for each object */ - for (i = 0; i < num_objs; i++) { - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, i); - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: working object %d", i); - - /* cycle thru the procs on this node */ - for (j = 0; j < node->procs->size; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span skipping proc %s - from " - "another job, num_ranked %d", - PRTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already assigned */ - if (PMIX_RANK_INVALID != proc->name.rank) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* protect against bozo case */ - locale = NULL; - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - (void **) &locale, PMIX_POINTER) || - NULL == locale) { - /* all mappers are _required_ to set the locale where the proc - * has been mapped - it is therefore an error for this attribute - * not to be set. Likewise, only a programming error could allow - * the attribute to be set to a NULL value - however, we add that - * conditional here to silence any compiler warnings */ - PRTE_ERROR_LOG(PRTE_ERROR); - return PRTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: proc at position %d is not on object %d", - j, i); - continue; - } - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: assigning vpid %s", - PRTE_VPID_PRINT(vpid)); - rc = assign_proc(jdata, proc, vpid); - ++vpid; - if (first) { - app->first_rank = proc->name.rank; - first = false; - } - - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; + rank = n + options->last_rank; + lrank = 0; + for (m=0; m < node->procs->size; m++) { + proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); + if (NULL == proc) { + continue; + } + if (!PMIX_CHECK_NSPACE(jdata->nspace, proc->name.nspace)) { + continue; } + if (app->idx != proc->app_idx) { + continue; + } + proc->name.rank = rank; + proc->local_rank = lrank; + proc->app_rank = 
apprank; + PMIX_RETAIN(proc); + rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(proc); + return rc; + } + rank += options->nnodes; + ++lrank; + ++apprank; } } + /* save the starting place for the next app */ + options->last_rank = rank; + return PRTE_SUCCESS; } - return PRTE_SUCCESS; -} - -static int rank_fill(prte_job_t *jdata, - hwloc_obj_type_t target, - unsigned cache_level) -{ - prte_app_context_t *app; - hwloc_obj_t obj; - int num_objs, i, j, m, n, rc; - pmix_rank_t num_ranked = 0; - prte_node_t *node; - prte_proc_t *proc, *pptr; - pmix_rank_t vpid; - int cnt; - hwloc_obj_t locale; - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: for job %s", PRTE_JOBID_PRINT(jdata->nspace)); - - /* if the ranking is fill, then we rank all the procs - * within a given object before moving on to the next - * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 1 4 5 8 9 12 13 - * 2 3 6 7 10 11 14 15 - */ - - vpid = 0; - for (n = 0; n < jdata->apps->size; n++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n))) { - continue; - } - - cnt = 0; - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); + /* if we are ranking FILL, we rank all procs on a given + * object on each node prior to moving to the next object + * on that node */ + if (PRTE_RANK_BY_FILL == options->rank) { + rank = options->last_rank; + apprank = 0; + for (n=0; n < jdata->map->nodes->size; n++) { + node = (prte_node_t*)pmix_pointer_array_get_item(jdata->map->nodes, n); if (NULL == node) { continue; } - /* get the number of objects - only consider those we can actually use */ - num_objs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: found %d objects on node %s with %d 
procs", - num_objs, node->name, (int) node->num_procs); - if (0 == num_objs) { - return PRTE_ERR_NOT_SUPPORTED; - } - - /* for each object */ - for (i = 0; i < num_objs && cnt < app->num_procs; i++) { - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, i); - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: working object %d", i); - - /* cycle thru the procs on this node */ - for (j = 0; j < node->procs->size && cnt < app->num_procs; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); + lrank = 0; + nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); + for (k=0; k < nobjs; k++) { + obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, + options->maptype, options->cmaplvl, k); + for (m=0; m < node->procs->size; m++) { + proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); if (NULL == proc) { continue; } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill skipping proc %s - from another " - "job, num_ranked %d", - PRTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* tie proc to its job */ - proc->job = jdata; - /* ignore procs that are already assigned */ - if (PMIX_RANK_INVALID != proc->name.rank) { + if (!PMIX_CHECK_NSPACE(jdata->nspace, proc->name.nspace)) { continue; } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { + if (app->idx != proc->app_idx) { continue; } - /* protect against bozo case */ - locale = NULL; - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - (void **) &locale, PMIX_POINTER) || - NULL == locale) { - /* all mappers are _required_ to set the locale where the proc - * has been mapped - it is therefore an error for this attribute - * not to be set. 
Likewise, only a programming error could allow - * the attribute to be set to a NULL value - however, we add that - * conditional here to silence any compiler warnings */ - PRTE_ERROR_LOG(PRTE_ERROR); - return PRTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - prte_output_verbose( - 5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: proc at position %d is not on object %d", j, i); + if (obj != proc->obj) { continue; } - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: assigning vpid %s", - PRTE_VPID_PRINT(vpid)); - proc->name.rank = vpid; - proc->rank = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.rank; - } - cnt++; - - /* insert the proc into the jdata array */ - pptr = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, - proc->name.rank); - if (NULL != pptr) { - PMIX_RELEASE(pptr); - } + /* this proc is on this object, so rank it */ + proc->name.rank = rank; + proc->local_rank = lrank; + proc->app_rank = apprank; PMIX_RETAIN(proc); rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(proc); return rc; } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; + rank++; + lrank++; + apprank++; } } } - - /* Are all the procs ranked? 
we don't want to crash on INVALID ranks */ - if (cnt < app->num_procs) { - return PRTE_ERR_FAILED_TO_MAP; - } - } - - return PRTE_SUCCESS; -} - -static int rank_by(prte_job_t *jdata, - hwloc_obj_type_t target, - unsigned cache_level, - bool matched) -{ - prte_app_context_t *app; - hwloc_obj_t obj; - int num_objs, i, j, m, n, rc, nn; - pmix_rank_t num_ranked = 0; - prte_node_t *node; - prte_proc_t *proc, *pptr; - pmix_rank_t vpid; - int cnt; - pmix_pointer_array_t objs; - hwloc_obj_t locale; - prte_app_idx_t napp; - bool noassign, first; - - if (PRTE_RANKING_SPAN & PRTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_span(jdata, target, cache_level, matched); - } else if (PRTE_RANKING_FILL & PRTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_fill(jdata, target, cache_level); // cannot be matched as mapping has no "fill" mode + /* save the starting place for the next app */ + options->last_rank = rank; + return PRTE_SUCCESS; } - /* if ranking is not spanned or filled, then we - * default to assign ranks sequentially across - * target objects within a node until that node - * is fully ranked, and then move on to the next - * node + /* if we are ranking SPAN, we rank round-robin across the + * all the objects on the nodes, treating all the objects as + * being part of one giant "super-node" * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 2 1 3 8 10 9 11 - * 4 6 5 7 12 14 13 15 - */ - - if (matched) { - // the procs were placed in object order on each node, so we - // can just cycle within each node and rank sequentially - vpid = 0; - for (n = 0; n < jdata->apps->size; n++) { - app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n); - if (NULL == app) { - continue; - } - first = true; - /* cycle across the nodes looking for procs from that app */ - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); + * Even though we are ranking by SPAN, we cannot 
assume that + * we mapped by span, and so we cannot assume that the procs + * are in the node's proc array in object order. Hence, we have + * to search for them even though that eats up time */ + if (PRTE_RANK_BY_SPAN == options->rank) { + apprank = 0; + rank = options->last_rank; + pass = 0; + while (apprank < app->num_procs) { + for (n=0; n < jdata->map->nodes->size && apprank < app->num_procs; n++) { + node = (prte_node_t*)pmix_pointer_array_get_item(jdata->map->nodes, n); if (NULL == node) { continue; } - /* cycle thru the procs on this node */ - for (j = 0; j < node->procs->size; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank skipping proc %s - from " - "another job", - PRTE_NAME_PRINT(&proc->name)); - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* tie proc to its job */ - proc->job = jdata; - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank assigning vpid %s", - PRTE_VPID_PRINT(vpid)); - rc = assign_proc(jdata, proc, vpid); - ++vpid; - if (first) { - app->first_rank = proc->name.rank; - first = false; - } - } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - } - return PRTE_SUCCESS; - } - - // if the mapping/ranking aren't matched, then do it the hard way - vpid = 0; - for (n = 0, napp = 0; napp < jdata->num_apps && n < jdata->apps->size; n++) { - app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n); - if (NULL == app) { - continue; - } - napp++; - /* setup the pointer array */ - PMIX_CONSTRUCT(&objs, pmix_pointer_array_t); - pmix_pointer_array_init(&objs, 2, INT_MAX, 2); - - cnt = 0; - for (m = 0, nn = 0; nn < jdata->map->num_nodes && m 
< jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL == node) { - continue; - } - nn++; - - /* get the number of objects - only consider those we can actually use */ - num_objs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: found %d objects on node %s with %d procs", - num_objs, node->name, (int) node->num_procs); - if (0 == num_objs) { - PMIX_DESTRUCT(&objs); - return PRTE_ERR_NOT_SUPPORTED; - } - /* collect all the objects */ - for (i = 0; i < num_objs; i++) { - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, i); - pmix_pointer_array_set_item(&objs, i, obj); - } - - /* cycle across the objects, assigning a proc to each one, - * until all procs have been assigned - unfortunately, since - * more than this job may be mapped onto a node, the number - * of procs on the node can't be used to tell us when we - * are done. Instead, we have to just keep going until all - * procs are ranked - which means we have to make one extra - * pass thru the loop. In addition, if we pass thru the entire - * loop without assigning anything then we are done - * - * Perhaps someday someone will come up with a more efficient - * algorithm, but this works for now. 
- */ - while (cnt < app->num_procs) { - noassign = true; - for (i = 0; i < num_objs && cnt < app->num_procs; i++) { - /* get the next object */ - obj = (hwloc_obj_t) pmix_pointer_array_get_item(&objs, i); - if (NULL == obj) { - break; - } - /* scan across the procs and find the first unassigned one that includes this - * object */ - for (j = 0; j < node->procs->size && cnt < app->num_procs; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); + nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); + lrank = pass * nobjs; + /* make a pass across all objects on this node */ + for (k=0; k < nobjs && apprank < app->num_procs; k++) { + /* get this object */ + obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, + options->maptype, options->cmaplvl, k); + /* find an unranked proc on this object */ + for (m=0; m < node->procs->size && apprank < app->num_procs; m++) { + proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, m); if (NULL == proc) { continue; } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another " - "job, num_ranked %d", - PRTE_NAME_PRINT(&proc->name), num_ranked); + if (!PMIX_CHECK_NSPACE(jdata->nspace, proc->name.nspace)) { continue; } - /* ignore procs that are already ranked */ - if (PMIX_RANK_INVALID != proc->name.rank) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - already " - "ranked, num_ranked %d", - PRTE_NAME_PRINT(&proc->name), num_ranked); + if (app->idx != proc->app_idx) { continue; } - /* ignore procs from other apps - we will get to them */ - if (proc->app_idx != app->idx) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another " - "app, num_ranked %d", - 
PRTE_NAME_PRINT(&proc->name), num_ranked); + if (obj != proc->obj) { continue; } - /* tie proc to its job */ - proc->job = jdata; - /* protect against bozo case */ - locale = NULL; - if (!prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - (void **) &locale, PMIX_POINTER) || NULL == locale) { - /* all mappers are _required_ to set the locale where the proc - * has been mapped - it is therefore an error for this attribute - * not to be set. Likewise, only a programming error could allow - * the attribute to be set to a NULL value - however, we add that - * conditional here to silence any compiler warnings */ - PRTE_ERROR_LOG(PRTE_ERROR); - return PRTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc at position %d is not on object %d", - j, i); - continue; - } - /* assign the vpid */ - proc->name.rank = vpid; - proc->rank = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.rank; - } - cnt++; - noassign = false; - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc in position %d is on object " - "%d assigned rank %s", - j, i, PRTE_VPID_PRINT(proc->name.rank)); - /* insert the proc into the jdata array */ - pptr = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, proc->name.rank); - if (NULL != pptr) { - PMIX_RELEASE(pptr); - } - PMIX_RETAIN(proc); - rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); - if (PRTE_SUCCESS != rc) { - PRTE_ERROR_LOG(rc); - PMIX_DESTRUCT(&objs); - return rc; + if (PMIX_RANK_INVALID == proc->name.rank) { + proc->name.rank = rank; + proc->app_rank = apprank; + proc->local_rank = lrank; + PMIX_RETAIN(proc); + rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(proc); + return rc; + } + ++rank; + ++apprank; + ++lrank; + break; } - 
num_ranked++; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; } } - if (noassign) { - break; - } - } - } - /* cleanup */ - PMIX_DESTRUCT(&objs); - - /* Are all the procs ranked? we don't want to crash on INVALID ranks */ - if (cnt < app->num_procs) { - return PRTE_ERR_FAILED_TO_MAP; - } - } - return PRTE_SUCCESS; -} - -int prte_rmaps_base_compute_vpids(prte_job_t *jdata) -{ - prte_job_map_t *map; - prte_app_context_t *app; - pmix_rank_t vpid, delta; - int j, m, n, cnt; - prte_node_t *node; - prte_proc_t *proc, *pptr; - int rc; - bool one_found; - hwloc_obj_type_t target; - unsigned cache_level; - prte_ranking_policy_t ranking; - prte_mapping_policy_t mapping; - bool map_span = false; - bool rank_span = false; - bool matched = false; - bool first; - - map = jdata->map; - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, "RANKING POLICY: %s", - prte_rmaps_base_print_ranking(map->ranking)); - - /* if the mapping policy matches the ranking policy, then we can - * simply compute across the nodes */ - ranking = PRTE_GET_RANKING_POLICY(map->ranking); - mapping = PRTE_GET_MAPPING_POLICY(map->mapping); - if (PRTE_RANKING_SPAN & PRTE_GET_RANKING_DIRECTIVE(map->ranking)) { - rank_span = true; - } - if (PRTE_MAPPING_SPAN & PRTE_GET_MAPPING_DIRECTIVE(map->mapping)) { - map_span = true; - } - if (ranking == mapping && rank_span == map_span) { - matched = true; - } - - /* start with the rank-by object options - if the object isn't - * included in the topology, then we obviously cannot rank by it. 
- * However, if this was the default ranking policy (as opposed to - * something given by the user), then fall back to rank-by slot - */ - if (PRTE_RANK_BY_PACKAGE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by package for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - if (PRTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PACKAGE, 0, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_NUMA == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by NUMA for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - if (PRTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NUMANODE, 0, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_L3CACHE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by L3cache for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - PRTE_HWLOC_MAKE_OBJ_CACHE(3, target, cache_level); - if (PRTE_SUCCESS != (rc = rank_by(jdata, target, cache_level, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_L2CACHE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by L2cache 
for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - PRTE_HWLOC_MAKE_OBJ_CACHE(2, target, cache_level); - if (PRTE_SUCCESS != (rc = rank_by(jdata, target, cache_level, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_L1CACHE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by L1cache for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - PRTE_HWLOC_MAKE_OBJ_CACHE(1, target, cache_level); - if (PRTE_SUCCESS != (rc = rank_by(jdata, target, cache_level, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_CORE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by core for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - if (PRTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_HWTHREAD == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps: computing ranks by hwthread for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - if (PRTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0, matched))) { - if (PRTE_ERR_NOT_SUPPORTED == rc - && !(PRTE_RANKING_GIVEN & PRTE_GET_RANKING_DIRECTIVE(map->ranking))) { - PRTE_SET_RANKING_POLICY(map->ranking, 
PRTE_RANK_BY_SLOT); - goto rankbyslot; - } - PRTE_ERROR_LOG(rc); - } - return rc; - } - - if (PRTE_RANK_BY_NODE == PRTE_GET_RANKING_POLICY(map->ranking)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by node for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - /* assign the ranks round-robin across nodes */ - for (n = 0; n < jdata->apps->size; n++) { - app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n); - if (NULL == app) { - continue; - } - first = true; - cnt = 0; - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL == node) { - continue; - } - vpid = cnt; - for (j = 0; j < node->procs->size; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - rc = assign_proc(jdata, proc, vpid); - vpid += jdata->map->num_nodes; - if (first) { - app->first_rank = proc->name.rank; - first = false; - } - } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - cnt++; } + ++pass; } + /* save the starting place for the next app */ + options->last_rank = rank; return PRTE_SUCCESS; } -rankbyslot: - if (PRTE_RANK_BY_SLOT == PRTE_GET_RANKING_POLICY(map->ranking)) { - /* assign the ranks sequentially */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by slot for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - /* if they mapped by core or by hwthread, then rank-by slot is a match */ - if (PRTE_MAPPING_BYHWTHREAD == mapping || PRTE_MAPPING_BYCORE == mapping) { - matched = true; - } - if (PRTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0, matched))) { - 
PRTE_ERROR_LOG(rc); - } - return rc; - } - + /* cannot be anything else */ return PRTE_ERR_NOT_IMPLEMENTED; } -int prte_rmaps_base_compute_local_ranks(prte_job_t *jdata) -{ - int32_t i; - int j, k; - prte_node_t *node; - prte_proc_t *proc, *psave, *psave2; - pmix_rank_t minv, minv2; - prte_local_rank_t local_rank; - prte_job_map_t *map; - prte_app_context_t *app; - - PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, - "%s rmaps:base:compute_usage", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); - - /* point to map */ - map = jdata->map; - - /* for each node in the map... */ - for (i = 0; i < map->nodes->size; i++) { - /* cycle through the array of procs on this node, setting - * local and node ranks, until we - * have done so for all procs on nodes in this map - */ - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(map->nodes, i))) { - continue; - } - - /* init search values */ - local_rank = 0; - - /* the proc map may have holes in it, so cycle - * all the way through and avoid the holes - */ - for (k = 0; k < node->procs->size; k++) { - /* if this proc is NULL, skip it */ - if (NULL == pmix_pointer_array_get_item(node->procs, k)) { - continue; - } - minv = PMIX_RANK_VALID; - minv2 = PMIX_RANK_VALID; - psave = NULL; - psave2 = NULL; - /* find the minimum vpid proc */ - for (j = 0; j < node->procs->size; j++) { - /* if this proc is NULL, skip it */ - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* only look at procs for this job when - * determining local rank - */ - if (PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace) - && PRTE_LOCAL_RANK_INVALID == proc->local_rank && proc->name.rank < minv) { - minv = proc->name.rank; - psave = proc; - } - /* no matter what job...still have to handle node_rank */ - if (PRTE_NODE_RANK_INVALID == proc->node_rank && proc->name.rank < minv2) { - minv2 = proc->name.rank; - psave2 = proc; - } - } - if (NULL == psave && NULL == psave2) { - /* we must 
have processed them all for this node! */ - break; - } - if (NULL != psave) { - psave->local_rank = local_rank; - ++local_rank; - } - if (NULL != psave2) { - psave2->node_rank = node->next_node_rank; - node->next_node_rank++; - } - } - } - - /* compute app_rank */ - for (i = 0; i < jdata->apps->size; i++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { - continue; - } - k = 0; - /* loop thru all procs in job to find those from this app_context */ - for (j = 0; j < jdata->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(jdata->procs, j))) { - continue; - } - if (proc->app_idx != app->idx) { - continue; - } - proc->app_rank = k++; - } - } - - return PRTE_SUCCESS; -} - /* when we restart a process on a different node, we have to * ensure that the node and local ranks assigned to the proc * don't overlap with any pre-existing proc on that node. If @@ -991,7 +284,8 @@ void prte_rmaps_base_update_local_ranks(prte_job_t *jdata, prte_node_t *oldnode, prte_proc_t *proc; PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, - "%s rmaps:base:update_usage", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); + "%s rmaps:base:update_local_ranks", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); /* if the node hasn't changed, then we can just use the * pre-defined values diff --git a/src/mca/rmaps/base/rmaps_base_support_fns.c b/src/mca/rmaps/base/rmaps_base_support_fns.c index 7edc42d3ef..46684e93ae 100644 --- a/src/mca/rmaps/base/rmaps_base_support_fns.c +++ b/src/mca/rmaps/base/rmaps_base_support_fns.c @@ -141,8 +141,10 @@ int prte_rmaps_base_filter_nodes(prte_app_context_t *app, pmix_list_t *nodes, bo /* * Query the registry for all nodes allocated to a specified app_context */ -int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *total_num_slots, - prte_app_context_t *app, prte_mapping_policy_t policy, +int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, + int32_t 
*total_num_slots, + prte_job_t *jdata, prte_app_context_t *app, + prte_mapping_policy_t policy, bool initial_map, bool silent) { pmix_list_item_t *item; @@ -154,7 +156,7 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota bool novm; pmix_list_t nodes; char *hosts = NULL; - + bool needhosts = false; /** set default answer */ *total_num_slots = 0; @@ -170,17 +172,16 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota * However, if it is a managed allocation AND the hostfile or the hostlist was * provided, those take precedence, so process them and filter as we normally do. */ - if (!prte_managed_allocation - || (prte_managed_allocation - && (prte_get_attribute(&app->attributes, PRTE_APP_DASH_HOST, (void **) &hosts, - PMIX_STRING) - || prte_get_attribute(&app->attributes, PRTE_APP_HOSTFILE, (void **) &hosts, - PMIX_STRING)))) { + if (prte_get_attribute(&app->attributes, PRTE_APP_DASH_HOST, (void **) &hosts, PMIX_STRING) || + prte_get_attribute(&app->attributes, PRTE_APP_HOSTFILE, (void **) &hosts, PMIX_STRING)) { + needhosts = true; + } + if (!prte_managed_allocation || + (prte_managed_allocation && needhosts)) { PMIX_CONSTRUCT(&nodes, pmix_list_t); /* if the app provided a dash-host, then use those nodes */ hosts = NULL; - if (prte_get_attribute(&app->attributes, PRTE_APP_DASH_HOST, (void **) &hosts, - PMIX_STRING)) { + if (prte_get_attribute(&app->attributes, PRTE_APP_DASH_HOST, (void **) &hosts, PMIX_STRING)) { PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, "%s using dash_host %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), hosts)); @@ -190,8 +191,7 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota return rc; } free(hosts); - } else if (prte_get_attribute(&app->attributes, PRTE_APP_HOSTFILE, (void **) &hosts, - PMIX_STRING)) { + } else if (prte_get_attribute(&app->attributes, PRTE_APP_HOSTFILE, (void **) &hosts, PMIX_STRING)) { /* otherwise, if the app 
provided a hostfile, then use that */ PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, "%s using hostfile %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), @@ -230,8 +230,8 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota PMIX_LIST_FOREACH_SAFE(nptr, next, &nodes, prte_node_t) { for (i = 0; i < prte_node_pool->size; i++) { - if (NULL - == (node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, i))) { + node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, i); + if (NULL == node) { continue; } /* ignore nodes that are non-usable */ @@ -362,8 +362,9 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota */ PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); } - if (NULL == nd || NULL == nd->daemon || NULL == node->daemon - || nd->daemon->name.rank < node->daemon->name.rank) { + if (NULL == nd || NULL == nd->daemon || + NULL == node->daemon || + nd->daemon->name.rank < node->daemon->name.rank) { /* just append to end */ pmix_list_append(allocated_nodes, &node->super); nd = node; @@ -423,25 +424,36 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota * the allocation */ if (PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { num_slots = INT32_MAX; - if (!prte_hnp_is_allocated - || (PRTE_GET_MAPPING_DIRECTIVE(policy) & PRTE_MAPPING_NO_USE_LOCAL)) { - PMIX_LIST_FOREACH_SAFE(node, next, allocated_nodes, prte_node_t) - { - if (0 == node->index) { + PMIX_LIST_FOREACH_SAFE(node, next, allocated_nodes, prte_node_t) + { + if (0 == node->index) { + if (!prte_hnp_is_allocated || + (PRTE_GET_MAPPING_DIRECTIVE(policy) & PRTE_MAPPING_NO_USE_LOCAL)) { pmix_list_remove_item(allocated_nodes, &node->super); PMIX_RELEASE(node); /* "un-retain" it */ - break; + continue; } } + if (NULL == node->topology || NULL == node->topology->topo) { + /* cannot use this node - should never happen */ + pmix_list_remove_item(allocated_nodes, &node->super); + PMIX_RELEASE(node); + } } } 
else { num_slots = 0; PMIX_LIST_FOREACH_SAFE(node, next, allocated_nodes, prte_node_t) { + if (NULL == node->topology || NULL == node->topology->topo) { + /* cannot use this node - should never happen */ + pmix_list_remove_item(allocated_nodes, &node->super); + PMIX_RELEASE(node); + continue; + } /* if the hnp was not allocated, or flagged not to be used, * then remove it here */ - if (!prte_hnp_is_allocated - || (PRTE_GET_MAPPING_DIRECTIVE(policy) & PRTE_MAPPING_NO_USE_LOCAL)) { + if (!prte_hnp_is_allocated || + (PRTE_GET_MAPPING_DIRECTIVE(policy) & PRTE_MAPPING_NO_USE_LOCAL)) { if (0 == node->index) { pmix_list_remove_item(allocated_nodes, &node->super); PMIX_RELEASE(node); /* "un-retain" it */ @@ -458,8 +470,8 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota PMIX_RELEASE(node); /* "un-retain" it */ continue; } - if (node->slots <= node->slots_inuse - && (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(policy))) { + if (node->slots <= node->slots_inuse && + (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(policy))) { /* remove the node as fully used */ PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, "%s Removing node %s slots %d inuse %d", @@ -519,6 +531,9 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota /* pass back the total number of available slots */ *total_num_slots = num_slots; + /* check for prior bookmark */ + prte_rmaps_base_get_starting_point(allocated_nodes, jdata); + if (4 < prte_output_get_verbosity(prte_rmaps_base_framework.framework_output)) { prte_output(0, "AVAILABLE NODES FOR MAPPING:"); for (item = pmix_list_get_first(allocated_nodes); @@ -533,7 +548,11 @@ int prte_rmaps_base_get_target_nodes(pmix_list_t *allocated_nodes, int32_t *tota return PRTE_SUCCESS; } -prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, prte_node_t *node, prte_app_idx_t idx) +prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, + prte_app_idx_t 
idx, + prte_node_t *node, + hwloc_obj_t obj, + prte_rmaps_options_t *options) { prte_proc_t *proc; int rc; @@ -560,19 +579,36 @@ prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, prte_node_t *node, pr proc->parent = node->daemon->name.rank; } - PMIX_RETAIN(node); /* maintain accounting on object */ + // point the proc at its node proc->node = node; - /* if this is a debugger job, then it doesn't count against + PMIX_RETAIN(node); /* maintain accounting on object */ + /* if this is a tool app, then it doesn't count against * available slots - otherwise, it does */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + if (PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + proc->local_rank = 0; + proc->node_rank = UINT16_MAX; + } else { + proc->node_rank = node->num_procs; node->num_procs++; ++node->slots_inuse; } - if (0 > (rc = pmix_pointer_array_add(node->procs, (void *) proc))) { + if (0 > (idx = pmix_pointer_array_add(node->procs, (void *) proc))) { PRTE_ERROR_LOG(rc); - PMIX_RELEASE(proc); + PMIX_RELEASE(proc); // releases node to maintain accounting + return NULL; + } + + /* point the proc to its locale */ + proc->obj = obj; + + /* bind the process so we know which cpus have been taken */ + rc = prte_rmaps_base_bind_proc(jdata, proc, node, obj, options); + if (PRTE_SUCCESS != rc) { + pmix_pointer_array_set_item(node->procs, idx, NULL); + PMIX_RELEASE(proc); // releases node to maintain accounting return NULL; } + /* retain the proc struct so that we correctly track its release */ PMIX_RETAIN(proc); @@ -582,7 +618,7 @@ prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, prte_node_t *node, pr /* * determine the proper starting point for the next mapping operation */ -prte_node_t *prte_rmaps_base_get_starting_point(pmix_list_t *node_list, prte_job_t *jdata) +void prte_rmaps_base_get_starting_point(pmix_list_t *node_list, prte_job_t *jdata) { pmix_list_item_t *item, *cur_node_item; prte_node_t *node, *nd1, *ndmin; @@ -664,9 +700,216 @@ prte_node_t 
*prte_rmaps_base_get_starting_point(pmix_list_t *node_list, prte_job } process: - PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, "%s Starting at node %s", + PRTE_OUTPUT_VERBOSE((5, prte_rmaps_base_framework.framework_output, + "%s Starting at node %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), ((prte_node_t *) cur_node_item)->name)); - return (prte_node_t *) cur_node_item; + /* put this node at the front of the list */ + pmix_list_remove_item(node_list, cur_node_item); + pmix_list_prepend(node_list, cur_node_item); + + return; +} + +bool prte_rmaps_base_check_avail(prte_job_t *jdata, + prte_app_context_t *app, + prte_node_t *node, + pmix_list_t *node_list, + hwloc_obj_t obj, + prte_rmaps_options_t *options) +{ + hwloc_obj_t root; + hwloc_cpuset_t available; + int nprocs; + bool avail = false; + + prte_output_verbose(10, prte_rmaps_base_framework.framework_output, + "%s get_avail_ncpus: node %s has %d procs on it", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), node->name, node->num_procs); + + if (PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + avail = true; + goto done; + } + + if (!options->oversubscribe) { + if (node->slots <= node->slots_inuse) { + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps: node %s is full - skipping", + node->name); + goto done; + } + } + if (0 != node->slots_max && + node->slots_max <= node->slots_inuse) { + /* cannot use this node - already at max_slots */ + pmix_list_remove_item(node_list, &node->super); + PMIX_RELEASE(node); + goto done; + } + + if (PRTE_BIND_TO_NONE == options->bind) { + options->target = NULL; + avail = true; + goto done; + } + +#if HWLOC_API_VERSION < 0x20000 + root = hwloc_get_root_obj(node->topology->topo); + if (NULL == options->job_cpuset) { + available = hwloc_bitmap_dup(root->allowed_cpuset); + } else { + available = hwloc_bitmap_alloc(); + hwloc_bitmap_and(available, root->allowed_cpuset, options->job_cpuset); + } + if (NULL != obj) { + hwloc_bitmap_and(available, 
available, obj->allowed_cpuset); + } +#else + if (NULL == options->job_cpuset) { + available = hwloc_bitmap_dup(hwloc_topology_get_allowed_cpuset(node->topology->topo)); + } else { + available = hwloc_bitmap_alloc(); + hwloc_bitmap_and(available, hwloc_topology_get_allowed_cpuset(node->topology->topo), options->job_cpuset); + } + if (NULL != obj) { + hwloc_bitmap_and(available, available, obj->cpuset); + } +#endif + if (options->use_hwthreads) { + options->ncpus = hwloc_bitmap_weight(available); + } else { + /* if we are treating cores as cpus, then we really + * want to know how many cores are in this object. + * hwloc sets a bit for each "pu", so we can't just + * count bits in this case as there may be more than + * one hwthread/core. Instead, find the number of cores + * under the object + */ + options->ncpus = hwloc_get_nbobjs_inside_cpuset_by_type(node->topology->topo, available, HWLOC_OBJ_CORE); + } + options->target = available; + + nprocs = options->ncpus / options->cpus_per_rank; + if (options->nprocs < nprocs) { + avail = true; + } else if (options->overload) { + /* doesn't matter how many cpus are in use */ + avail = true; + } else if (0 < nprocs) { + options->nprocs = nprocs; + avail = true; + } + +done: + /* add this node to the map - do it only once */ + if (avail && !PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { + PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); + PMIX_RETAIN(node); + pmix_pointer_array_add(jdata->map->nodes, node); + ++(jdata->map->num_nodes); + options->nnodes++; // track #nodes for this app + } + + return avail; +} + +void prte_rmaps_base_get_cpuset(prte_job_t *jdata, + prte_node_t *node, + prte_rmaps_options_t *options) +{ + if (NULL != options->cpuset) { + options->job_cpuset = prte_hwloc_base_generate_cpuset(node->topology->topo, + options->use_hwthreads, + options->cpuset); + } else { + options->job_cpuset = hwloc_bitmap_dup(node->available); + } +} + +int prte_rmaps_base_check_support(prte_job_t *jdata, + prte_node_t *node, + 
prte_rmaps_options_t *options) +{ + struct hwloc_topology_support *support; + + /* if we don't want to launch, then we are just testing the system, + * so ignore questions about support capabilities + */ + support = (struct hwloc_topology_support *) hwloc_topology_get_support(node->topology->topo); + /* check if topology supports cpubind - have to be careful here + * as Linux doesn't currently support thread-level binding. This + * may change in the future, though, and it isn't clear how hwloc + * interprets the current behavior. So check both flags to be sure. + */ + if (support->cpubind->set_thisproc_cpubind || + support->cpubind->set_thisthread_cpubind) { + if (PRTE_BINDING_REQUIRED(jdata->map->binding) && + PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { + /* we are required to bind but cannot */ + pmix_show_help("help-prte-rmaps-base.txt", "rmaps:cpubind-not-supported", + true, node->name); + return PRTE_ERR_SILENT; + } + } + /* check if topology supports membind - have to be careful here + * as hwloc treats this differently than I (at least) would have + * expected. Per hwloc, Linux memory binding is at the thread, + * and not process, level. 
Thus, hwloc sets the "thisproc" flag + * to "false" on all Linux systems, and uses the "thisthread" flag + * to indicate binding capability - don't warn if the user didn't + * specifically request binding + */ + if (!support->membind->set_thisproc_membind && + !support->membind->set_thisthread_membind && + PRTE_BINDING_POLICY_IS_SET(jdata->map->binding)) { + if (PRTE_HWLOC_BASE_MBFA_WARN == prte_hwloc_base_mbfa && !options->membind_warned) { + pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported", true, + node->name); + options->membind_warned = true; + } else if (PRTE_HWLOC_BASE_MBFA_ERROR == prte_hwloc_base_mbfa) { + pmix_show_help("help-prte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", + true, node->name); + return PRTE_ERR_SILENT; + } + } + return PRTE_SUCCESS; +} + +int prte_rmaps_base_check_oversubscribed(prte_job_t *jdata, + prte_app_context_t *app, + prte_node_t *node) +{ + /* not all nodes are equal, so only set oversubscribed for + * this node if it is in that state + */ + if (node->slots < (int) node->num_procs) { + /* flag the node as oversubscribed so that sched-yield gets + * properly set + */ + PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); + PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); + /* check for permission */ + if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { + /* if we weren't given a directive either way, then we will error out + * as the #slots were specifically given, either by the host RM or + * via hostfile/dash-host */ + if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN & + PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { + pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", + true, app->num_procs, app->app, prte_process_info.nodename); + PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); + return PRTE_ERR_SILENT; + } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE & + PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + /* if we were explicitly told not to oversubscribe, then don't */ 
+ pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", + true, app->num_procs, app->app, prte_process_info.nodename); + PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); + return PRTE_ERR_SILENT; + } + } + } + return PRTE_SUCCESS; } diff --git a/src/mca/rmaps/base/rmaps_private.h b/src/mca/rmaps/base/rmaps_private.h index 67d7fc09b5..cfad9591e6 100644 --- a/src/mca/rmaps/base/rmaps_private.h +++ b/src/mca/rmaps/base/rmaps_private.h @@ -32,8 +32,10 @@ #include "prte_config.h" #include "types.h" +#include "src/hwloc/hwloc-internal.h" #include "src/runtime/prte_globals.h" +#include "src/mca/schizo/schizo.h" #include "src/mca/rmaps/rmaps.h" BEGIN_C_DECLS @@ -45,28 +47,33 @@ BEGIN_C_DECLS /* LOCAL FUNCTIONS for use by RMAPS components */ PRTE_EXPORT int prte_rmaps_base_get_target_nodes(pmix_list_t *node_list, int32_t *total_num_slots, - prte_app_context_t *app, + prte_job_t *jdata, prte_app_context_t *app, prte_mapping_policy_t policy, bool initial_map, bool silent); -PRTE_EXPORT prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, prte_node_t *node, - prte_app_idx_t idx); +PRTE_EXPORT prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, + prte_app_idx_t idx, + prte_node_t *node, + hwloc_obj_t obj, + prte_rmaps_options_t *options); -PRTE_EXPORT prte_node_t *prte_rmaps_base_get_starting_point(pmix_list_t *node_list, - prte_job_t *jdata); +PRTE_EXPORT void prte_rmaps_base_get_starting_point(pmix_list_t *node_list, + prte_job_t *jdata); -PRTE_EXPORT int prte_rmaps_base_compute_vpids(prte_job_t *jdata); -PRTE_EXPORT int prte_rmaps_base_compute_local_ranks(prte_job_t *jdata); +PRTE_EXPORT int prte_rmaps_base_compute_vpids(prte_job_t *jdata, + prte_app_context_t *app, + prte_rmaps_options_t *options); -PRTE_EXPORT int prte_rmaps_base_compute_bindings(prte_job_t *jdata); +PRTE_EXPORT int prte_rmaps_base_bind_proc(prte_job_t *jdata, + prte_proc_t *proc, + prte_node_t *node, + hwloc_obj_t obj, + prte_rmaps_options_t *options); PRTE_EXPORT 
void prte_rmaps_base_update_local_ranks(prte_job_t *jdata, prte_node_t *oldnode, prte_node_t *newnode, prte_proc_t *newproc); -PRTE_EXPORT int prte_rmaps_base_rearrange_map(prte_app_context_t *app, prte_job_map_t *map, - pmix_list_t *procs); - END_C_DECLS #endif diff --git a/src/mca/rmaps/mindist/.prte_ignore b/src/mca/rmaps/mindist/.prte_ignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/mca/rmaps/mindist/mindist_sort.c b/src/mca/rmaps/mindist/mindist_sort.c new file mode 100644 index 0000000000..fed190ccd8 --- /dev/null +++ b/src/mca/rmaps/mindist/mindist_sort.c @@ -0,0 +1,183 @@ +// +// mindist_sort.c +// +// +// Created by Ralph Castain on 6/15/22. +// + +#include "mindist_sort.h" + +static int dist_cmp_fn(pmix_list_item_t **a, pmix_list_item_t **b) +{ + prte_rmaps_numa_node_t *aitem = *((prte_rmaps_numa_node_t **) a); + prte_rmaps_numa_node_t *bitem = *((prte_rmaps_numa_node_t **) b); + + if (aitem->dist_from_closed > bitem->dist_from_closed) { + return 1; + } else if (aitem->dist_from_closed == bitem->dist_from_closed) { + return 0; + } else { + return -1; + } +} + +static void sort_by_dist(hwloc_topology_t topo, char *device_name, pmix_list_t *sorted_list) +{ + hwloc_obj_t device_obj = NULL; + hwloc_obj_t obj = NULL; + struct hwloc_distances_s *distances; + prte_rmaps_numa_node_t *numa_node; + int close_node_index; + float latency; + unsigned int j; +#if HWLOC_API_VERSION < 0x20000 + hwloc_obj_t root = NULL; + int depth; + unsigned i; +#else + unsigned distances_nr = 0; +#endif + + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; + device_obj = hwloc_get_next_osdev(topo, device_obj)) { + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS + || device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { + if (!strcmp(device_obj->name, device_name)) { + /* find numa node containing this device */ + obj = device_obj->parent; +#if HWLOC_API_VERSION < 0x20000 + while ((obj != NULL) && (obj->type 
!= HWLOC_OBJ_NUMANODE)) { + obj = obj->parent; + } +#else + while (obj && !obj->memory_arity) { + obj = obj->parent; /* no memory child, walk up */ + } + if (obj != NULL) { + obj = obj->memory_first_child; + } +#endif + if (obj == NULL) { + prte_output_verbose( + 5, prte_hwloc_base_output, + "hwloc:base:get_sorted_numa_list: NUMA node closest to %s wasn't found.", + device_name); + return; + } else { + close_node_index = obj->logical_index; + } + + /* find distance matrix for all numa nodes */ +#if HWLOC_API_VERSION < 0x20000 + distances = (struct hwloc_distances_s *) + hwloc_get_whole_distance_matrix_by_type(topo, HWLOC_OBJ_NUMANODE); + if (NULL == distances) { + /* we can try to find distances under group object. This info can be there. */ + depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE); + if (HWLOC_TYPE_DEPTH_UNKNOWN == depth) { + prte_output_verbose(5, prte_hwloc_base_output, + "hwloc:base:get_sorted_numa_list: There is no " + "information about distances on the node."); + return; + } + root = hwloc_get_root_obj(topo); + for (i = 0; i < root->arity; i++) { + obj = root->children[i]; + if (obj->distances_count > 0) { + for (j = 0; j < obj->distances_count; j++) { + if (obj->distances[j]->relative_depth + 1 == (unsigned) depth) { + distances = obj->distances[j]; + break; + } + } + } + } + } + /* find all distances for our close node with logical index = close_node_index as + * close_node_index + nbobjs*j */ + if ((NULL == distances) || (0 == distances->nbobjs)) { + prte_output_verbose(5, prte_hwloc_base_output, + "hwloc:base:get_sorted_numa_list: There is no information " + "about distances on the node."); + return; + } + /* fill list of numa nodes */ + for (j = 0; j < distances->nbobjs; j++) { + latency = distances->latency[close_node_index + distances->nbobjs * j]; + numa_node = PMIX_NEW(prte_rmaps_numa_node_t); + numa_node->index = j; + numa_node->dist_from_closed = latency; + pmix_list_append(sorted_list, &numa_node->super); + } +#else + distances_nr 
= 1; + if (0 != hwloc_distances_get_by_type(topo, HWLOC_OBJ_NUMANODE, &distances_nr, + &distances, HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || + 0 == distances_nr) { + prte_output_verbose(5, prte_hwloc_base_output, + "hwloc:base:get_sorted_numa_list: There is no information " + "about distances on the node."); + return; + } + /* fill list of numa nodes */ + for (j = 0; j < distances->nbobjs; j++) { + latency = distances->values[close_node_index + distances->nbobjs * j]; + numa_node = PMIX_NEW(prte_rmaps_numa_node_t); + numa_node->index = j; + numa_node->dist_from_closed = latency; + pmix_list_append(sorted_list, &numa_node->super); + } + hwloc_distances_release(topo, distances); +#endif + /* sort numa nodes by distance from the closest one to PCI */ + pmix_list_sort(sorted_list, dist_cmp_fn); + return; + } + } + } +} + +static int find_devices(hwloc_topology_t topo, char **device_name) +{ + hwloc_obj_t device_obj = NULL; + int count = 0; + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; + device_obj = hwloc_get_next_osdev(topo, device_obj)) { + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { + count++; + free(*device_name); + *device_name = strdup(device_obj->name); + } + } + return count; +} + +int prte_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char *device_name, + pmix_list_t *sorted_list) +{ + bool free_device_name = false; + int count; + + /* firstly we check if we need to autodetect OpenFabrics devices or we have the + * specified one */ + if (!strcmp(device_name, "auto")) { + count = find_devices(topo, &device_name); + if (count > 1) { + free(device_name); + return count; + } + free_device_name = true; + } + if (!device_name) { + return PRTE_ERR_NOT_FOUND; + } else if (free_device_name && (0 == strlen(device_name))) { + free(device_name); + return PRTE_ERR_NOT_FOUND; + } + sort_by_dist(topo, device_name, sorted_list); + if (free_device_name) { + free(device_name); + } + return PRTE_SUCCESS; +} + diff 
--git a/src/mca/rmaps/mindist/mindist_sort.h b/src/mca/rmaps/mindist/mindist_sort.h new file mode 100644 index 0000000000..4993ea6df0 --- /dev/null +++ b/src/mca/rmaps/mindist/mindist_sort.h @@ -0,0 +1,13 @@ +// +// mindist_sort.h +// +// +// Created by Ralph Castain on 6/15/22. +// + +#ifndef mindist_sort_h +#define mindist_sort_h + +#include + +#endif /* mindist_sort_h */ diff --git a/src/mca/rmaps/mindist/rmaps_mindist_module.c b/src/mca/rmaps/mindist/rmaps_mindist_module.c index 497f8d9790..ea3231f5d9 100644 --- a/src/mca/rmaps/mindist/rmaps_mindist_module.c +++ b/src/mca/rmaps/mindist/rmaps_mindist_module.c @@ -213,9 +213,6 @@ static int mindist_map(prte_job_t *jdata) /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; - /* if a bookmark exists from some prior mapping, set us to start there */ - jdata->bookmark = prte_rmaps_base_get_starting_point(&node_list, jdata); - if (0 == app->num_procs) { /* set the num_procs to equal the number of slots on these mapped nodes */ app->num_procs = num_slots; diff --git a/src/mca/rmaps/ppr/rmaps_ppr.c b/src/mca/rmaps/ppr/rmaps_ppr.c index 74efe498ca..0839dd65c9 100644 --- a/src/mca/rmaps/ppr/rmaps_ppr.c +++ b/src/mca/rmaps/ppr/rmaps_ppr.c @@ -33,57 +33,28 @@ #include "src/mca/rmaps/base/base.h" #include "src/mca/rmaps/base/rmaps_private.h" -static int ppr_mapper(prte_job_t *jdata); -static int assign_locations(prte_job_t *jdata); +static int ppr_mapper(prte_job_t *jdata, + prte_rmaps_options_t *options); -prte_rmaps_base_module_t prte_rmaps_ppr_module = {.map_job = ppr_mapper, - .assign_locations = assign_locations}; +prte_rmaps_base_module_t prte_rmaps_ppr_module = { + .map_job = ppr_mapper +}; -/* RHC: will eventually remove this - * definition as it is no longer reqd - * in the rest of OMPI system. 
- * - * Define a hierarchical level value that - * helps resolve the hwloc behavior of - * treating caches as a single type of - * entity - must always be available - */ -typedef enum { - PRTE_HWLOC_NODE_LEVEL = 0, - PRTE_HWLOC_NUMA_LEVEL, - PRTE_HWLOC_PACKAGE_LEVEL, - PRTE_HWLOC_L3CACHE_LEVEL, - PRTE_HWLOC_L2CACHE_LEVEL, - PRTE_HWLOC_L1CACHE_LEVEL, - PRTE_HWLOC_CORE_LEVEL, - PRTE_HWLOC_HWTHREAD_LEVEL -} prte_hwloc_level_t; - -static void prune(pmix_nspace_t jobid, prte_app_idx_t app_idx, prte_node_t *node, - prte_hwloc_level_t *level, pmix_rank_t *nmapped); - -static int rmaps_ppr_global[PRTE_HWLOC_HWTHREAD_LEVEL + 1]; - -static int ppr_mapper(prte_job_t *jdata) +static int ppr_mapper(prte_job_t *jdata, + prte_rmaps_options_t *options) { - int rc = PRTE_SUCCESS, j, n; + int rc = PRTE_SUCCESS, j, n, ppr, idx; prte_proc_t *proc; prte_mca_base_component_t *c = &prte_rmaps_ppr_component.base_version; - prte_node_t *node; + prte_node_t *node, *nd; prte_app_context_t *app; pmix_rank_t total_procs, nprocs_mapped; - prte_hwloc_level_t start = PRTE_HWLOC_NODE_LEVEL; + prte_mapping_policy_t mapping = 0; + prte_ranking_policy_t ranking; hwloc_obj_t obj; - hwloc_obj_type_t lowest; - unsigned cache_level = 0; unsigned int nobjs, i, num_available; - ; - bool pruning_reqd = false; - prte_hwloc_level_t level; pmix_list_t node_list; - pmix_list_item_t *item; int32_t num_slots; - prte_app_idx_t idx; char **ppr_req, **ck, *jobppr = NULL; size_t len; bool initial_map = true; @@ -106,8 +77,8 @@ static int ppr_mapper(prte_job_t *jdata) return PRTE_ERR_TAKE_NEXT_OPTION; } - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_PPR, (void **) &jobppr, PMIX_STRING) - || NULL == jobppr || PRTE_MAPPING_PPR != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_PPR, (void **) &jobppr, PMIX_STRING) || + NULL == jobppr || PRTE_MAPPING_PPR != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* not for us */ prte_output_verbose(5, 
prte_rmaps_base_framework.framework_output, "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s", @@ -131,153 +102,96 @@ static int ppr_mapper(prte_job_t *jdata) } jdata->map->last_mapper = strdup(c->mca_component_name); - /* initialize */ - memset(rmaps_ppr_global, 0, (PRTE_HWLOC_HWTHREAD_LEVEL+1) * sizeof(prte_hwloc_level_t)); - - /* parse option */ - n = 0; - ppr_req = pmix_argv_split(jobppr, ','); - for (j = 0; NULL != ppr_req[j]; j++) { /* split on the colon */ - ck = pmix_argv_split(ppr_req[j], ':'); - if (2 != pmix_argv_count(ck)) { - /* must provide a specification */ - pmix_show_help("help-prte-rmaps-ppr.txt", "invalid-ppr", true, jobppr); - pmix_argv_free(ppr_req); - pmix_argv_free(ck); - free(jobppr); - return PRTE_ERR_SILENT; - } - len = strlen(ck[1]); - if (0 == strncasecmp(ck[1], "node", len)) { - rmaps_ppr_global[PRTE_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10); - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYNODE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_NODE); - } - start = PRTE_HWLOC_NODE_LEVEL; - n++; - } else if (0 == strncasecmp(ck[1], "hwthread", len) - || 0 == strncasecmp(ck[1], "thread", len)) { - rmaps_ppr_global[PRTE_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10); - start = PRTE_HWLOC_HWTHREAD_LEVEL; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYHWTHREAD); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_HWTHREAD); - } - n++; - } else if (0 == strncasecmp(ck[1], "core", len)) { - rmaps_ppr_global[PRTE_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_CORE_LEVEL) { - start = PRTE_HWLOC_CORE_LEVEL; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYCORE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_CORE); - } - } - n++; - } else if (0 == 
strncasecmp(ck[1], "package", len) || 0 == strncasecmp(ck[1], "skt", len)) { - rmaps_ppr_global[PRTE_HWLOC_PACKAGE_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_PACKAGE_LEVEL) { - start = PRTE_HWLOC_PACKAGE_LEVEL; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYPACKAGE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_PACKAGE); - } - } - n++; - } else if (0 == strncasecmp(ck[1], "numa", len) || 0 == strncasecmp(ck[1], "nm", len)) { - rmaps_ppr_global[PRTE_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_NUMA_LEVEL) { - start = PRTE_HWLOC_NUMA_LEVEL; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYNUMA); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_NUMA); - } - } - n++; - } else if (0 == strncasecmp(ck[1], "l1cache", len)) { - rmaps_ppr_global[PRTE_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_L1CACHE_LEVEL) { - start = PRTE_HWLOC_L1CACHE_LEVEL; - cache_level = 1; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYL1CACHE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_L1CACHE); - } - } - n++; - } else if (0 == strncasecmp(ck[1], "l2cache", len)) { - rmaps_ppr_global[PRTE_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_L2CACHE_LEVEL) { - start = PRTE_HWLOC_L2CACHE_LEVEL; - cache_level = 2; - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYL2CACHE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_L2CACHE); - } - } - n++; - } else if (0 == strncasecmp(ck[1], "l3cache", len)) { - rmaps_ppr_global[PRTE_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10); - if (start < PRTE_HWLOC_L3CACHE_LEVEL) { - start = PRTE_HWLOC_L3CACHE_LEVEL; - cache_level = 3; - 
PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYL3CACHE); - if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { - PRTE_SET_RANKING_POLICY(jdata->map->ranking, PRTE_RANK_BY_L3CACHE); - } - } - n++; - } else { - /* unknown spec */ - pmix_show_help("help-prte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], - jobppr); - pmix_argv_free(ppr_req); - pmix_argv_free(ck); - free(jobppr); - return PRTE_ERR_SILENT; - } + ck = pmix_argv_split(jobppr, ':'); + if (2 != pmix_argv_count(ck)) { + /* must provide a specification */ + pmix_show_help("help-prte-rmaps-ppr.txt", "invalid-ppr", true, jobppr); + pmix_argv_free(ck); + free(jobppr); + return PRTE_ERR_SILENT; + } + len = strlen(ck[1]); + ppr = strtol(ck[0], NULL, 10); + ranking = PRTE_RANK_BY_SLOT; + if (0 == strncasecmp(ck[1], "node", len)) { + mapping = PRTE_MAPPING_BYNODE; + ranking = PRTE_RANK_BY_NODE; + options->maptype = HWLOC_OBJ_MACHINE; + } else if (0 == strncasecmp(ck[1], "hwthread", len) || + 0 == strncasecmp(ck[1], "thread", len)) { + mapping = PRTE_MAPPING_BYHWTHREAD; + options->maptype = HWLOC_OBJ_PU; + } else if (0 == strncasecmp(ck[1], "core", len)) { + mapping = PRTE_MAPPING_BYCORE; + options->maptype = HWLOC_OBJ_CORE; + } else if (0 == strncasecmp(ck[1], "package", len) || 0 == strncasecmp(ck[1], "skt", len)) { + mapping = PRTE_MAPPING_BYPACKAGE; + options->maptype = HWLOC_OBJ_PACKAGE; + } else if (0 == strncasecmp(ck[1], "numa", len) || 0 == strncasecmp(ck[1], "nm", len)) { + mapping = PRTE_MAPPING_BYNUMA; + options->maptype = HWLOC_OBJ_NUMANODE; + } else if (0 == strncasecmp(ck[1], "l1cache", len)) { + mapping = PRTE_MAPPING_BYL1CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(1, options->maptype, options->cmaplvl); + } else if (0 == strncasecmp(ck[1], "l2cache", len)) { + mapping = PRTE_MAPPING_BYL2CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(2, options->maptype, options->cmaplvl); + } else if (0 == strncasecmp(ck[1], "l3cache", len)) { + mapping = PRTE_MAPPING_BYL3CACHE; + PRTE_HWLOC_MAKE_OBJ_CACHE(3, 
options->maptype, options->cmaplvl); + } else { + /* unknown spec */ + pmix_show_help("help-prte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], + jobppr); pmix_argv_free(ck); + free(jobppr); + return PRTE_ERR_SILENT; } - pmix_argv_free(ppr_req); + pmix_argv_free(ck); + /* if nothing was given, that's an error */ - if (0 == n) { + if (0 == mapping) { prte_output(0, "NOTHING GIVEN"); free(jobppr); return PRTE_ERR_SILENT; } - /* if more than one level was specified, then pruning will be reqd */ - if (1 < n) { - pruning_reqd = true; + /* record the results */ + PRTE_SET_MAPPING_POLICY(jdata->map->mapping, mapping); + if (!PRTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { + PRTE_SET_RANKING_POLICY(jdata->map->ranking, ranking); + } + options->map = PRTE_GET_MAPPING_POLICY(jdata->map->mapping); + options->rank = PRTE_GET_RANKING_POLICY(jdata->map->ranking); + if (PRTE_RANK_BY_SPAN == options->rank || + PRTE_RANK_BY_FILL == options->rank) { + if (options->map < PRTE_MAPPING_BYNUMA || + options->map > PRTE_MAPPING_BYHWTHREAD) { + pmix_show_help("help-prte-rmaps-base.txt", "must-map-by-obj", + true, prte_rmaps_base_print_mapping(options->map), + prte_rmaps_base_print_ranking(options->rank)); + free(jobppr); + return PRTE_ERR_SILENT; + } } prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: job %s assigned policy %s", PRTE_JOBID_PRINT(jdata->nspace), - prte_rmaps_base_print_mapping(jdata->map->mapping)); - - /* convenience */ - level = start; - lowest = prte_hwloc_levels[start]; - - for (idx = 0; idx < (prte_app_idx_t) jdata->apps->size; idx++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, idx))) { + "mca:rmaps:ppr: job %s assigned policy %s:%s", + PRTE_JOBID_PRINT(jdata->nspace), + prte_rmaps_base_print_mapping(options->map), + prte_rmaps_base_print_ranking(options->rank)); + + /* cycle thru the apps */ + for (idx = 0; idx < jdata->apps->size; idx++) { + app = (prte_app_context_t *) 
pmix_pointer_array_get_item(jdata->apps, idx); + if (NULL == app) { continue; } - - /* if the number of total procs was given, set that - * limit - otherwise, set to max so we simply fill - * all the nodes with the pattern - */ - if (0 < app->num_procs) { - total_procs = app->num_procs; - } else { - total_procs = PMIX_RANK_VALID; - } + options->total_nobjs = 0; /* get the available nodes */ PMIX_CONSTRUCT(&node_list, pmix_list_t); - rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, + rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, jdata, app, jdata->map->mapping, initial_map, false); if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); @@ -285,150 +199,115 @@ static int ppr_mapper(prte_job_t *jdata) } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; - - /* if a bookmark exists from some prior mapping, set us to start there */ - jdata->bookmark = prte_rmaps_base_get_starting_point(&node_list, jdata); + /* if the number of total procs was given, set that + * limit - otherwise, set to max so we simply fill + * all the nodes with the pattern + */ + if (0 == app->num_procs) { + if (HWLOC_OBJ_MACHINE == options->maptype) { + app->num_procs = ppr * pmix_list_get_size(&node_list); + } else { + nobjs = 0; + PMIX_LIST_FOREACH(node, &node_list, prte_node_t) { + /* get the number of objects of this type on this node */ + nobjs += prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); + } + if (0 == nobjs) { + rc = PRTE_ERR_NOT_FOUND; + goto error; + } + app->num_procs = ppr * nobjs; + } + } /* cycle across the nodes */ nprocs_mapped = 0; - for (item = pmix_list_get_first(&node_list); item != pmix_list_get_end(&node_list); - item = pmix_list_get_next(item)) { - node = (prte_node_t *) item; - /* bozo check */ - if (NULL == node->topology || NULL == node->topology->topo) { - pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - rc = 
PRTE_ERR_SILENT; - goto error; - } - /* add the node to the map, if needed */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - jdata->map->num_nodes++; + PMIX_LIST_FOREACH_SAFE(node, nd, &node_list, prte_node_t) { + options->nobjs = 0; + prte_rmaps_base_get_cpuset(jdata, node, options); + + if (!options->donotlaunch) { + rc = prte_rmaps_base_check_support(jdata, node, options); + if (PRTE_SUCCESS != rc) { + goto error; + } } - /* if we are mapping solely at the node level, just put - * that many procs on this node - */ - if (PRTE_HWLOC_NODE_LEVEL == start) { - obj = hwloc_get_root_obj(node->topology->topo); - for (j = 0; j < rmaps_ppr_global[start] && nprocs_mapped < total_procs; j++) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, idx))) { + + if (HWLOC_OBJ_MACHINE == options->maptype) { + options->nprocs = ppr; + /* check availability and set the target cpuset - this + * also computes the nprocs to be assigned capped by + * the number of available binding targets */ + if (!prte_rmaps_base_check_avail(jdata, app, node, &node_list, NULL, options)) { + continue; + } + for (j = 0; j < ppr && nprocs_mapped < app->num_procs; j++) { + proc = prte_rmaps_base_setup_proc(jdata, idx, node, NULL, options); + if (NULL == proc) { rc = PRTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, - obj, PMIX_POINTER); + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + goto error; + } } } else { - /* get the number of lowest resources on this node */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, lowest, cache_level); - /* Map up to number of slots_available on node or number of specified resource on - * node whichever is less. 
*/ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && node->slots_available < (int) nobjs) { - num_available = node->slots_available; - } else { - num_available = nobjs; + /* get the number of resources on this node */ + nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); + if (0 == nobjs) { + continue; } - /* map the specified number of procs to each such resource on this node, - * recording the locale of each proc so we know its cpuset - */ - for (j = 0; j < rmaps_ppr_global[start] && nprocs_mapped < total_procs; j++) { - for (i=0; i < num_available && nprocs_mapped < total_procs; i++) { - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, lowest, cache_level, i); - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, idx))) { + options->nprocs = ppr * nobjs; + /* map the specified number of procs to each such resource on this node */ + for (j = 0; j < nobjs && nprocs_mapped < app->num_procs; j++) { + obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, + options->maptype, options->cmaplvl, j); + if (!prte_rmaps_base_check_avail(jdata, app, node, &node_list, obj, options)) { + continue; + } + for (i=0; i < ppr && app->num_procs; i++) { + proc = prte_rmaps_base_setup_proc(jdata, idx, node, obj, options); + if (NULL == proc) { rc = PRTE_ERR_OUT_OF_RESOURCE; goto error; } nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - PRTE_ATTR_LOCAL, obj, PMIX_POINTER); - } - } - if (pruning_reqd) { - /* go up the ladder and prune the procs according to - * the specification, adjusting the count of procs on the - * node as we go - */ - level--; - prune(jdata->nspace, idx, node, &level, &nprocs_mapped); - } - } - - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { - /* set the total slots used */ - if ((int) node->num_procs <= node->slots) { - node->slots_inuse = (int) node->num_procs; - } else { - node->slots_inuse = node->slots; - } - - /* if no-oversubscribe was specified, 
check to see if - * we have violated the total slot specification - regardless, - * if slots_max was given, we are not allowed to violate it! - */ - if ((node->slots < (int) node->num_procs) || - (0 < node->slots_max && node->slots_max < (int) node->num_procs)) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, node->num_procs, app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { - pmix_show_help("help-prte-rmaps-base.txt", - "prte-rmaps-base:alloc-error", true, app->num_procs, - app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - pmix_show_help("help-prte-rmaps-base.txt", - "prte-rmaps-base:alloc-error", true, app->num_procs, - app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { goto error; } } } } + /* if we haven't mapped all the procs, continue on to the * next node */ - if (total_procs == nprocs_mapped) { + if (nprocs_mapped == app->num_procs) { break; } } if (0 == app->num_procs) { app->num_procs = 
nprocs_mapped; } - if (PMIX_RANK_VALID != total_procs && nprocs_mapped < total_procs) { + if (nprocs_mapped < options->nprocs) { /* couldn't map them all */ pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-too-many-procs", true, app->app, - app->num_procs, nprocs_mapped, total_procs, jobppr); + app->num_procs, nprocs_mapped, options->nprocs, jobppr); rc = PRTE_ERR_SILENT; goto error; } + /* calculate the ranks for this app */ + rc = prte_rmaps_base_compute_vpids(jdata, app, options); + if (PRTE_SUCCESS != rc) { + return rc; + } - /* track the total number of processes we mapped - must update - * this AFTER we compute vpids so that computation is done - * correctly - */ jdata->num_procs += app->num_procs; PMIX_LIST_DESTRUCT(&node_list); @@ -441,329 +320,3 @@ static int ppr_mapper(prte_job_t *jdata) free(jobppr); return rc; } - -static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj) -{ - unsigned k; - hwloc_obj_t nxt; - - if (1 < obj->arity) { - return obj; - } - for (k = 0; k < obj->arity; k++) { - nxt = find_split(topo, obj->children[k]); - if (NULL != nxt) { - return nxt; - } - } - return NULL; -} - -/* recursively climb the topology, pruning procs beyond that allowed - * by the given ppr - */ -static void prune(pmix_nspace_t jobid, prte_app_idx_t app_idx, prte_node_t *node, - prte_hwloc_level_t *level, pmix_rank_t *nmapped) -{ - hwloc_obj_t obj, top; - unsigned int i, nobjs; - hwloc_obj_type_t lvl; - unsigned cache_level = 0, k; - int nprocs; - hwloc_cpuset_t avail; - int n, limit, nmax, nunder, idx, idxmax = 0; - prte_proc_t *proc, *pptr, *procmax; - prte_hwloc_level_t ll; - char dang[64]; - hwloc_obj_t locale; - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: pruning level %d", *level); - - /* convenience */ - ll = *level; - - /* convenience */ - lvl = prte_hwloc_levels[ll]; - limit = rmaps_ppr_global[ll]; - - if (0 == limit) { - /* no limit at this level, so move up if necessary */ - if (0 == ll) { - /* done 
*/ - return; - } - --(*level); - prune(jobid, app_idx, node, level, nmapped); - return; - } - - /* handle the darn cache thing again */ - if (PRTE_HWLOC_L3CACHE_LEVEL == ll) { - cache_level = 3; - } else if (PRTE_HWLOC_L2CACHE_LEVEL == ll) { - cache_level = 2; - } else if (PRTE_HWLOC_L1CACHE_LEVEL == ll) { - cache_level = 1; - } - - /* get the number of resources at this level on this node */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, lvl, cache_level); - - /* for each resource, compute the number of procs sitting - * underneath it and check against the limit - */ - for (i = 0; i < nobjs; i++) { - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, lvl, cache_level, i); - /* get the available cpuset */ - avail = obj->cpuset; - - /* look at the intersection of this object's cpuset and that - * of each proc in the job/app - if they intersect, then count this proc - * against the limit - */ - nprocs = 0; - for (n = 0; n < node->procs->size; n++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, n))) { - continue; - } - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jobid) || proc->app_idx != app_idx) { - continue; - } - locale = NULL; - if (prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, (void **) &locale, PMIX_POINTER)) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - return; - } - if (hwloc_bitmap_intersects(avail, locale->cpuset)) { - nprocs++; - } - } - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: found %d procs limit %d", nprocs, limit); - - /* check against the limit */ - while (limit < nprocs) { - /* need to remove procs - do this in a semi-intelligent - * manner to provide a little load balancing by cycling - * across the objects beneath this one, removing procs - * in a round-robin fashion until the limit is satisfied - * - * NOTE: I'm sure someone more knowledgeable with hwloc - * will come up with a more efficient way to do this, so - * consider this is 
a starting point - */ - - /* find the first level that has more than - * one child beneath it - if all levels - * have only one child, then return this - * object - */ - top = find_split(node->topology->topo, obj); - hwloc_obj_type_snprintf(dang, 64, top, 1); - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang); - - /* cycle across the children of this object */ - nmax = 0; - procmax = NULL; - idx = 0; - /* find the child with the most procs underneath it */ - for (k = 0; k < top->arity && limit < nprocs; k++) { - /* get this object's available cpuset */ - nunder = 0; - pptr = NULL; - for (n = 0; n < node->procs->size; n++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, n); - if (NULL == proc) { - continue; - } - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jobid) || proc->app_idx != app_idx) { - continue; - } - locale = NULL; - if (prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - (void **) &locale, PMIX_POINTER)) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - return; - } - if (hwloc_bitmap_intersects(top->children[k]->cpuset, locale->cpuset)) { - nunder++; - if (NULL == pptr) { - /* save the location of the first proc under this object */ - pptr = proc; - idx = n; - } - } - } - if (nmax < nunder) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d", k, nunder, - nmax); - nmax = nunder; - procmax = pptr; - idxmax = idx; - } - } - if (NULL == procmax) { - /* can't find anything to remove - error out */ - goto error; - } - /* remove it */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: removing proc at posn %d", idxmax); - pmix_pointer_array_set_item(node->procs, idxmax, NULL); - node->num_procs--; - node->slots_inuse--; - if (node->slots_inuse < 0) { - node->slots_inuse = 0; - } - nprocs--; - *nmapped -= 1; - PMIX_RELEASE(procmax); - } - } - /* finished with this level - 
move up if necessary */ - if (0 == ll) { - return; - } - --(*level); - prune(jobid, app_idx, node, level, nmapped); - return; - -error: - prte_output(0, "INFINITE LOOP"); -} - -static int assign_locations(prte_job_t *jdata) -{ - int i, j, m, n; - prte_mca_base_component_t *c = &prte_rmaps_ppr_component.base_version; - prte_node_t *node; - prte_proc_t *proc; - prte_app_context_t *app; - hwloc_obj_type_t level; - hwloc_obj_t obj; - unsigned int cache_level = 0; - int ppr, cnt, nobjs, nprocs_mapped; - char **ppr_req, **ck, *jobppr; - - if (NULL == jdata->map->last_mapper - || 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { - /* a mapper has been specified, and it isn't me */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: job %s not using ppr assign: %s", - PRTE_JOBID_PRINT(jdata->nspace), - (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); - return PRTE_ERR_TAKE_NEXT_OPTION; - } - - jobppr = NULL; - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_PPR, (void **) &jobppr, PMIX_STRING) || - NULL == jobppr) { - return PRTE_ERR_BAD_PARAM; - } - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s", - PRTE_JOBID_PRINT(jdata->nspace), jobppr, - prte_rmaps_base_print_mapping(jdata->map->mapping)); - - /* pickup the object level */ - if (PRTE_MAPPING_BYNODE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_MACHINE; - } else if (PRTE_MAPPING_BYHWTHREAD == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_PU; - } else if (PRTE_MAPPING_BYCORE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_CORE; - } else if (PRTE_MAPPING_BYPACKAGE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_PACKAGE; - } else if (PRTE_MAPPING_BYNUMA == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_NUMANODE; - } else if 
(PRTE_MAPPING_BYL1CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_L1CACHE; - cache_level = 1; - } else if (PRTE_MAPPING_BYL2CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_L2CACHE; - cache_level = 2; - } else if (PRTE_MAPPING_BYL3CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - level = HWLOC_OBJ_L3CACHE; - cache_level = 3; - } else { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - return PRTE_ERR_TAKE_NEXT_OPTION; - } - - /* get the ppr value */ - ppr_req = pmix_argv_split(jobppr, ','); - ck = pmix_argv_split(ppr_req[0], ':'); - ppr = strtol(ck[0], NULL, 10); - pmix_argv_free(ck); - pmix_argv_free(ppr_req); - - /* start assigning procs to objects, filling each object as we go until - * all procs are assigned. */ - for (n = 0; n < jdata->apps->size; n++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n))) { - continue; - } - nprocs_mapped = 0; - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL == node) { - continue; - } - if (NULL == node->topology || NULL == node->topology->topo) { - pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - return PRTE_ERR_SILENT; - } - if (HWLOC_OBJ_MACHINE == level) { - obj = hwloc_get_root_obj(node->topology->topo); - for (j = 0; j < node->procs->size; j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, - obj, PMIX_POINTER); - } - } else { - /* get the number of resources on this node at this level */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, level, - cache_level); - - /* map the specified number of procs to each such resource on this node, - * recording the locale of each proc 
so we know its cpuset - */ - for (i = 0; i < nobjs; i++) { - cnt = 0; - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, level, cache_level, - i); - for (j = 0; - j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; - j++) { - proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j); - if (NULL == proc) { - continue; - } - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - continue; - } - /* if we already assigned it, then skip */ - if (prte_get_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, NULL, PMIX_POINTER)) { - continue; - } - nprocs_mapped++; - cnt++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - PRTE_ATTR_LOCAL, obj, PMIX_POINTER); - } - } - } - } - } - return PRTE_SUCCESS; -} diff --git a/src/mca/rmaps/ppr/rmaps_ppr_component.c b/src/mca/rmaps/ppr/rmaps_ppr_component.c index ea2ed115d1..4e79c38a57 100644 --- a/src/mca/rmaps/ppr/rmaps_ppr_component.c +++ b/src/mca/rmaps/ppr/rmaps_ppr_component.c @@ -35,11 +35,13 @@ static int prte_rmaps_ppr_register(void); prte_rmaps_base_component_t prte_rmaps_ppr_component = { .base_version = { - PRTE_RMAPS_BASE_VERSION_2_0_0, + PRTE_RMAPS_BASE_VERSION_4_0_0, .mca_component_name = "ppr", - PRTE_MCA_BASE_MAKE_VERSION(component, PRTE_MAJOR_VERSION, PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), + PRTE_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), .mca_open_component = prte_rmaps_ppr_open, .mca_close_component = prte_rmaps_ppr_close, .mca_query_component = prte_rmaps_ppr_query, diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index f68133758e..77df9ba4fe 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -53,9 +53,12 @@ #include "src/runtime/prte_globals.h" #include "src/util/pmix_show_help.h" -static int prte_rmaps_rf_map(prte_job_t *jdata); +static int prte_rmaps_rf_map(prte_job_t *jdata, + 
prte_rmaps_options_t *options); -prte_rmaps_base_module_t prte_rmaps_rank_file_module = {.map_job = prte_rmaps_rf_map}; +prte_rmaps_base_module_t prte_rmaps_rank_file_module = { + .map_job = prte_rmaps_rf_map +}; static int prte_rmaps_rank_file_parse(const char *); static char *prte_rmaps_rank_file_parse_string_or_int(void); @@ -70,7 +73,8 @@ static int num_ranks = 0; /* * Create a rank_file mapping for the job. */ -static int prte_rmaps_rf_map(prte_job_t *jdata) +static int prte_rmaps_rf_map(prte_job_t *jdata, + prte_rmaps_options_t *options) { prte_job_map_t *map; prte_app_context_t *app = NULL; @@ -85,7 +89,7 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) int rc; prte_proc_t *proc; prte_mca_base_component_t *c = &prte_rmaps_rank_file_component.super.base_version; - char *slots, *jobslots = NULL; + char *slots; bool initial_map = true; char *rankfile = NULL; prte_binding_policy_t bind; @@ -112,7 +116,7 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) PRTE_JOBID_PRINT(jdata->nspace)); return PRTE_ERR_TAKE_NEXT_OPTION; } - if (PRTE_BIND_ORDERED_REQUESTED(jdata->map->binding)) { + if (options->ordered) { /* NOT FOR US */ prte_output_verbose(5, prte_rmaps_base_framework.framework_output, "mca:rmaps:rf: job %s binding order requested - rank_file cannot map", @@ -139,13 +143,13 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) /* convenience def */ map = jdata->map; - bind = PRTE_GET_BINDING_POLICY(jdata->map->binding); /* setup the node list */ PMIX_CONSTRUCT(&node_list, pmix_list_t); /* pickup the first app - there must be at least one */ - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, 0))) { + app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, 0); + if (NULL == app) { rc = PRTE_ERR_SILENT; goto error; } @@ -165,9 +169,6 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) /* END SANITY CHECKS */ - /* see if the job was given a slot list */ - prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) 
&jobslots, PMIX_STRING); - /* start at the beginning... */ vpid_start = 0; jdata->num_procs = 0; @@ -181,7 +182,8 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) /* cycle through the app_contexts, mapping them sequentially */ for (i = 0; i < jdata->apps->size; i++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { + app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i); + if (NULL == app) { continue; } @@ -189,9 +191,9 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) * use since that can now be modified with a hostfile and/or -host * option */ - if (PRTE_SUCCESS - != (rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, map->mapping, - initial_map, false))) { + rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, jdata, app, + options->map, initial_map, false); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); goto error; } @@ -211,12 +213,11 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) for (k = 0; k < app->num_procs; k++) { rank = vpid_start + k; /* get the rankfile entry for this rank */ - if (NULL - == (rfmap = (prte_rmaps_rank_file_map_t *) pmix_pointer_array_get_item(&rankmap, - rank))) { + rfmap = (prte_rmaps_rank_file_map_t *) pmix_pointer_array_get_item(&rankmap, rank); + if (NULL == rfmap) { /* if this job was given a slot-list, then use it */ - if (NULL != jobslots) { - slots = jobslots; + if (NULL != options->cpuset) { + slots = options->cpuset; } else if (NULL != prte_hwloc_default_cpu_list) { /* if we were give a default slot-list, then use it */ slots = prte_hwloc_default_cpu_list; @@ -300,90 +301,36 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) rc = PRTE_ERR_SILENT; goto error; } - /* ensure the node is in the map */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PMIX_RETAIN(node); - pmix_pointer_array_add(map->nodes, node); - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - ++(jdata->map->num_nodes); + if (!options->donotlaunch) { + rc = 
prte_rmaps_base_check_support(jdata, node, options); + if (PRTE_SUCCESS != rc) { + return rc; + } + } + prte_rmaps_base_get_cpuset(jdata, node, options); + if (!prte_rmaps_base_check_avail(jdata, app, node, &node_list, NULL, options)) { + pmix_show_help("help-rmaps_rank_file.txt", "bad-host", true, rfmap->node_name); + rc = PRTE_ERR_SILENT; + goto error; } - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, i))) { + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, NULL, options); + if (NULL == proc) { PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); rc = PRTE_ERR_OUT_OF_RESOURCE; goto error; } /* check if we are oversubscribed */ - if ((node->slots < (int) node->num_procs) || - (0 < node->slots_max && node->slots_max < (int) node->num_procs)) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, - node->num_procs, app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + goto error; } - /* set the vpid */ + /* set the vpid */ proc->name.rank = rank; - - if (NULL != slots && PRTE_BIND_TO_NONE != bind) { - /* setup the bitmap */ - hwloc_cpuset_t bitmap; - char *cpu_bitmap; - if (NULL == node->topology || NULL == node->topology->topo) { - /* not allowed - for rank-file, we must have - * the topology info - */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-topology", true, - node->name); - rc = PRTE_ERR_SILENT; - goto error; - } - bitmap = hwloc_bitmap_alloc(); - /* parse the slot_list to find the package and core */ - rc = prte_hwloc_base_cpu_list_parse(slots, node->topology->topo, bitmap); - if 
(PRTE_ERR_NOT_FOUND == rc) { - char *tmp = prte_hwloc_base_cset2str(hwloc_topology_get_allowed_cpuset(node->topology->topo), - false, node->topology->topo); - pmix_show_help("help-rmaps_rank_file.txt", "missing-cpu", true, - prte_tool_basename, slots, tmp); - free(tmp); - rc = PRTE_ERR_SILENT; - hwloc_bitmap_free(bitmap); - goto error; - } else if (PRTE_ERROR == rc) { - pmix_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile); - rc = PRTE_ERR_SILENT; - hwloc_bitmap_free(bitmap); - goto error; - } else { - PRTE_ERROR_LOG(rc); - hwloc_bitmap_free(bitmap); - goto error; - } - /* note that we cannot set the proc locale to any specific object - * as the slot list may have assigned it to more than one - so - * leave that field NULL - */ - /* set the proc to the specified map */ - hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap); - prte_set_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP, PRTE_ATTR_GLOBAL, - cpu_bitmap, PMIX_STRING); - /* cleanup */ - free(cpu_bitmap); - hwloc_bitmap_free(bitmap); - } - /* insert the proc into the proper place */ - if (PRTE_SUCCESS - != (rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc))) { + rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); - return rc; + goto error; } jdata->num_procs++; } @@ -392,13 +339,10 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ - while (NULL != (item = pmix_list_remove_first(&node_list))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&node_list); + PMIX_LIST_DESTRUCT(&node_list); PMIX_CONSTRUCT(&node_list, pmix_list_t); } - PMIX_DESTRUCT(&node_list); + PMIX_LIST_DESTRUCT(&node_list); /* cleanup the rankmap */ for (i = 0; i < rankmap.size; i++) { @@ -407,10 +351,6 @@ static int prte_rmaps_rf_map(prte_job_t *jdata) } } PMIX_DESTRUCT(&rankmap); - /* mark the job as fully described */ - 
prte_set_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); - if (NULL != rankfile) { free(rankfile); } diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file_component.c b/src/mca/rmaps/rank_file/rmaps_rank_file_component.c index 362ae659f3..763b369c49 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file_component.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file_component.c @@ -52,11 +52,13 @@ prte_rmaps_rf_component_t prte_rmaps_rank_file_component = { information about the component itself */ .base_version = { - PRTE_RMAPS_BASE_VERSION_2_0_0, + PRTE_RMAPS_BASE_VERSION_4_0_0, .mca_component_name = "rank_file", - PRTE_MCA_BASE_MAKE_VERSION(component, PRTE_MAJOR_VERSION, PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), + PRTE_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), .mca_query_component = prte_rmaps_rank_file_query, }, .base_data = { diff --git a/src/mca/rmaps/rmaps.h b/src/mca/rmaps/rmaps.h index 1d541263ac..d310988af6 100644 --- a/src/mca/rmaps/rmaps.h +++ b/src/mca/rmaps/rmaps.h @@ -13,7 +13,7 @@ * Copyright (c) 2011 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2017-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -69,27 +69,20 @@ BEGIN_C_DECLS /* map a job - used by the HNP to compute the #procs on each node. * This is passed to the backend daemons as a regex which they * use to create an prte_job_map_t for the job */ -typedef int (*prte_rmaps_base_module_map_fn_t)(prte_job_t *jdata); - -/* assign a location to each process. 
Used by the backend daemons, - * this function takes the prte_job_map_t created from the regex - * and assigns each process to a specific location within the - * hardware topology based on the --map-by directive */ -typedef int (*prte_rmaps_base_module_assign_loc_fn_t)(prte_job_t *jdata); +typedef int (*prte_rmaps_base_module_map_fn_t)(prte_job_t *jdata, + prte_rmaps_options_t *options); /* - * rmaps module version 3.0.0 + * rmaps module version 4.0.0 */ -struct prte_rmaps_base_module_3_0_0_t { +struct prte_rmaps_base_module_4_0_0_t { /** Mapping function pointer */ prte_rmaps_base_module_map_fn_t map_job; - /* assign locations */ - prte_rmaps_base_module_assign_loc_fn_t assign_locations; }; /** Convenience typedef */ -typedef struct prte_rmaps_base_module_3_0_0_t prte_rmaps_base_module_3_0_0_t; +typedef struct prte_rmaps_base_module_4_0_0_t prte_rmaps_base_module_4_0_0_t; /** Convenience typedef */ -typedef prte_rmaps_base_module_3_0_0_t prte_rmaps_base_module_t; +typedef prte_rmaps_base_module_4_0_0_t prte_rmaps_base_module_t; /* * rmaps component @@ -98,16 +91,16 @@ typedef prte_rmaps_base_module_3_0_0_t prte_rmaps_base_module_t; /** * rmaps component version 3.0.0 */ -struct prte_rmaps_base_component_3_0_0_t { +struct prte_rmaps_base_component_4_0_0_t { /** Base MCA structure */ prte_mca_base_component_t base_version; /** Base MCA data */ prte_mca_base_component_data_t base_data; }; /** Convenience typedef */ -typedef struct prte_rmaps_base_component_3_0_0_t prte_rmaps_base_component_3_0_0_t; +typedef struct prte_rmaps_base_component_4_0_0_t prte_rmaps_base_component_4_0_0_t; /** Convenience typedef */ -typedef prte_rmaps_base_component_3_0_0_t prte_rmaps_base_component_t; +typedef prte_rmaps_base_component_4_0_0_t prte_rmaps_base_component_t; END_C_DECLS diff --git a/src/mca/rmaps/rmaps_types.h b/src/mca/rmaps/rmaps_types.h index eb3fe08bb6..c3802e8fe9 100644 --- a/src/mca/rmaps/rmaps_types.h +++ b/src/mca/rmaps/rmaps_types.h @@ -74,18 +74,63 @@ struct 
prte_job_map_t { typedef struct prte_job_map_t prte_job_map_t; PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_job_map_t); -/** +typedef struct { + /* input info */ + uint16_t cpus_per_rank; + bool use_hwthreads; + int stream; + int verbosity; + char *cpuset; + hwloc_cpuset_t job_cpuset; + bool bindsupport; + bool donotlaunch; + bool membind_warned; + bool oversubscribe; + bool overload; + + /* mapping values */ + prte_mapping_policy_t map; + bool mapspan; + bool ordered; + prte_binding_policy_t mapdepth; + unsigned ncpus; + int nprocs; + hwloc_obj_type_t maptype; + unsigned cmaplvl; + + /* ranking values */ + prte_ranking_policy_t rank; + bool userranked; + unsigned nnodes; + unsigned total_nobjs; + unsigned nobjs; + pmix_rank_t last_rank; + + /* binding values */ + prte_binding_policy_t bind; + bool dobind; + hwloc_obj_type_t hwb; + unsigned clvl; + + /* usage tracking */ + hwloc_cpuset_t target; + hwloc_obj_t obj; + +} prte_rmaps_options_t; + + +/* + ** * Macro for use in components that are of type rmaps */ -#define PRTE_RMAPS_BASE_VERSION_2_0_0 PRTE_MCA_BASE_VERSION_2_1_0("rmaps", 2, 0, 0) +#define PRTE_RMAPS_BASE_VERSION_4_0_0 PRTE_MCA_BASE_VERSION_2_1_0("rmaps", 4, 0, 0) /* define map-related directives */ #define PRTE_MAPPING_NO_USE_LOCAL 0x0100 #define PRTE_MAPPING_NO_OVERSUBSCRIBE 0x0200 #define PRTE_MAPPING_SUBSCRIBE_GIVEN 0x0400 #define PRTE_MAPPING_SPAN 0x0800 -/* an error flag */ -#define PRTE_MAPPING_CONFLICTED 0x1000 +#define PRTE_MAPPING_ORDERED 0x1000 /* directives given */ #define PRTE_MAPPING_LOCAL_GIVEN 0x2000 #define PRTE_MAPPING_GIVEN 0x4000 @@ -110,17 +155,18 @@ PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_job_map_t); /* now take the other round-robin options */ #define PRTE_MAPPING_BYSLOT 9 #define PRTE_MAPPING_BYDIST 10 +#define PRTE_MAPPING_PELIST 11 /* convenience - declare anything <= 15 to be round-robin*/ -#define PRTE_MAPPING_RR 16 +#define PRTE_MAPPING_RR 16 /* sequential policy */ -#define PRTE_MAPPING_SEQ 20 +#define PRTE_MAPPING_SEQ 20 /* 
staged execution mapping */ -#define PRTE_MAPPING_STAGED 21 +#define PRTE_MAPPING_STAGED 21 /* rank file and other user-defined mapping */ -#define PRTE_MAPPING_BYUSER 22 +#define PRTE_MAPPING_BYUSER 22 /* pattern-based mapping */ -#define PRTE_MAPPING_PPR 23 +#define PRTE_MAPPING_PPR 23 /* macro to separate out the mapping policy * from the directives */ @@ -130,23 +176,18 @@ PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_job_map_t); #define PRTE_SET_MAPPING_POLICY(target, pol) (target) = (pol) | ((target) &0xff00) /* define ranking directives */ -#define PRTE_RANKING_SPAN 0x1000 -#define PRTE_RANKING_FILL 0x2000 -#define PRTE_RANKING_GIVEN 0x4000 +#define PRTE_RANKING_GIVEN 0x1000 #define PRTE_SET_RANKING_DIRECTIVE(target, pol) (target) |= (pol) #define PRTE_UNSET_RANKING_DIRECTIVE(target, pol) (target) &= ~(pol) #define PRTE_GET_RANKING_DIRECTIVE(pol) ((pol) &0xf000) /* define ranking policies */ #define PRTE_RANK_BY_NODE 1 -#define PRTE_RANK_BY_NUMA 2 -#define PRTE_RANK_BY_PACKAGE 3 -#define PRTE_RANK_BY_L3CACHE 4 -#define PRTE_RANK_BY_L2CACHE 5 -#define PRTE_RANK_BY_L1CACHE 6 -#define PRTE_RANK_BY_CORE 7 -#define PRTE_RANK_BY_HWTHREAD 8 -#define PRTE_RANK_BY_SLOT 9 +#define PRTE_RANK_BY_SLOT 2 +#define PRTE_RANK_BY_FILL 3 +#define PRTE_RANK_BY_SPAN 4 +#define PRTE_RANKING_BYUSER 5 + #define PRTE_GET_RANKING_POLICY(pol) ((pol) &0x0fff) /* macro to determine if ranking policy is set */ #define PRTE_RANKING_POLICY_IS_SET(pol) ((pol) &0x0fff) diff --git a/src/mca/rmaps/round_robin/Makefile.am b/src/mca/rmaps/round_robin/Makefile.am index d30a2ca8b7..5b1c7c898d 100644 --- a/src/mca/rmaps/round_robin/Makefile.am +++ b/src/mca/rmaps/round_robin/Makefile.am @@ -12,6 +12,7 @@ # Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved # Copyright (c) 2017-2020 Intel, Inc. All rights reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022 Nanook Consulting. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -25,8 +26,7 @@ sources = \ rmaps_rr.c \ rmaps_rr.h \ rmaps_rr_component.c \ - rmaps_rr_mappers.c \ - rmaps_rr_assign.c + rmaps_rr_mappers.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/src/mca/rmaps/round_robin/rmaps_rr.c b/src/mca/rmaps/round_robin/rmaps_rr.c index 16dfc9ab5a..e9e6009d63 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr.c +++ b/src/mca/rmaps/round_robin/rmaps_rr.c @@ -44,7 +44,8 @@ /* * Create a round-robin mapping for the job. */ -static int prte_rmaps_rr_map(prte_job_t *jdata) +static int prte_rmaps_rr_map(prte_job_t *jdata, + prte_rmaps_options_t *options) { prte_app_context_t *app; int i; @@ -54,6 +55,7 @@ static int prte_rmaps_rr_map(prte_job_t *jdata) int rc; prte_mca_base_component_t *c = &prte_rmaps_round_robin_component.base_version; bool initial_map = true; + char **tmp; /* this mapper can only handle initial launch * when rr mapping is desired - allow @@ -82,7 +84,8 @@ static int prte_rmaps_rr_map(prte_job_t *jdata) } prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping job %s", PRTE_JOBID_PRINT(jdata->nspace)); + "mca:rmaps:rr: mapping job %s", + PRTE_JOBID_PRINT(jdata->nspace)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { @@ -97,7 +100,8 @@ static int prte_rmaps_rr_map(prte_job_t *jdata) for (i = 0; i < jdata->apps->size; i++) { hwloc_obj_type_t target; unsigned cache_level; - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { + app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i); + if (NULL == app) { continue; } @@ -119,122 +123,62 @@ static int prte_rmaps_rr_map(prte_job_t *jdata) * use since that can now be modified with a hostfile and/or -host * option */ - if (PRTE_SUCCESS - != (rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, - jdata->map->mapping, initial_map, 
false))) { + rc = prte_rmaps_base_get_target_nodes(&node_list, &num_slots, jdata, app, + jdata->map->mapping, initial_map, false); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); goto error; } /* flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; - /* if a bookmark exists from some prior mapping, set us to start there */ - jdata->bookmark = prte_rmaps_base_get_starting_point(&node_list, jdata); - if (0 == app->num_procs) { - /* set the num_procs to equal the number of slots on these - * mapped nodes, taking into account the number of cpus/rank - */ - app->num_procs = num_slots; - /* sometimes, we have only one "slot" assigned, but may - * want more than one cpu/rank - so ensure we always wind - * up with at least one proc */ - if (0 == app->num_procs) { - app->num_procs = 1; + if (NULL != options->cpuset && !options->overload) { + tmp = pmix_argv_split(options->cpuset, ','); + app->num_procs = pmix_argv_count(tmp); + pmix_argv_free(tmp); + } else { + /* set the num_procs to equal the number of slots on these + * mapped nodes, taking into account the number of cpus/rank + */ + app->num_procs = num_slots / options->cpus_per_rank; + /* sometimes, we have only one "slot" assigned, but may + * want more than one cpu/rank - so ensure we always wind + * up with at least one proc */ + if (0 == app->num_procs) { + app->num_procs = 1; + } } } /* Make assignments */ - if (PRTE_MAPPING_BYNODE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_bynode(jdata, app, &node_list, num_slots, app->num_procs); - } else if (PRTE_MAPPING_BYSLOT == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } else if (PRTE_MAPPING_BYHWTHREAD == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, - HWLOC_OBJ_PU, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by 
this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYCORE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, - HWLOC_OBJ_CORE, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYL1CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(1, target, cache_level); - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, target, - cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYL2CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(2, target, cache_level); - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, target, - cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYL3CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - 
PRTE_HWLOC_MAKE_OBJ_CACHE(3, target, cache_level); - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, target, - cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYNUMA == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, - HWLOC_OBJ_NUMANODE, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, num_slots, app->num_procs); - } - } else if (PRTE_MAPPING_BYPACKAGE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_byobj(jdata, app, &node_list, num_slots, app->num_procs, - HWLOC_OBJ_PACKAGE, 0); + if (PRTE_MAPPING_BYNODE == options->map) { + rc = prte_rmaps_rr_bynode(jdata, app, &node_list, + num_slots, app->num_procs, + options); + } else if (PRTE_MAPPING_BYSLOT == options->map) { + rc = prte_rmaps_rr_byslot(jdata, app, &node_list, + num_slots, app->num_procs, + options); + } else if (PRTE_MAPPING_PELIST == options->map) { + rc = prte_rmaps_rr_bycpu(jdata, app, &node_list, + num_slots, app->num_procs, + options); + } else { + rc = prte_rmaps_rr_byobj(jdata, app, &node_list, + num_slots, app->num_procs, + options); if (PRTE_ERR_NOT_FOUND == rc) { /* if the mapper couldn't map by this object because * it isn't available, but the error allows us to try * byslot, then do so */ PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_byslot(jdata, app, &node_list, 
num_slots, app->num_procs); + options->map = PRTE_MAPPING_BYSLOT; + rc = prte_rmaps_rr_byslot(jdata, app, &node_list, + num_slots, app->num_procs, + options); } - } else { - /* unrecognized mapping directive */ - pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "mapping", - prte_rmaps_base_print_mapping(jdata->map->mapping)); - rc = PRTE_ERR_SILENT; - goto error; } if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); @@ -250,132 +194,17 @@ static int prte_rmaps_rr_map(prte_job_t *jdata) /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ - while (NULL != (item = pmix_list_remove_first(&node_list))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&node_list); + PMIX_LIST_DESTRUCT(&node_list); } return PRTE_SUCCESS; error: - while (NULL != (item = pmix_list_remove_first(&node_list))) { - PMIX_RELEASE(item); - } - PMIX_DESTRUCT(&node_list); - - return rc; -} + PMIX_LIST_DESTRUCT(&node_list); -static int prte_rmaps_rr_assign_locations(prte_job_t *jdata) -{ - prte_mca_base_component_t *c = &prte_rmaps_round_robin_component.base_version; - hwloc_obj_type_t target; - unsigned cache_level; - int rc; - - if (NULL == jdata->map->last_mapper || - 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { - /* a mapper has been specified, and it isn't me */ - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: job %s not using rr mapper", - PRTE_JOBID_PRINT(jdata->nspace)); - return PRTE_ERR_TAKE_NEXT_OPTION; - } - - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assign locations for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - - /* if the mapping directive was byslot or bynode, then we - * assign locations to the root object level */ - if (PRTE_MAPPING_BYNODE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping) || - PRTE_MAPPING_BYSLOT == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - return prte_rmaps_rr_assign_root_level(jdata); 
- } - - /* otherwise, assign by object */ - if (PRTE_MAPPING_BYHWTHREAD == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_PU, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't assign by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYCORE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CORE, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYL1CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(1, target, cache_level); - rc = prte_rmaps_rr_assign_byobj(jdata, target, cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYL2CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - PRTE_HWLOC_MAKE_OBJ_CACHE(2, target, cache_level); - rc = prte_rmaps_rr_assign_byobj(jdata, target, cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYL3CACHE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - 
PRTE_HWLOC_MAKE_OBJ_CACHE(3, target, cache_level); - rc = prte_rmaps_rr_assign_byobj(jdata, target, cache_level); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYNUMA == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_NUMANODE, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else if (PRTE_MAPPING_BYPACKAGE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { - rc = prte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_PACKAGE, 0); - if (PRTE_ERR_NOT_FOUND == rc) { - /* if the mapper couldn't map by this object because - * it isn't available, but the error allows us to try - * byslot, then do so - */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - rc = prte_rmaps_rr_assign_root_level(jdata); - } - } else { - /* unrecognized mapping directive */ - pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "mapping", - prte_rmaps_base_print_mapping(jdata->map->mapping)); - rc = PRTE_ERR_SILENT; - } return rc; } -prte_rmaps_base_module_t prte_rmaps_round_robin_module - = {.map_job = prte_rmaps_rr_map, .assign_locations = prte_rmaps_rr_assign_locations}; +prte_rmaps_base_module_t prte_rmaps_round_robin_module = { + .map_job = prte_rmaps_rr_map +}; diff --git a/src/mca/rmaps/round_robin/rmaps_rr.h b/src/mca/rmaps/round_robin/rmaps_rr.h index b2cbfde0a2..5b9eafa9b0 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr.h +++ b/src/mca/rmaps/round_robin/rmaps_rr.h @@ -42,20 +42,21 @@ extern 
prte_rmaps_base_module_t prte_rmaps_round_robin_module; PRTE_MODULE_EXPORT int prte_rmaps_rr_bynode(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, int32_t num_slots, - pmix_rank_t nprocs); + pmix_rank_t nprocs, prte_rmaps_options_t *options); PRTE_MODULE_EXPORT int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, int32_t num_slots, - pmix_rank_t nprocs); + pmix_rank_t nprocs, prte_rmaps_options_t *options); -PRTE_MODULE_EXPORT int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, - pmix_list_t *node_list, int32_t num_slots, - pmix_rank_t num_procs, hwloc_obj_type_t target, - unsigned cache_level); - -PRTE_MODULE_EXPORT int prte_rmaps_rr_assign_root_level(prte_job_t *jdata); +PRTE_MODULE_EXPORT int prte_rmaps_rr_byobj(prte_job_t *jdata, + prte_app_context_t *app, + pmix_list_t *node_list, + int32_t num_slots, + pmix_rank_t num_procs, + prte_rmaps_options_t *options); -PRTE_MODULE_EXPORT int prte_rmaps_rr_assign_byobj(prte_job_t *jdata, hwloc_obj_type_t target, - unsigned cache_level); +PRTE_MODULE_EXPORT int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, + pmix_list_t *node_list, int32_t num_slots, + pmix_rank_t num_procs, prte_rmaps_options_t *options); END_C_DECLS diff --git a/src/mca/rmaps/round_robin/rmaps_rr_assign.c b/src/mca/rmaps/round_robin/rmaps_rr_assign.c deleted file mode 100644 index c281332433..0000000000 --- a/src/mca/rmaps/round_robin/rmaps_rr_assign.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "prte_config.h" -#include "constants.h" - -#include - -#include "src/hwloc/hwloc-internal.h" -#include "src/util/output.h" - -#include "src/mca/errmgr/errmgr.h" -#include "src/runtime/prte_globals.h" -#include "src/util/name_fns.h" -#include "src/util/pmix_show_help.h" - -#include "rmaps_rr.h" -#include "src/mca/rmaps/base/base.h" -#include "src/mca/rmaps/base/rmaps_private.h" - -int prte_rmaps_rr_assign_root_level(prte_job_t *jdata) -{ - int i, m; - prte_node_t *node; - prte_proc_t *proc; - hwloc_obj_t obj = NULL; - - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assigning procs to root level for job %s", - PRTE_JOBID_PRINT(jdata->nspace)); - - for (m = 0; m < jdata->map->nodes->size; m++) { - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m))) { - continue; - } - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:slot working node %s", node->name); - /* get the root object as we are not assigning - * locale here except at the node level */ - if (NULL == node->topology || NULL == node->topology->topo) { - /* nothing we can do */ - continue; - } - obj = hwloc_get_root_obj(node->topology->topo); - for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, i))) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:assign skipping proc %s - from another job", - PRTE_NAME_PRINT(&proc->name)); - continue; - } - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, - PMIX_POINTER); - } - } - return PRTE_SUCCESS; -} - -/* mapping by hwloc object looks a lot like mapping by node, - * but has the added complication of possibly having different - * numbers of objects on each node - */ -int prte_rmaps_rr_assign_byobj(prte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) -{ - int start, j, m, n, k, npus, cpus_per_rank; - prte_app_context_t *app; - prte_node_t *node; - prte_proc_t *proc; - hwloc_obj_t obj = NULL, root; - unsigned int nobjs; - uint16_t u16, *u16ptr = &u16; - char *job_cpuset; - prte_hwloc_topo_data_t *rdata; - hwloc_cpuset_t available, mycpus; - bool use_hwthread_cpus; - - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assigning locations by %s for job %s", - hwloc_obj_type_string(target), PRTE_JOBID_PRINT(jdata->nspace)); - - /* see if this job has a "soft" cgroup assignment */ - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING)) { - job_cpuset = NULL; - } - - /* see if they want multiple cpus/rank */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, PMIX_UINT16)) { - cpus_per_rank = u16; - } else { - cpus_per_rank = 1; - } - - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; - } - - /* start mapping procs onto objects, filling each object as we go until - * all procs are mapped. 
If one pass doesn't catch all the required procs, - * then loop thru the list again to handle the oversubscription - */ - for (n = 0; n < jdata->apps->size; n++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, n))) { - continue; - } - for (m = 0; m < jdata->map->nodes->size; m++) { - if (NULL - == (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m))) { - continue; - } - if (NULL == node->topology || NULL == node->topology->topo) { - pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - return PRTE_ERR_SILENT; - } - /* get the number of objects of this type on this node */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - if (0 == nobjs) { - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: found NO %s objects on node %s", - hwloc_obj_type_string(target), node->name); - continue; - } - - /* get the available processors on this node */ - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - if (NULL != job_cpuset) { - /* deal with any "soft" cgroup specification */ - mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } - - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: found %u %s objects on node %s", nobjs, - hwloc_obj_type_string(target), node->name); - - /* if this is a comm_spawn situation, start with the object - * where the parent left off and increment */ - if (!PMIX_NSPACE_INVALID(jdata->originator.nspace) && UINT_MAX != jdata->bkmark_obj) { - start = 
(jdata->bkmark_obj + 1) % nobjs; - } else { - start = 0; - } - /* loop over the procs on this node */ - for (j = 0; j < node->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:assign skipping proc %s - from another job", - PRTE_NAME_PRINT(&proc->name)); - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - - /* Search for resource which has at least enough members for - * request. Seach fails if we wrap back to our starting index - * without finding a satisfactory resource. */ - k = start; - do { - /* get the hwloc object */ - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, k); - if (NULL == obj) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_NOT_FOUND; - } - npus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, - available, obj); - if (npus >= cpus_per_rank) { - break; - } - k = (k + 1) % nobjs; - } while (k != start); - /* Fail if loop exits without finding an adequate resource */ - if (cpus_per_rank > npus) { - pmix_show_help("help-prte-rmaps-base.txt", "mapping-too-low", true, - cpus_per_rank, npus, - prte_rmaps_base_print_mapping(prte_rmaps_base.mapping)); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - prte_output_verbose(20, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assigning proc to object %d", k); - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, PMIX_POINTER); - /* Position at next sequential resource for next search */ - start = (k + 1) % nobjs; - /* track the bookmark */ - 
jdata->bkmark_obj = start; - } - hwloc_bitmap_free(available); - } - } - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_SUCCESS; -} diff --git a/src/mca/rmaps/round_robin/rmaps_rr_component.c b/src/mca/rmaps/round_robin/rmaps_rr_component.c index 64eda0420f..8ad72117b0 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr_component.c +++ b/src/mca/rmaps/round_robin/rmaps_rr_component.c @@ -45,11 +45,13 @@ static int my_priority; prte_rmaps_base_component_t prte_rmaps_round_robin_component = { .base_version = { - PRTE_RMAPS_BASE_VERSION_2_0_0, + PRTE_RMAPS_BASE_VERSION_4_0_0, .mca_component_name = "round_robin", - PRTE_MCA_BASE_MAKE_VERSION(component, PRTE_MAJOR_VERSION, PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), + PRTE_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), .mca_open_component = prte_rmaps_round_robin_open, .mca_close_component = prte_rmaps_round_robin_close, .mca_query_component = prte_rmaps_round_robin_query, diff --git a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c index 0f56338563..011ac8c74c 100644 --- a/src/mca/rmaps/round_robin/rmaps_rr_mappers.c +++ b/src/mca/rmaps/round_robin/rmaps_rr_mappers.c @@ -39,19 +39,19 @@ #include "src/mca/rmaps/base/base.h" #include "src/mca/rmaps/base/rmaps_private.h" -int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, - int32_t num_slots, pmix_rank_t num_procs) +int prte_rmaps_rr_byslot(prte_job_t *jdata, + prte_app_context_t *app, + pmix_list_t *node_list, + int32_t num_slots, + pmix_rank_t num_procs, + prte_rmaps_options_t *options) { - int i, nprocs_mapped; - prte_node_t *node; - int num_procs_to_assign, extra_procs_to_assign = 0, nxtra_nodes = 0; - hwloc_obj_t obj = NULL; + int i, rc, nprocs_mapped; + prte_node_t *node, *nd; + int extra_procs_to_assign = 0, nxtra_nodes = 0; float balance; - bool add_one = false; prte_proc_t *proc; - int orig_extra_procs; - bool 
made_progress = false; - bool orig_add_one; + bool second_pass = false; prte_output_verbose(2, prte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu", @@ -60,7 +60,7 @@ int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t /* check to see if we can map all the procs */ if (num_slots < (int) app->num_procs) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + if (!options->oversubscribe) { pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, app->num_procs, app->app, prte_process_info.nodename); PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); @@ -68,57 +68,79 @@ int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t } } - /* first pass: map the number of procs to each node until we - * map all specified procs or use all allocated slots - */ nprocs_mapped = 0; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) + +pass: + PMIX_LIST_FOREACH_SAFE(node, nd, node_list, prte_node_t) { prte_output_verbose(2, prte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot working node %s", node->name); - /* get the root object as we are not assigning - * locale here except at the node level - */ - if (NULL != node->topology && NULL != node->topology->topo) { - obj = hwloc_get_root_obj(node->topology->topo); - } - if (node->slots <= node->slots_inuse) { - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:slot node %s is full - skipping", node->name); - continue; - } - /* assign a number of procs equal to the number of available slots */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { - num_procs_to_assign = node->slots_available; + prte_rmaps_base_get_cpuset(jdata, node, options); + + /* compute the number of procs to go on this node */ + if (second_pass) { + options->nprocs = extra_procs_to_assign; + if (0 < nxtra_nodes) { + --nxtra_nodes; + if (0 == nxtra_nodes) 
{ + --extra_procs_to_assign; + } + } } else { - num_procs_to_assign = node->slots; + if (!options->donotlaunch) { + rc = prte_rmaps_base_check_support(jdata, node, options); + if (PRTE_SUCCESS != rc) { + return rc; + } + } + /* assign a number of procs equal to the number of available slots */ + if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + options->nprocs = node->slots_available; + } else { + options->nprocs = node->slots; + } + } + + if (!prte_rmaps_base_check_avail(jdata, app, node, node_list, NULL, options)) { + rc = PRTE_ERR_OUT_OF_RESOURCE; + continue; } prte_output_verbose(2, prte_rmaps_base_framework.framework_output, "mca:rmaps:rr:slot assigning %d procs to node %s", - (int) num_procs_to_assign, node->name); + (int) options->nprocs, node->name); - for (i = 0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { - /* add this node to the map - do it only once */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - ++(jdata->map->num_nodes); - } - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { + for (i = 0; i < options->nprocs && nprocs_mapped < app->num_procs; i++) { + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, NULL, options); + if (NULL == proc) { return PRTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, - PRTE_ATTR_LOCAL, obj, PMIX_POINTER); + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; + } + } + + if (nprocs_mapped == app->num_procs) { + /* calculate the ranks for this app */ + rc = prte_rmaps_base_compute_vpids(jdata, app, options); + return rc; } } - if (nprocs_mapped == app->num_procs) { - /* we are done */ - return PRTE_SUCCESS; + if (second_pass) { + /* unable to do it */ + if (PRTE_ERR_OUT_OF_RESOURCE == rc) { + pmix_show_help("help-prte-rmaps-base.txt", + 
"out-of-resource", true, + app->num_procs, app->app, + prte_rmaps_base_print_mapping(options->map), + prte_hwloc_base_print_binding(options->bind)); + return PRTE_ERR_SILENT; + } + return PRTE_ERR_FAILED_TO_MAP; } prte_output_verbose(2, prte_rmaps_base_framework.framework_output, @@ -126,8 +148,11 @@ int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t PRTE_JOBID_PRINT(jdata->nspace)); /* second pass: if we haven't mapped everyone yet, it is - * because we are oversubscribed. Figure out how many procs - * to add + * because we are oversubscribed. All of the nodes that are + * at max_slots have been removed from the list as that specifies + * a hard boundary, so the nodes remaining are available for + * handling the oversubscription. Figure out how many procs + * to add to each of them. */ balance = (float) ((int) app->num_procs - nprocs_mapped) / (float) pmix_list_get_size(node_list); @@ -140,445 +165,252 @@ int prte_rmaps_rr_byslot(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t * until all procs are mapped */ extra_procs_to_assign++; - /* flag that we added one */ - add_one = true; } + // Rescan the nodes + second_pass = true; + goto pass; +} - // Rescan the nodes due to a max_slots issue - rescan_nodes: +int prte_rmaps_rr_bynode(prte_job_t *jdata, + prte_app_context_t *app, + pmix_list_t *node_list, + int32_t num_slots, + pmix_rank_t num_procs, + prte_rmaps_options_t *options) +{ + int rc, j, nprocs_mapped, nnode; + prte_node_t *node, *nd; + float balance; + bool second_pass = false; + prte_proc_t *proc; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:slot working node %s", node->name); - /* get the root object as we are not assigning - * locale except at the node level - */ - if (NULL != node->topology && NULL != node->topology->topo) { - obj = hwloc_get_root_obj(node->topology->topo); + prte_output_verbose(2, 
prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: mapping by node for job %s app %d slots %d num_procs %lu", + PRTE_JOBID_PRINT(jdata->nspace), (int) app->idx, (int) num_slots, + (unsigned long) num_procs); + + /* quick check to see if we can map all the procs */ + if (num_slots < (int) app->num_procs) { + if (!options->oversubscribe) { + pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, + app->num_procs, app->app, prte_process_info.nodename); + PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); + return PRTE_ERR_SILENT; } + } - if (0 != node->slots_max && node->slots_max <= node->slots_inuse) { - /* cannot use this node - already at max_slots */ + nprocs_mapped = 0; + +pass: + /* divide the procs evenly across all nodes - this is the + * average we have to maintain as we go, but we adjust + * the number on each node to reflect its available slots. + * Obviously, if all nodes have the same number of slots, + * then the avg is what we get on each node - this is + * the most common situation. 
+ */ + options->nprocs = (app->num_procs - nprocs_mapped) / pmix_list_get_size(node_list); + if (0 == options->nprocs) { + /* if there are less procs than nodes, we have to + * place at least one/node + */ + options->nprocs = 1; + } + + PMIX_LIST_FOREACH_SAFE(node, nd, node_list, prte_node_t) + { + prte_rmaps_base_get_cpuset(jdata, node, options); + + if (!prte_rmaps_base_check_avail(jdata, app, node, node_list, NULL, options)) { + rc = PRTE_ERR_OUT_OF_RESOURCE; continue; } - /* Save original values in case we need to reset them due to max_slots */ - orig_extra_procs = extra_procs_to_assign; - orig_add_one = add_one; - if (add_one) { - if (0 == nxtra_nodes) { - --extra_procs_to_assign; - add_one = false; - } else { - --nxtra_nodes; - } - } - if (node->slots <= node->slots_inuse) { - /* nodes are already oversubscribed */ - num_procs_to_assign = extra_procs_to_assign; - } else { - /* nodes have some room */ - num_procs_to_assign = node->slots - node->slots_inuse + extra_procs_to_assign; - } + /* if oversubscribe is specified, then just ignore the + * number of slots on each node and assign this number. 
+ * Note that oversubscribe automatically dictates that + * we do not bind, so binding can also be ignored */ - if (0 != node->slots_max) { - if (node->slots_max < (node->slots_inuse + num_procs_to_assign)) { - num_procs_to_assign = node->slots_max - node->slots_inuse; - if (0 >= num_procs_to_assign) { - /* Undo the adjustments to these variables from above */ - extra_procs_to_assign = orig_extra_procs; - if (orig_add_one) { - if (0 == nxtra_nodes) { - ++extra_procs_to_assign; - add_one = true; - } else { - ++nxtra_nodes; - } - } - continue; - } + if (!options->oversubscribe) { + /* since oversubscribe is not allowed , cap our usage + * at the number of available slots */ + if (node->slots_available < options->nprocs) { + options->nprocs = node->slots_available; } } - /* add this node to the map - do it only once */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - ++(jdata->map->num_nodes); - } + PRTE_OUTPUT_VERBOSE((10, prte_rmaps_base_framework.framework_output, + "%s NODE %s ASSIGNING %d PROCS", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + node->name, options->nprocs)); - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:slot adding up to %d procs to node %s", - num_procs_to_assign, node->name); - for (i = 0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { + for (j=0; j < options->nprocs && nprocs_mapped < app->num_procs; j++) { + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, NULL, options); + if (NULL == proc) { return PRTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, - PMIX_POINTER); - } - /* We made progress mapping at least 1 process in this loop */ - made_progress = true; - /* not all nodes are equal, so only set 
oversubscribed for - * this node if it is in that state - */ - if (node->slots < (int) node->num_procs) { - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, - app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; - } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, - app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; - } + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; } } - /* if we have mapped everything, then we are done */ if (nprocs_mapped == app->num_procs) { - break; + /* calculate the ranks for this app */ + rc = prte_rmaps_base_compute_vpids(jdata, app, options); + return rc; } } - /* If we went through the loop and did not find a place for any one process - * then all of the nodes are full. 
- */ - if (!made_progress) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; - } - if (nprocs_mapped != app->num_procs) { - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr:slot Re-scan all nodes. Mapped %d, Target %d (%c)", - nprocs_mapped, app->num_procs, - made_progress ? 'T' : 'F'); - made_progress = false; - goto rescan_nodes; + if (second_pass) { + /* unable to do it */ + if (PRTE_ERR_OUT_OF_RESOURCE == rc) { + pmix_show_help("help-prte-rmaps-base.txt", + "out-of-resource", true, + app->num_procs, app->app, + prte_rmaps_base_print_mapping(options->map), + prte_hwloc_base_print_binding(options->bind)); + return PRTE_ERR_SILENT; + } + return PRTE_ERR_FAILED_TO_MAP; } + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:node job %s is oversubscribed - performing second pass", + PRTE_JOBID_PRINT(jdata->nspace)); - return PRTE_SUCCESS; + /* second pass: if we haven't mapped everyone yet, it is + * because we are oversubscribed. All of the nodes that are + * at max_slots have been removed from the list as that specifies + * a hard boundary, so the nodes remaining are available for + * handling the oversubscription. 
+ */ + second_pass = true; + goto pass; } -int prte_rmaps_rr_bynode(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, - int32_t num_slots, pmix_rank_t num_procs) +/* mapping by cpu */ +int prte_rmaps_rr_bycpu(prte_job_t *jdata, prte_app_context_t *app, + pmix_list_t *node_list, int32_t num_slots, + pmix_rank_t num_procs, prte_rmaps_options_t *options) { - int j, nprocs_mapped, nnodes; - prte_node_t *node; - int num_procs_to_assign, navg; - int extra_procs_to_assign = 0, nxtra_nodes = 0; - hwloc_obj_t obj = NULL; - float balance; - bool add_one = false; - bool oversubscribed = false; + int i, rc, nprocs_mapped; + prte_node_t *node, *nd; prte_proc_t *proc; - int orig_extra_procs; - bool made_progress = false; - bool orig_add_one; + char **tmp; + int ntomap; prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping by node for job %s app %d slots %d num_procs %lu", - PRTE_JOBID_PRINT(jdata->nspace), (int) app->idx, (int) num_slots, - (unsigned long) num_procs); + "mca:rmaps:rr: mapping by cpu for job %s slots %d num_procs %lu", + PRTE_JOBID_PRINT(jdata->nspace), (int) num_slots, + (unsigned long)app->num_procs); - /* quick check to see if we can map all the procs */ + /* check to see if we can map all the procs */ if (num_slots < (int) app->num_procs) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { + if (!options->oversubscribe) { pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, app->num_procs, app->app, prte_process_info.nodename); PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); return PRTE_ERR_SILENT; } - oversubscribed = true; } - nnodes = pmix_list_get_size(node_list); nprocs_mapped = 0; + tmp = pmix_argv_split(options->cpuset, ','); + ntomap = pmix_argv_count(tmp); + pmix_argv_free(tmp); - do { - /* divide the procs evenly across all nodes - this is the - * average we have to maintain as we go, but we adjust - * the number on each 
node to reflect its available slots. - * Obviously, if all nodes have the same number of slots, - * then the avg is what we get on each node - this is - * the most common situation. - */ - navg = ((int) app->num_procs - nprocs_mapped) / nnodes; - if (0 == navg) { - /* if there are less procs than nodes, we have to - * place at least one/node - */ - navg = 1; - } - - /* compute how many extra procs to put on each node */ - balance = (float) (((int) app->num_procs - nprocs_mapped) - (navg * nnodes)) - / (float) nnodes; - extra_procs_to_assign = (int) balance; - nxtra_nodes = 0; - add_one = false; - if (0 < (balance - (float) extra_procs_to_assign)) { - /* compute how many nodes need an extra proc */ - nxtra_nodes = ((int) app->num_procs - nprocs_mapped) - - ((navg + extra_procs_to_assign) * nnodes); - /* add one so that we add an extra proc to the first nodes - * until all procs are mapped - */ - extra_procs_to_assign++; - /* flag that we added one */ - add_one = true; - } - + PMIX_LIST_FOREACH_SAFE(node, nd, node_list, prte_node_t) + { prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping by node navg %d extra_procs %d extra_nodes %d", - navg, extra_procs_to_assign, nxtra_nodes); + "mca:rmaps:rr:cpu working node %s", node->name); - nnodes = 0; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - /* get the root object as we are not assigning - * locale except at the node level - */ - if (NULL != node->topology && NULL != node->topology->topo) { - obj = hwloc_get_root_obj(node->topology->topo); - } + prte_rmaps_base_get_cpuset(jdata, node, options); - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - 0 != node->slots_max && - node->slots_max <= node->slots_inuse) { - /* cannot use this node - already at max_slots */ - continue; - } - - if (oversubscribed) { - /* Save original values in case we need to reset them due to max_slots */ - orig_extra_procs = extra_procs_to_assign; - orig_add_one = add_one; - /* compute the number of 
procs to go on this node */ - if (add_one) { - if (0 == nxtra_nodes) { - --extra_procs_to_assign; - add_one = false; - } else { - --nxtra_nodes; - } - } - /* everybody just takes their share */ - num_procs_to_assign = navg + extra_procs_to_assign; - - if (0 != node->slots_max) { - if (node->slots_max < (node->slots_inuse + num_procs_to_assign)) { - num_procs_to_assign = node->slots_max - node->slots_inuse; - if (0 >= num_procs_to_assign) { - /* Undo the adjustments to these variables from above */ - extra_procs_to_assign = orig_extra_procs; - if (orig_add_one) { - if (0 == nxtra_nodes) { - ++extra_procs_to_assign; - add_one = true; - } else { - ++nxtra_nodes; - } - } - continue; - } - } - } - } else if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots <= node->slots_inuse) { - /* since we are not oversubcribed, ignore this node */ - continue; + if (options->ordered || !options->overload) { + options->nprocs = ntomap; + } else { + /* assign a number of procs equal to the number of available slots */ + if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + options->nprocs = node->slots_available; } else { - /* if we are not oversubscribed, then there are enough - * slots to handle all the procs. 
However, not every - * node will have the same number of slots, so we - * have to track how many procs to "shift" elsewhere - * to make up the difference - */ - - /* compute the number of procs to go on this node */ - if (add_one) { - if (0 == nxtra_nodes) { - --extra_procs_to_assign; - add_one = false; - } else { - --nxtra_nodes; - } - } - /* if slots < avg + extra (adjusted for cpus/proc), then try to take all */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots_available < (navg + extra_procs_to_assign)) { - num_procs_to_assign = node->slots_available; - /* if we can't take any proc, skip following steps */ - if (num_procs_to_assign == 0) { - continue; - } - } else { - /* take the avg + extra */ - num_procs_to_assign = navg + extra_procs_to_assign; - } - PRTE_OUTPUT_VERBOSE((20, prte_rmaps_base_framework.framework_output, - "%s NODE %s AVG %d ASSIGN %d EXTRA %d", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), node->name, navg, - num_procs_to_assign, extra_procs_to_assign)); - } - /* add this node to the map, but only do so once */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - ++(jdata->map->num_nodes); - } - nnodes++; // track how many nodes remain available - PRTE_OUTPUT_VERBOSE((20, prte_rmaps_base_framework.framework_output, - "%s NODE %s ASSIGNING %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - node->name, num_procs_to_assign)); - for (j = 0; j < num_procs_to_assign && nprocs_mapped < app->num_procs; j++) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { - return PRTE_ERR_OUT_OF_RESOURCE; - } - nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, - PMIX_POINTER); - } - /* not all nodes are equal, so only set oversubscribed for - * this node if it is in that state - */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots < (int) node->num_procs) 
{ - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; - } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; - } - } - } - if (nprocs_mapped == app->num_procs) { - /* we are done */ - break; + options->nprocs = node->slots; } } - } while (nprocs_mapped < app->num_procs && 0 < nnodes); - /* now fillin as required until fully mapped */ - while (nprocs_mapped < app->num_procs) { - made_progress = false; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - /* get the root object as we are not assigning - * locale except at the node level - */ - if (NULL != node->topology && NULL != node->topology->topo) { - obj = hwloc_get_root_obj(node->topology->topo); - } + if (!prte_rmaps_base_check_avail(jdata, app, node, node_list, NULL, options)) { + rc = PRTE_ERR_OUT_OF_RESOURCE; + continue; + } - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - 0 != node->slots_max && - node->slots_max <= node->slots_inuse) { - /* cannot use this node - already at 
max_slots */ - continue; - } + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:cpu assigning %d procs to node %s", + (int) options->nprocs, node->name); - PRTE_OUTPUT_VERBOSE((20, prte_rmaps_base_framework.framework_output, - "%s ADDING PROC TO NODE %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - node->name)); - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { + for (i = 0; i < options->nprocs && nprocs_mapped < app->num_procs; i++) { + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, NULL, options); + if (NULL == proc) { return PRTE_ERR_OUT_OF_RESOURCE; } - made_progress = true; nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, - PMIX_POINTER); - /* not all nodes are equal, so only set oversubscribed for - * this node if it is in that state - */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots < (int) node->num_procs) { - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - } - if (nprocs_mapped == app->num_procs) { - /* we are done */ - break; + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; } } - if (!made_progress) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; + if (nprocs_mapped == app->num_procs) { + /* calculate the ranks for this app */ + rc = prte_rmaps_base_compute_vpids(jdata, app, options); + return rc; } } - return PRTE_SUCCESS; + /* if we get here, then we were unable to map all the procs */ + if (PRTE_ERR_OUT_OF_RESOURCE == rc) { + pmix_show_help("help-prte-rmaps-base.txt", + "out-of-resource", true, + app->num_procs, app->app, + 
prte_rmaps_base_print_mapping(options->map), + prte_hwloc_base_print_binding(options->bind)); + return PRTE_ERR_SILENT; + } + return PRTE_ERR_FAILED_TO_MAP; } -static int byobj_span(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, - int32_t num_slots, pmix_rank_t num_procs, hwloc_obj_type_t target, - unsigned cache_level); - /* mapping by hwloc object looks a lot like mapping by node, * but has the added complication of possibly having different * numbers of objects on each node */ -int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, - int32_t num_slots, pmix_rank_t num_procs, hwloc_obj_type_t target, - unsigned cache_level) +int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, + pmix_list_t *node_list, int32_t num_slots, + pmix_rank_t num_procs, + prte_rmaps_options_t *options) { - int i, nmapped, nprocs_mapped; - prte_node_t *node; - int nprocs, start, cpus_per_rank, npus; - hwloc_obj_t obj = NULL, root; - unsigned int nobjs; - bool add_one; - bool second_pass, use_hwthread_cpus; + int i, rc, nprocs_mapped, nprocs; + prte_node_t *node, *nd; + int extra_procs_to_assign = 0, nxtra_nodes = 0; + int navg, nxtra_objs = 0; + float balance; prte_proc_t *proc; - uint16_t u16, *u16ptr = &u16; - char *job_cpuset; - prte_hwloc_topo_data_t *rdata; - hwloc_cpuset_t available, mycpus; - bool found_obj; + bool second_pass = false; + bool span = false; + hwloc_obj_t obj = NULL; + unsigned j, total_nobjs, nobjs; + + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: mapping by %s for job %s slots %d num_procs %lu", + hwloc_obj_type_string(options->maptype), + PRTE_JOBID_PRINT(jdata->nspace), + (int) num_slots, (unsigned long) num_procs); + + /* quick check to see if we can map all the procs */ + if (num_slots < app->num_procs) { + if (!options->oversubscribe) { + pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, + app->num_procs, app->app, 
prte_process_info.nodename); + PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); + return PRTE_ERR_SILENT; + } + } /* there are two modes for mapping by object: span and not-span. The * span mode essentially operates as if there was just a single @@ -594,42 +426,36 @@ int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t * to the next node. Thus, procs tend to be "front loaded" onto the * list of nodes, as opposed to being "load balanced" in the span mode */ - if (PRTE_MAPPING_SPAN & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - return byobj_span(jdata, app, node_list, num_slots, num_procs, target, cache_level); - } - - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu", - hwloc_obj_type_string(target), PRTE_JOBID_PRINT(jdata->nspace), - (int) num_slots, (unsigned long) num_procs); - - /* quick check to see if we can map all the procs */ - if (num_slots < app->num_procs) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, - app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; + if (options->mapspan) { + /* we know we have enough slots, or that oversubscrption is allowed, so + * next determine how many total objects we have to work with + */ + total_nobjs = 0; + PMIX_LIST_FOREACH(node, node_list, prte_node_t) + { + /* get the number of objects of this type on this node */ + total_nobjs += prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); } - } - /* see if this job has a "soft" cgroup assignment */ - job_cpuset = NULL; - prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING); - - /* see if they want multiple cpus/rank */ - if 
(prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, - PMIX_UINT16)) { - cpus_per_rank = u16; - } else { - cpus_per_rank = 1; - } + if (0 == total_nobjs) { + return PRTE_ERR_NOT_FOUND; + } + /* divide the procs evenly across all objects */ + navg = app->num_procs / total_nobjs; + if (0 == navg) { + /* if there are less procs than objects, we have to + * place at least one/obj + */ + navg = 1; + } - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; + /* compute how many objs need an extra proc */ + nxtra_objs = app->num_procs - (navg * total_nobjs); + if (0 > nxtra_objs) { + nxtra_objs = 0; + } + span = true; } /* we know we have enough slots, or that oversubscrption is allowed, so @@ -638,402 +464,179 @@ int prte_rmaps_rr_byobj(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t * then loop thru the list again to handle the oversubscription */ nprocs_mapped = 0; - second_pass = false; - do { - add_one = false; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - if (NULL == node->topology || NULL == node->topology->topo) { - pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); - return PRTE_ERR_SILENT; - } - start = 0; - /* get the number of objects of this type on this node */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - if (0 == nobjs) { - continue; - } - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: found %u %s objects on node %s", nobjs, - hwloc_obj_type_string(target), node->name); - /* if this is a comm_spawn situation, start with the object - * where the parent left off and increment */ - if (!PMIX_NSPACE_INVALID(jdata->originator.nspace) && UINT_MAX != jdata->bkmark_obj) { - start = (jdata->bkmark_obj + 1) % nobjs; +pass: + options->total_nobjs = 0; + 
PMIX_LIST_FOREACH(node, node_list, prte_node_t) + { + + prte_rmaps_base_get_cpuset(jdata, node, options); + options->nobjs = 0; + /* have to delay checking for availability until we have the object */ + + /* get the number of objects of this type on this node */ + nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, + options->maptype, options->cmaplvl); + if (0 == nobjs) { + continue; + } + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: found %u %s objects on node %s", + nobjs, hwloc_obj_type_string(options->maptype), + node->name); + + /* compute the number of procs to go on this node */ + if (second_pass) { + nprocs = extra_procs_to_assign; + if (0 < nxtra_nodes) { + --nxtra_nodes; + if (0 == nxtra_nodes) { + --extra_procs_to_assign; + } } - /* compute the number of procs to go on this node */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { - nprocs = node->slots_available; - } else { - nprocs = nobjs; + } else { + if (!options->donotlaunch) { + rc = prte_rmaps_base_check_support(jdata, node, options); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); + return rc; + } } - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: calculated nprocs %d", nprocs); - if (nprocs < 1) { - if (second_pass) { - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - 0 != node->slots_max && - node->slots_max <= node->slots_inuse) { - /* cannot use this node - already at max_slots */ - continue; - } - /* already checked for oversubscription permission, so at least put - * one proc on it - */ - nprocs = 1; - /* offset our starting object position to avoid always - * hitting the first one - */ - start = node->num_procs % nobjs; + if (span) { + if (navg <= node->slots_available) { + nprocs = navg; } else { - continue; + nprocs = node->slots_available; } - } else if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - 0 != node->slots_max && - (node->slots_inuse + nprocs) > node->slots_max) { - nprocs = 
node->slots_max - node->slots_inuse; - if (0 >= nprocs) { - /* cannot use this node */ - continue; + if (0 < nxtra_objs) { + nprocs++; + nxtra_objs--; + } + } else { + /* assign a number of procs equal to the number of available slots */ + if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { + nprocs = node->slots_available; + } else { + nprocs = node->slots; } } - /* get the available processors on this node */ - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - if (NULL != job_cpuset) { - /* deal with any "soft" cgroup specification */ - mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } - - /* add this node to the map, if reqd */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - ++(jdata->map->num_nodes); - } - nmapped = 0; - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assigning nprocs %d", nprocs); - - do { - found_obj = false; - /* loop through the number of objects */ - for (i = 0; - i < (int) nobjs && nmapped < nprocs && nprocs_mapped < (int) app->num_procs; - i++) { + } + prte_output_verbose(2, prte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning nprocs %d", nprocs); + + if (span) { + /* if we are mapping spanned, then we loop over + * procs as the outer loop and loop over objects + * as the inner loop so we balance procs across + * all the objects on the node */ + for (i=0; i < nprocs && nprocs_mapped < app->num_procs; i++) { + for (j=0; j < nobjs && nprocs_mapped < app->num_procs; j++) { prte_output_verbose(10, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: assigning proc to object %d", - (i + start) % nobjs); + "mca:rmaps:rr: assigning proc to object %d", j); /* get the hwloc object */ - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, (i + start) % nobjs); + obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, + options->maptype, options->cmaplvl, j); if (NULL == obj) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_NOT_FOUND; + /* out of objects on this node */ + break; } - npus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, - available, obj); - if (0 == npus) { + options->nprocs = nprocs; + if (!prte_rmaps_base_check_avail(jdata, app, node, node_list, obj, options)) { + rc = PRTE_ERR_OUT_OF_RESOURCE; continue; } - found_obj = true; - if (cpus_per_rank > npus) { - pmix_show_help("help-prte-rmaps-base.txt", "mapping-too-low", true, - cpus_per_rank, npus, - prte_rmaps_base_print_mapping(prte_rmaps_base.mapping)); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, obj, options); + if (NULL == proc) { return PRTE_ERR_OUT_OF_RESOURCE; } + /* setup_proc removes any node at max_slots */ + if (0 == i) { + options->total_nobjs++; + } + options->nobjs++; nprocs_mapped++; - nmapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, - obj, PMIX_POINTER); - /* track the bookmark */ - jdata->bkmark_obj = (i + start) % nobjs; - } - } while (found_obj && nmapped < nprocs && nprocs_mapped < (int) app->num_procs); - if (!found_obj) { - char *err; - hwloc_bitmap_list_asprintf(&err, available); - 
pmix_show_help("help-prte-rmaps-base.txt", "insufficient-cpus", true, - prte_rmaps_base_print_mapping(prte_rmaps_base.mapping), - (NULL == prte_hwloc_default_cpu_list) ? "N/A" - : prte_hwloc_default_cpu_list, - (NULL == job_cpuset) ? "N/A" : job_cpuset, err); - hwloc_bitmap_free(available); - free(err); - if (NULL != job_cpuset) { - free(job_cpuset); + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; + } } - return PRTE_ERR_SILENT; } - hwloc_bitmap_free(available); - add_one = true; - /* not all nodes are equal, so only set oversubscribed for - * this node if it is in that state - */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots < (int) node->num_procs) { - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE - & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; + } else 
{ + /* if we are not mapping spanned, then we loop over + * objects as the outer loop and loop over procs + * as the inner loop so that procs fill a given + * object before moving to the next one on the node */ + for (j=0; j < nobjs && nprocs_mapped < app->num_procs; j++) { + /* get the hwloc object */ + obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, + options->maptype, options->cmaplvl, j); + if (NULL == obj) { + /* out of objects on this node */ + break; + } + options->nprocs = nprocs; + if (!prte_rmaps_base_check_avail(jdata, app, node, node_list, obj, options)) { + rc = PRTE_ERR_OUT_OF_RESOURCE; + continue; + } + options->total_nobjs++; + options->nobjs++; + for (i=0; i < options->nprocs && nprocs_mapped < app->num_procs; i++) { + proc = prte_rmaps_base_setup_proc(jdata, app->idx, node, obj, options); + if (NULL == proc) { + return PRTE_ERR_OUT_OF_RESOURCE; + } + /* setup_proc removes any node at max_slots */ + nprocs_mapped++; + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; } } } - if (nprocs_mapped == app->num_procs) { - /* we are done */ - break; - } } - second_pass = true; - } while (add_one && nprocs_mapped < app->num_procs); - - if (NULL != job_cpuset) { - free(job_cpuset); - } - - if (nprocs_mapped < app->num_procs) { - /* usually means there were no objects of the requested type */ - return PRTE_ERR_NOT_FOUND; - } - - return PRTE_SUCCESS; -} - -static int byobj_span(prte_job_t *jdata, prte_app_context_t *app, pmix_list_t *node_list, - int32_t num_slots, pmix_rank_t num_procs, hwloc_obj_type_t target, - unsigned cache_level) -{ - int i, j, nprocs_mapped, navg; - prte_node_t *node; - int nprocs, nxtra_objs, npus, cpus_per_rank; - hwloc_obj_t obj = NULL, root; - unsigned int nobjs; - prte_proc_t *proc; - uint16_t u16, *u16ptr = &u16; - char *job_cpuset; - prte_hwloc_topo_data_t *rdata; - hwloc_cpuset_t available, mycpus; - bool use_hwthread_cpus; - - prte_output_verbose(2, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu", - hwloc_obj_type_string(target), PRTE_JOBID_PRINT(jdata->nspace), - (int) num_slots, (unsigned long) num_procs); - - /* quick check to see if we can map all the procs */ - if (num_slots < (int) app->num_procs) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", true, - app->num_procs, app->app, prte_process_info.nodename); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - return PRTE_ERR_SILENT; + if (nprocs_mapped == app->num_procs) { + /* calculate the ranks for this app */ + rc = prte_rmaps_base_compute_vpids(jdata, app, options); + return rc; } } - /* we know we have enough slots, or that oversubscrption is allowed, so - * next determine how many total objects we have to work with - */ - nobjs = 0; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - if (NULL == node->topology || NULL == node->topology->topo) { - pmix_show_help("help-prte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); + if (second_pass) { + /* unable to do it */ + if (PRTE_ERR_OUT_OF_RESOURCE == rc) { + pmix_show_help("help-prte-rmaps-base.txt", + "out-of-resource", true, + app->num_procs, app->app, + prte_rmaps_base_print_mapping(options->map), + prte_hwloc_base_print_binding(options->bind)); return PRTE_ERR_SILENT; } - /* get the number of objects of this type on this node */ - nobjs += prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - } - - if (0 == nobjs) { - return PRTE_ERR_NOT_FOUND; - } - - /* see if this job has a "soft" cgroup assignment */ - job_cpuset = NULL; - prte_get_attribute(&jdata->attributes, PRTE_JOB_CPUSET, (void **) &job_cpuset, PMIX_STRING); - - /* see if they want multiple cpus/rank */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, (void **) &u16ptr, - PMIX_UINT16)) { - 
cpus_per_rank = u16; - } else { - cpus_per_rank = 1; - } - - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; - } - - /* divide the procs evenly across all objects */ - navg = app->num_procs / nobjs; - if (0 == navg) { - /* if there are less procs than objects, we have to - * place at least one/obj - */ - navg = 1; + return PRTE_ERR_FAILED_TO_MAP; } - /* compute how many objs need an extra proc */ - if (0 > (nxtra_objs = app->num_procs - (navg * nobjs))) { - nxtra_objs = 0; - } - - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - "mca:rmaps:rr: mapping by %s navg %d extra_objs %d", - hwloc_obj_type_string(target), navg, nxtra_objs); - - nprocs_mapped = 0; - PMIX_LIST_FOREACH(node, node_list, prte_node_t) - { - /* add this node to the map, if reqd */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - ++(jdata->map->num_nodes); - } - /* get the available processors on this node */ - root = hwloc_get_root_obj(node->topology->topo); - if (NULL == root->userdata) { - /* incorrect */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_BAD_PARAM; - } - rdata = (prte_hwloc_topo_data_t *) root->userdata; - available = hwloc_bitmap_dup(rdata->available); - if (NULL != job_cpuset) { - /* deal with any "soft" cgroup specification */ - mycpus = prte_hwloc_base_generate_cpuset(node->topology->topo, use_hwthread_cpus, - job_cpuset); - hwloc_bitmap_and(available, mycpus, available); - hwloc_bitmap_free(mycpus); - } - /* get the number of objects of this type on this node */ - nobjs = prte_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level); - prte_output_verbose(2, prte_rmaps_base_framework.framework_output, - 
"mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name); - /* loop through the number of objects */ - for (i = 0; i < (int) nobjs && nprocs_mapped < (int) app->num_procs; i++) { - /* get the hwloc object */ - obj = prte_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, i); - if (NULL == obj) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_NOT_FOUND; - } - npus = prte_hwloc_base_get_npus(node->topology->topo, use_hwthread_cpus, available, obj); - if (cpus_per_rank > npus) { - pmix_show_help("help-prte-rmaps-base.txt", "mapping-too-low", true, cpus_per_rank, - npus, prte_rmaps_base_print_mapping(prte_rmaps_base.mapping)); - hwloc_bitmap_free(available); - if (NULL != job_cpuset) { - free(job_cpuset); - } - return PRTE_ERR_SILENT; - } - /* determine how many to map */ - if (navg <= node->slots_available) { - nprocs = navg; - } else { - nprocs = node->slots_available; - } - if (0 < nxtra_objs) { - nprocs++; - nxtra_objs--; - } - /* map the reqd number of procs */ - for (j = 0; j < nprocs && nprocs_mapped < app->num_procs; j++) { - if (NULL == (proc = prte_rmaps_base_setup_proc(jdata, node, app->idx))) { - return PRTE_ERR_OUT_OF_RESOURCE; - } - nprocs_mapped++; - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, obj, - PMIX_POINTER); - } - /* keep track of the node we last used */ - jdata->bookmark = node; - } - /* not all nodes are equal, so only set oversubscribed for - * this node if it is in that state + /* second pass: if we haven't mapped everyone yet, it is + * because we are oversubscribed. All of the nodes that are + * at max_slots have been removed from the list as that specifies + * a hard boundary, so the nodes remaining are available for + * handling the oversubscription. Figure out how many procs + * to add to each of them. 
+ */ + balance = (float) ((int) app->num_procs - nprocs_mapped) + / (float) total_nobjs; + extra_procs_to_assign = (int) balance; + if (0 < (balance - (float) extra_procs_to_assign)) { + /* compute how many nodes need an extra proc */ + nxtra_nodes = app->num_procs - nprocs_mapped + - (extra_procs_to_assign * total_nobjs); + /* add one so that we add an extra proc to the first nodes + * until all procs are mapped */ - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL) && - node->slots < (int) node->num_procs) { - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - } - if (nprocs_mapped == app->num_procs) { - /* we are done */ - break; - } - hwloc_bitmap_free(available); - } - if (NULL != job_cpuset) { - free(job_cpuset); + extra_procs_to_assign++; } + // Rescan the nodes + second_pass = true; + goto pass; return PRTE_SUCCESS; } diff --git a/src/mca/rmaps/seq/help-prte-rmaps-seq.txt b/src/mca/rmaps/seq/help-prte-rmaps-seq.txt index 97b89dc150..001eda7f90 100644 --- a/src/mca/rmaps/seq/help-prte-rmaps-seq.txt +++ b/src/mca/rmaps/seq/help-prte-rmaps-seq.txt @@ -68,3 +68,11 @@ A 1:0,1 # put this rank on node A, bound to socket 1, CPUs 0 and 1 B 1-4 # put this rank on node B, bound to CPUs 1-4 Please correct the syntax and try again. +# +[proc-failed-to-map] +A process could not be mapped: + + Hostname: %s + App: %s + +Could not continue. 
diff --git a/src/mca/rmaps/seq/rmaps_seq.c b/src/mca/rmaps/seq/rmaps_seq.c index 97c04dd6c9..60a53da536 100644 --- a/src/mca/rmaps/seq/rmaps_seq.c +++ b/src/mca/rmaps/seq/rmaps_seq.c @@ -52,10 +52,13 @@ #include "src/mca/rmaps/base/base.h" #include "src/mca/rmaps/base/rmaps_private.h" -static int prte_rmaps_seq_map(prte_job_t *jdata); +static int prte_rmaps_seq_map(prte_job_t *jdata, + prte_rmaps_options_t *options); /* define the module */ -prte_rmaps_base_module_t prte_rmaps_seq_module = {.map_job = prte_rmaps_seq_map}; +prte_rmaps_base_module_t prte_rmaps_seq_module = { + .map_job = prte_rmaps_seq_map +}; /* local object for tracking rank locations */ typedef struct { @@ -84,20 +87,43 @@ PMIX_CLASS_INSTANCE(seq_node_t, pmix_list_item_t, sn_con, sn_des); static char *prte_getline(FILE *fp); static int process_file(char *path, pmix_list_t *list); +static bool quickmatch(prte_node_t *nd, char *name) +{ + int n; + + if (0 == strcmp(nd->name, name)) { + return true; + } + if (0 == strcmp(nd->name, prte_process_info.nodename) && + (0 == strcmp(name, "localhost") || + 0 == strcmp(name, "127.0.0.1"))) { + return true; + } + if (NULL != nd->aliases) { + for (n=0; NULL != nd->aliases[n]; n++) { + if (0 == strcmp(nd->aliases[n], name)) { + return true; + } + } + } + return false; +} + /* * Sequentially map the ranks according to the placement in the * specified hostfile */ -static int prte_rmaps_seq_map(prte_job_t *jdata) +static int prte_rmaps_seq_map(prte_job_t *jdata, + prte_rmaps_options_t *options) { prte_job_map_t *map; prte_app_context_t *app; int i, n; int32_t j; pmix_list_item_t *item; - prte_node_t *node, *nd; + prte_node_t *node, *nd, *nsave = NULL; seq_node_t *sq, *save = NULL, *seq; - pmix_rank_t vpid; + pmix_rank_t vpid, apprank; int32_t num_nodes; int rc; pmix_list_t default_seq_list; @@ -105,7 +131,7 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) prte_proc_t *proc; prte_mca_base_component_t *c = &prte_rmaps_seq_component.base_version; char *hosts = 
NULL; - bool use_hwthread_cpus, match; + bool match; PRTE_OUTPUT_VERBOSE((1, prte_rmaps_base_framework.framework_output, "%s rmaps:seq called on job %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), @@ -129,8 +155,6 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) PRTE_JOBID_PRINT(jdata->nspace)); return PRTE_ERR_TAKE_NEXT_OPTION; } - /* we need to process it */ - goto process; } if (PRTE_MAPPING_SEQ != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* I don't know how to do these - defer */ @@ -140,9 +164,9 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) return PRTE_ERR_TAKE_NEXT_OPTION; } -process: prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps:seq: mapping job %s", PRTE_JOBID_PRINT(jdata->nspace)); + "mca:rmaps:seq: mapping job %s", + PRTE_JOBID_PRINT(jdata->nspace)); /* flag that I did the mapping */ if (NULL != jdata->map->last_mapper) { @@ -163,14 +187,6 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) } } - /* check for type of cpu being used */ - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL) - && PRTE_BIND_TO_HWTHREAD == PRTE_GET_BINDING_POLICY(jdata->map->binding)) { - use_hwthread_cpus = true; - } else { - use_hwthread_cpus = false; - } - /* start at the beginning... 
*/ vpid = 0; jdata->num_procs = 0; @@ -178,18 +194,13 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) save = (seq_node_t *) pmix_list_get_first(&default_seq_list); } - /* initialize all the nodes as not included in this job map */ - for (j = 0; j < prte_node_pool->size; j++) { - if (NULL != (node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, j))) { - PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); - } - } - /* cycle through the app_contexts, mapping them sequentially */ for (i = 0; i < jdata->apps->size; i++) { - if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { + app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i); + if (NULL == app) { continue; } + apprank = 0; /* specified seq file trumps all */ if (prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, (void **) &hosts, PMIX_STRING)) { @@ -311,7 +322,7 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) if (NULL == node) { continue; } - if (0 == strcmp(node->name, sq->hostname)) { + if (quickmatch(node, sq->hostname)) { match = true; break; } @@ -323,134 +334,38 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) rc = PRTE_ERR_SILENT; goto error; } - /* ensure the node is in the map */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PMIX_RETAIN(node); - pmix_pointer_array_add(map->nodes, node); - jdata->map->num_nodes++; - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); + /* check availability */ + prte_rmaps_base_get_cpuset(jdata, node, options); + if (!prte_rmaps_base_check_avail(jdata, app, node, seq_list, NULL, options)) { + continue; } - proc = prte_rmaps_base_setup_proc(jdata, node, i); - if (!PRTE_FLAG_TEST(app, PRTE_APP_FLAG_TOOL)) { - /* check if we are oversubscribed */ - if ((node->slots < (int) node->num_procs) || - (0 < node->slots_max && node->slots_max < (int) node->num_procs)) { - if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - pmix_show_help("help-prte-rmaps-base.txt", 
"prte-rmaps-base:alloc-error", true, - node->num_procs, app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } - /* flag the node as oversubscribed so that sched-yield gets - * properly set - */ - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_OVERSUBSCRIBED); - PRTE_FLAG_SET(jdata, PRTE_JOB_FLAG_OVERSUBSCRIBED); - /* check for permission */ - if (PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - /* if we weren't given a directive either way, then we will error out - * as the #slots were specifically given, either by the host RM or - * via hostfile/dash-host */ - if (!(PRTE_MAPPING_SUBSCRIBE_GIVEN & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping))) { - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } else if (PRTE_MAPPING_NO_OVERSUBSCRIBE & PRTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { - /* if we were explicitly told not to oversubscribe, then don't */ - pmix_show_help("help-prte-rmaps-base.txt", "prte-rmaps-base:alloc-error", - true, app->num_procs, app->app); - PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE); - rc = PRTE_ERR_SILENT; - goto error; - } - } - } + + /* map the proc */ + proc = prte_rmaps_base_setup_proc(jdata, i, node, NULL, options); + if (NULL == proc) { + pmix_show_help("help-prte-rmaps-seq.txt", "proc-failed-to-map", true, + sq->hostname, app->app); + rc = PRTE_ERR_SILENT; + goto error; + } + proc->name.rank = vpid; + vpid++; + proc->app_rank = apprank; + apprank++; + PMIX_RETAIN(proc); + rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + if (PMIX_SUCCESS != rc) { + PMIX_RELEASE(proc); + return rc; + } + rc = prte_rmaps_base_check_oversubscribed(jdata, app, node); + if (PRTE_SUCCESS != rc) { + return rc; } - /* assign the vpid */ - proc->name.rank = vpid++; prte_output_verbose(5, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:seq: assign proc %s to node %s for app %s", + "mca:rmaps:seq: assigned proc %s to node %s for app %s", PRTE_VPID_PRINT(proc->name.rank), sq->hostname, app->app); - /* record the cpuset, if given */ - if (NULL != sq->cpuset) { - hwloc_cpuset_t bitmap; - char *cpu_bitmap; - if (NULL == node->topology || NULL == node->topology->topo) { - /* not allowed - for sequential cpusets, we must have - * the topology info - */ - pmix_show_help("help-prte-rmaps-base.txt", "rmaps:no-topology", true, - node->name); - rc = PRTE_ERR_SILENT; - goto error; - } - /* if we are using hwthreads as cpus and binding to hwthreads, then - * we can just copy the cpuset across as it already specifies things - * at that level */ - if (use_hwthread_cpus) { - cpu_bitmap = strdup(sq->cpuset); - } else { - /* setup the bitmap */ - bitmap = hwloc_bitmap_alloc(); - /* parse the slot_list to find the package and core */ - rc = prte_hwloc_base_cpu_list_parse(sq->cpuset, node->topology->topo, bitmap); - if (PRTE_ERR_NOT_FOUND == rc) { - char *tmp = prte_hwloc_base_cset2str(hwloc_topology_get_allowed_cpuset(node->topology->topo), - false, node->topology->topo); - pmix_show_help("help-rmaps-seq.txt", "missing-cpu", true, - prte_tool_basename, sq->cpuset, tmp); - free(tmp); - } else if (PRTE_ERROR == rc) { - pmix_show_help("help-rmaps-seq.txt", "bad-syntax", true, hosts); - rc = PRTE_ERR_SILENT; - hwloc_bitmap_free(bitmap); - goto error; - } else { - PRTE_ERROR_LOG(rc); - hwloc_bitmap_free(bitmap); - goto error; - } - /* note that we cannot set the proc locale to any specific object - * as the slot list may have assigned it to more than one - so - * leave that field NULL - */ - /* set the proc to the specified map */ - hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap); - hwloc_bitmap_free(bitmap); - } - prte_set_attribute(&proc->attributes, PRTE_PROC_CPU_BITMAP, PRTE_ATTR_GLOBAL, - cpu_bitmap, PMIX_STRING); - prte_output_verbose(5, 
prte_rmaps_base_framework.framework_output, - "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s", - PRTE_VPID_PRINT(proc->name.rank), sq->cpuset, cpu_bitmap); - /* note that the user specified the mapping */ - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYUSER); - PRTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, PRTE_MAPPING_GIVEN); - /* cleanup */ - free(cpu_bitmap); - } else { - hwloc_obj_t locale; - - /* assign the locale - okay for the topo to be null as - * it just means it wasn't returned - */ - if (NULL != node->topology && NULL != node->topology->topo) { - locale = hwloc_get_root_obj(node->topology->topo); - prte_set_attribute(&proc->attributes, PRTE_PROC_HWLOC_LOCALE, PRTE_ATTR_LOCAL, - locale, PMIX_POINTER); - } - } - - /* add to the jdata proc array */ - if (PRTE_SUCCESS - != (rc = pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc))) { - PRTE_ERROR_LOG(rc); - goto error; - } /* move to next node */ sq = (seq_node_t *) pmix_list_get_next(&sq->super); } @@ -469,10 +384,25 @@ static int prte_rmaps_seq_map(prte_job_t *jdata) } } - /* mark that this job is to be fully - * described in the launch msg */ - prte_set_attribute(&jdata->attributes, PRTE_JOB_FULLY_DESCRIBED, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + /* compute local ranks */ + for (i=0; i < jdata->map->nodes->size; i++) { + node = (prte_node_t*)pmix_pointer_array_get_item(jdata->map->nodes, i); + if (NULL == node) { + continue; + } + vpid = 0; + for (n=0; n < node->procs->size; n++) { + proc = (prte_proc_t*)pmix_pointer_array_get_item(node->procs, n); + if (NULL == proc) { + continue; + } + if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { + continue; + } + proc->local_rank = vpid; + ++vpid; + } + } return PRTE_SUCCESS; diff --git a/src/mca/rmaps/seq/rmaps_seq_component.c b/src/mca/rmaps/seq/rmaps_seq_component.c index 7acbcff94c..1df6687e04 100644 --- a/src/mca/rmaps/seq/rmaps_seq_component.c +++ b/src/mca/rmaps/seq/rmaps_seq_component.c @@ -45,11 +45,13 @@ 
static int my_priority; prte_rmaps_base_component_t prte_rmaps_seq_component = { .base_version = { - PRTE_RMAPS_BASE_VERSION_2_0_0, + PRTE_RMAPS_BASE_VERSION_4_0_0, .mca_component_name = "seq", - PRTE_MCA_BASE_MAKE_VERSION(component, PRTE_MAJOR_VERSION, PRTE_MINOR_VERSION, - PMIX_RELEASE_VERSION), + PRTE_MCA_BASE_MAKE_VERSION(component, + PRTE_MAJOR_VERSION, + PRTE_MINOR_VERSION, + PMIX_RELEASE_VERSION), .mca_open_component = prte_rmaps_seq_open, .mca_close_component = prte_rmaps_seq_close, .mca_query_component = prte_rmaps_seq_query, diff --git a/src/mca/rtc/hwloc/rtc_hwloc.c b/src/mca/rtc/hwloc/rtc_hwloc.c index d58b9effbf..98b23ffb3a 100644 --- a/src/mca/rtc/hwloc/rtc_hwloc.c +++ b/src/mca/rtc/hwloc/rtc_hwloc.c @@ -88,10 +88,8 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) prte_app_context_t *context = cd->app; hwloc_cpuset_t cpuset; hwloc_obj_t root; - prte_hwloc_topo_data_t *sum; int rc = PRTE_ERROR; char *msg; - char *cpu_bitmap; prte_output_verbose(2, prte_rtc_base_framework.framework_output, "%s hwloc:set on child %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), @@ -108,10 +106,7 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) } /* Set process affinity, if given */ - cpu_bitmap = NULL; - if (!prte_get_attribute(&child->attributes, PRTE_PROC_CPU_BITMAP, (void **) &cpu_bitmap, - PMIX_STRING) - || NULL == cpu_bitmap || 0 == strlen(cpu_bitmap)) { + if (NULL == child->cpuset || 0 == strlen(child->cpuset)) { /* if the daemon is bound, then we need to "free" this proc */ if (NULL != prte_daemon_cores) { root = hwloc_get_root_obj(prte_hwloc_topology); @@ -120,9 +115,13 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) "incorrectly bound", prte_process_info.nodename, context->app, __FILE__, __LINE__); } - sum = (prte_hwloc_topo_data_t *) root->userdata; /* bind this proc to all available processors */ - rc = hwloc_set_cpubind(prte_hwloc_topology, sum->available, 0); +#if HWLOC_API_VERSION < 0x20000 + cpuset = root->allowed_cpuset; 
+#else + cpuset = (hwloc_cpuset_t)hwloc_topology_get_allowed_cpuset(prte_hwloc_topology); +#endif + rc = hwloc_set_cpubind(prte_hwloc_topology, cpuset, 0); /* if we got an error and this wasn't a default binding policy, then report it */ if (rc < 0 && PRTE_BINDING_POLICY_IS_SET(jobdat->map->binding)) { if (errno == ENOSYS) { @@ -131,7 +130,7 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) msg = "hwloc indicates cpu binding cannot be enforced"; } else { char *tmp; - (void) hwloc_bitmap_list_asprintf(&tmp, sum->available); + (void) hwloc_bitmap_list_asprintf(&tmp, cpuset); pmix_asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", prte_strerror(rc), tmp); free(tmp); @@ -168,10 +167,10 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) } else { /* convert the list to a cpuset */ cpuset = hwloc_bitmap_alloc(); - if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, cpu_bitmap))) { + if (0 != (rc = hwloc_bitmap_list_sscanf(cpuset, child->cpuset))) { /* See comment above about "This may be a small memory leak" */ pmix_asprintf(&msg, "hwloc_bitmap_sscanf returned \"%s\" for the string \"%s\"", - prte_strerror(rc), cpu_bitmap); + prte_strerror(rc), child->cpuset); if (NULL == msg) { msg = "failed to convert bitmap list to hwloc bitmap"; } @@ -189,12 +188,13 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", "not bound", prte_process_info.nodename, context->app, msg, __FILE__, __LINE__); - free(cpu_bitmap); + hwloc_bitmap_free(cpuset); return; } } /* bind as specified */ rc = hwloc_set_cpubind(prte_hwloc_topology, cpuset, 0); + hwloc_bitmap_free(cpuset); /* if we got an error and this wasn't a default binding policy, then report it */ if (rc < 0 && PRTE_BINDING_POLICY_IS_SET(jobdat->map->binding)) { char *tmp = NULL; @@ -204,7 +204,7 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) msg = "hwloc indicates cpu binding cannot be enforced"; } else 
{ pmix_asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"", - prte_strerror(rc), cpu_bitmap); + prte_strerror(rc), child->cpuset); } if (PRTE_BINDING_REQUIRED(jobdat->map->binding)) { /* If binding is required, send an error up the pipe (which exits @@ -257,14 +257,10 @@ static void set(prte_odls_spawn_caddy_t *cd, int write_fd) prte_rtc_base_send_warn_show_help(write_fd, "help-prte-odls-default.txt", "memory not bound", prte_process_info.nodename, context->app, msg, __FILE__, __LINE__); - free(cpu_bitmap); return; } } } - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } } static void report_binding(prte_job_t *jobdat, int rank) diff --git a/src/mca/schizo/base/schizo_base_frame.c b/src/mca/schizo/base/schizo_base_frame.c index 9313b7e6ef..a6e002ea4b 100644 --- a/src/mca/schizo/base/schizo_base_frame.c +++ b/src/mca/schizo/base/schizo_base_frame.c @@ -146,9 +146,9 @@ bool prte_schizo_base_check_directives(char *directive, bool found; char *str; - /* if it starts with a ':', then these are just modifiers */ + /* if it starts with a ':', then these are just qualifiers */ if (':' == dir[0]) { - qls = pmix_argv_split(&dir[1], ','); + qls = pmix_argv_split(&dir[1], ':'); for (m=0; NULL != qls[m]; m++) { if (!prte_schizo_base_check_qualifiers(directive, quals, qls[m])) { pmix_argv_free(qls); @@ -197,13 +197,13 @@ bool prte_schizo_base_check_directives(char *directive, } found = false; for (m=0; NULL != pproptions[m]; m++) { - if (0 == strcasecmp(args[2], pproptions[m])) { + if (0 == strncasecmp(args[2], pproptions[m], strlen(args[2]))) { found = true; break; } } if (!found) { - v = pmix_argv_join(pproptions, ','); + v = pmix_argv_join(pproptions, ':'); pmix_asprintf(&q, "ppr:%s:[%s]", args[1], v); free(v); pmix_show_help("help-prte-rmaps-base.txt", @@ -214,13 +214,13 @@ bool prte_schizo_base_check_directives(char *directive, return false; } if (NULL != args[3]) { - qls = pmix_argv_split(args[3], ','); + qls = pmix_argv_split(args[3], ':'); } else { 
pmix_argv_free(args); return true; } } else { - qls = pmix_argv_split(args[1], ','); + qls = pmix_argv_split(args[1], ':'); } for (m=0; NULL != qls[m]; m++) { if (!prte_schizo_base_check_qualifiers(directive, quals, qls[m])) { @@ -237,7 +237,7 @@ bool prte_schizo_base_check_directives(char *directive, return true; } } - v = pmix_argv_join(valid, ','); + v = pmix_argv_join(valid, ':'); pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-directive", true, directive, dir, v); @@ -313,9 +313,10 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) PRTE_CLI_PACKAGE, PRTE_CLI_NODE, PRTE_CLI_SEQ, - PRTE_CLI_DIST, +// PRTE_CLI_DIST, PRTE_CLI_PPR, PRTE_CLI_RANKFILE, + PRTE_CLI_PELIST, NULL }; char *mapquals[] = { @@ -326,30 +327,23 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) PRTE_CLI_NOLOCAL, PRTE_CLI_HWTCPUS, PRTE_CLI_CORECPUS, - PRTE_CLI_DEVICE, +// PRTE_CLI_DEVICE, PRTE_CLI_INHERIT, PRTE_CLI_NOINHERIT, - PRTE_CLI_PELIST, PRTE_CLI_QFILE, PRTE_CLI_NOLAUNCH, + PRTE_CLI_ORDERED, NULL }; char *rankers[] = { PRTE_CLI_SLOT, - PRTE_CLI_HWT, - PRTE_CLI_CORE, - PRTE_CLI_L1CACHE, - PRTE_CLI_L2CACHE, - PRTE_CLI_L3CACHE, - PRTE_CLI_NUMA, - PRTE_CLI_PACKAGE, PRTE_CLI_NODE, + PRTE_CLI_FILL, + PRTE_CLI_SPAN, NULL }; char *rkquals[] = { - PRTE_CLI_SPAN, - PRTE_CLI_FILL, NULL }; @@ -368,7 +362,6 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) PRTE_CLI_OVERLOAD, PRTE_CLI_NOOVERLOAD, PRTE_CLI_IF_SUPP, - PRTE_CLI_ORDERED, PRTE_CLI_REPORT, NULL }; @@ -495,4 +488,5 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) return PRTE_SUCCESS; } -PMIX_CLASS_INSTANCE(prte_schizo_base_active_module_t, pmix_list_item_t, NULL, NULL); +PMIX_CLASS_INSTANCE(prte_schizo_base_active_module_t, + pmix_list_item_t, NULL, NULL); diff --git a/src/mca/schizo/base/schizo_base_stubs.c b/src/mca/schizo/base/schizo_base_stubs.c index 2a756e5cec..c8f4c3fd6e 100644 --- a/src/mca/schizo/base/schizo_base_stubs.c +++ b/src/mca/schizo/base/schizo_base_stubs.c @@ -112,8 +112,10 
@@ int prte_schizo_base_add_directive(pmix_cli_result_t *results, // do we allow multiple directives? if (!check_multi(target)) { // report the error + tmp = pmix_argv_join(opt->values, ','); ptr = pmix_show_help_string("help-schizo-base.txt", "too-many-directives", - true, target, opt->values, deprecated, directive); + true, target, tmp, deprecated, directive); + free(tmp); fprintf(stderr, "%s\n", ptr); return PRTE_ERR_SILENT; } @@ -178,18 +180,10 @@ int prte_schizo_base_add_qualifier(pmix_cli_result_t *results, fprintf(stderr, "%s\n", ptr); return PRTE_ERR_SILENT; } else { - // does it already contain a qualifier? - if (NULL != strchr(opt->values[0], ':')) { - // can just add this one to the end - pmix_asprintf(&tmp, "%s:%s", opt->values[0], qualifier); - free(opt->values[0]); - opt->values[0] = tmp; - } else { - // append with a colon delimiter - pmix_asprintf(&tmp, "%s:%s", opt->values[0], qualifier); - free(opt->values[0]); - opt->values[0] = tmp; - } + // append with a colon delimiter + pmix_asprintf(&tmp, "%s:%s", opt->values[0], qualifier); + free(opt->values[0]); + opt->values[0] = tmp; } } else { // add the new option diff --git a/src/mca/schizo/ompi/schizo_ompi.c b/src/mca/schizo/ompi/schizo_ompi.c index 71653c0ef3..05eab71ab1 100644 --- a/src/mca/schizo/ompi/schizo_ompi.c +++ b/src/mca/schizo/ompi/schizo_ompi.c @@ -65,7 +65,7 @@ static int detect_proxy(char *argv); static int parse_env(char **srcenv, char ***dstenv, pmix_cli_result_t *cli); static void allow_run_as_root(pmix_cli_result_t *results); static int set_default_ranking(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); static int setup_fork(prte_job_t *jdata, prte_app_context_t *context); static void job_info(pmix_cli_result_t *results, void *jobinfo); @@ -216,7 +216,7 @@ static struct option ompioptions[] = { PMIX_OPTION_DEFINE("use-hwthread-cpus", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("cpu-set", PMIX_ARG_REQD), PMIX_OPTION_DEFINE("cpu-list", PMIX_ARG_REQD), 
- PMIX_OPTION_DEFINE("--bind-to-core", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("bind-to-core", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("bynode", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("bycore", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("byslot", PMIX_ARG_NONE), @@ -495,7 +495,7 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, } /* --use-hwthread-cpus -> --bind-to hwthread */ else if (0 == strcmp(option, "use-hwthread-cpus")) { - rc = prte_schizo_base_add_directive(results, option, + rc = prte_schizo_base_add_qualifier(results, option, PRTE_CLI_BINDTO, PRTE_CLI_HWT, warn); PMIX_CLI_REMOVE_DEPRECATED(results, opt); @@ -1950,7 +1950,7 @@ static void allow_run_as_root(pmix_cli_result_t *results) } static int set_default_ranking(prte_job_t *jdata, - prte_schizo_options_t *options) + prte_rmaps_options_t *options) { int rc; prte_mapping_policy_t map; diff --git a/src/mca/schizo/prte/help-schizo-prterun.txt b/src/mca/schizo/prte/help-schizo-prterun.txt index ddf9c80480..b0848ef3eb 100644 --- a/src/mca/schizo/prte/help-schizo-prterun.txt +++ b/src/mca/schizo/prte/help-schizo-prterun.txt @@ -31,13 +31,13 @@ Initiate an instance of the PMIx Reference RTE (PRRTE) DVM --debug-daemons Debug daemons - if not set, the "verbose" setting will be limited to the DVM controller to reduce clutter - --debug-daemons-file Enable debugging of any PRTE daemons used by this application, storing + --debug-daemons-file Enable debugging of any PRRTE daemons used by this application, storing their verbose output in files --display Comma-delimited list of options for displaying information about the allocation and job. 
Allowed values: allocation, bind, map, map-devel, topo --get-stack-traces Get stack traces of all application procs on timeout - --leave-session-attached Do not discard stdout/stderr of remote PRTE daemons + --leave-session-attached Do not discard stdout/stderr of remote PRRTE daemons --report-state-on-timeout Report all job and process states upon timeout --spawn-timeout Timeout the job if spawn takes more than the specified number of seconds --stop-on-exec If supported, stop each specified process at start of execution @@ -76,19 +76,23 @@ Initiate an instance of the PMIx Reference RTE (PRRTE) DVM --map-by Mapping Policy for job [slot | hwthread | core (default:np<=2) | l1cache | l2cache | l3cache | numa (default:np>2) | package | node | seq | dist | - ppr |,rankfile] with supported colon-delimited modifiers: PE=y (for + ppr | rankfile | PE-LIST=a,b (comma-delimited ranges of cpus to use for + this job)] with supported colon-delimited qualifiers: PE=y (for multiple cpus/proc), SPAN, OVERSUBSCRIBE, NOOVERSUBSCRIBE, NOLOCAL, - HWTCPUS, CORECPUS, DEVICE(for dist policy), INHERIT, NOINHERIT, - PE-LIST=a,b (comma-delimited ranges of cpus to use for this job), - FILE= for seq and rankfile options + HWTCPUS, CORECPUS, DEVICE(for dist policy), INHERIT, NOINHERIT, DONOTLAUNCH, + FILE= for seq and rankfile options, ORDERED to indicate that the + CPUs specified in PE-LIST should be assigned to individual procs in + specified order (i.e., the first proc on a node is bound to the first + CPU in the list, the second proc to the second CPU, etc.). The PE-LIST + option without the ORDERED qualifier will result in each proc being + bound to the complete list of CPUs. 
/***** Ranking Options *****/ - --rank-by Ranking Policy for job [slot (default:np<=2) | hwthread | core | l1cache - | l2cache | l3cache | numa (default:np>2) | package | node], with - modifier :SPAN or :FILL + --rank-by Ranking Policy for job [slot (default) | node | fill | object | span] + with no qualifiers @@ -127,7 +131,7 @@ Initiate an instance of the PMIx Reference RTE (PRRTE) DVM is the parameter name; arg1 is the parameter value) --preload-files Preload the comma separated list of files to the remote machines current working directory before starting the remote process. - --prtemca Pass context-specific PRTE MCA parameters to the DVM + --prtemca Pass context-specific PRRTE MCA parameters to the DVM --pset User-specified name assigned to the processes in their given application --rankfile Name of file to specify explicit task mapping -s|--preload-binary Preload the binary on the remote machine before starting the remote @@ -243,10 +247,10 @@ Maximum number of daemons to start Debug daemon output enabled # [debug-daemons-file] -Enable debugging of any PRTE daemons used by this application, storing output in files +Enable debugging of any PRRTE daemons used by this application, storing output in files # [leave-session-attached] -Do not discard stdout/stderr of remote PRTE daemons +Do not discard stdout/stderr of remote PRRTE daemons # [tmpdir] Set the root for the session directory tree @@ -387,65 +391,107 @@ Adjust buffering for stdout/stderr [0 unbuffered] [1 line buffered] [2 fully buf Specify procs to receive stdin [rank, "all", "none"] (default: 0, indicating rank 0) # [map-by] -Mapping Policy for job: - slot - hwthread - core (default: np <= 2) - l1cache - l2cache - l3cache - numa (default: np > 2) - package - node - seq - dist - ppr - rankfile -with supported colon-delimited qualifiers: - PE=y (for multiple cpus/proc) - SPAN - OVERSUBSCRIBE - NOOVERSUBSCRIBE - NOLOCAL - HWTCPUS - CORECPUS - DEVICE(for dist policy) - INHERIT - NOINHERIT - PE-LIST=a,b 
(comma-delimited ranges of cpus to use for this job) - FILE= for seq and rankfile options +#include#help-prte-rmaps-base#map-by-option # [rank-by] -Ranking Policy for job: - slot (default: np <= 2) - hwthread - core - l1cache - l2cache - l3cache - numa (default: np > 2) - package - node -with supported colon-delimited qualifiers: - SPAN - FILL +#include#help-prte-rmaps-base#rank-by-option # [bind-to] -Binding Policy for job: - none (default: oversubscribed) - hwthread - core (default: np <= 2) - l1cache - l2cache - l3cache - numa (default: np > 2) - package -with supported colon-delimited qualifiers: - overload-allowed - if-supported +#include#help-prte-hwloc-base#bind-to-option # [rankfile] Name of file to specify explicit task mapping +# +[placement] +Process Mapping / Ranking / Binding Options in detail + +Processes are mapped to the resources in an allocation according to the +directive provided by the user via the --map-by command line option. Supported +directives include: + +- SLOT fills each node up to the number of available slots before moving + on to the next node in the allocation. +- NODE places processes in a round-robin fashion across all nodes in the + allocation, subject to limitations on the number of available slots on + each node. +- HWTHREAD places one process on each available hwthread on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. +- CORE places one process on each available core on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. +- L1CACHE places one process on each available L1 cache on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. 
+- L2CACHE places one process on each available L2 cache on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. +- L3CACHE places one process on each available L3 cache on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. +- NUMA places one process on each available NUMA region on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. +- PACKAGE places one process on each available package on a node before + moving on to the next node in the allocation, subject to limitations on + the number of available slots on each node. + +--map-by + Map to the specified object. See defaults in Quick Summary. Supported options + include slot, hwthread, core, l1cache, l2cache, l3cache, numa, package, node, + seq, ppr, rankfile, and pe-list + +Mapping by slot +Any directive can include qualifiers by adding a colon (:) and any +combination of one or more of the following to the --map-by option +(except where noted): + +- PE=n bind n processing elements to each process (can not be + used in combination with rankfile or pe-list directives) +- SPAN load balance the processes across the allocation by treating + the allocation as a single "super-node" (can not be + used in combination with slot, node, seq, ppr, rankfile, or + pe-list directives) +- OVERSUBSCRIBE allow more processes on a node than processing elements +- NOOVERSUBSCRIBE means !OVERSUBSCRIBE +- NOLOCAL do not launch processes on the same node as prun +- HWTCPUS use hardware threads as CPU slots +- CORECPUS use cores as CPU slots (default) +- INHERIT +- NOINHERIT means !INHERIT +- FILE= (path to file containing sequential or rankfile entries). 
+- ORDERED only applies to the PE-LIST option and indicates that + procs are to be bound to each of the specified CPUs in the order + in which they are assigned (i.e., the first proc on a node shall + be bound to the first CPU in the list, the second proc shall be + bound to the second CPU, etc.) + + +ppr policy example: --map-by ppr:N:<object> will launch N times the number +of objects of the specified type on each node. + +By default, process ranks are assigned in accordance with the mapping +directive - e.g., jobs that are mapped by node will have the process +ranks assigned on a per-node basis. However, users can override the +default by specifying any of the following directives using the --rank-by +command line option: + +- SLOT assigns ranks to each process on a node in the order in + which the mapper assigned them. +- NODE assigns ranks round-robin on a per-node basis +- FILL assigns ranks sequentially to each process on an object, + filling that object before moving on to the next + +To bind processes to sets of objects: + +--bind-to + + Bind processes to the specified object. See defaults in Quick Summary. Supported options include none, hwthread, core, l1cache, l2cache, l3cache, numa, and package, cpu-set (bind each proc to the specified set, effectively a "soft" c-group) + +Any object can include qualifiers by adding a colon (:) and any combination of one or more of the following to the --bind-to option: + +- overload-allowed allows for binding more than one process in relation to a CPU +- if-supported if that object is supported on this system + # [display] Comma-delimited list of options for displaying information about the allocation and job. 
diff --git a/src/mca/schizo/prte/help-schizo-prun.txt b/src/mca/schizo/prte/help-schizo-prun.txt index c6098af198..e76cdca7f5 100644 --- a/src/mca/schizo/prte/help-schizo-prun.txt +++ b/src/mca/schizo/prte/help-schizo-prun.txt @@ -69,11 +69,16 @@ Submit job to the PMIx Reference RTE --map-by Mapping Policy for job [slot | hwthread | core (default:np<=2) | l1cache | l2cache | l3cache | numa (default:np>2) | package | node | seq | dist | - ppr |,rankfile] with supported colon-delimited modifiers: PE=y (for + ppr | rankfile | PE-LIST=a,b (comma-delimited ranges of cpus to use for + this job)] with supported colon-delimited qualifiers: PE=y (for multiple cpus/proc), SPAN, OVERSUBSCRIBE, NOOVERSUBSCRIBE, NOLOCAL, - HWTCPUS, CORECPUS, DEVICE(for dist policy), INHERIT, NOINHERIT, - PE-LIST=a,b (comma-delimited ranges of cpus to use for this job), - FILE= for seq and rankfile options + HWTCPUS, CORECPUS, DEVICE(for dist policy), INHERIT, NOINHERIT, DONOTLAUNCH, + FILE= for seq and rankfile options, ORDERED to indicate that the + CPUs specified in PE-LIST should be assigned to individual procs in + specified order (i.e., the first proc on a node is bound to the first + CPU in the list, the second proc to the second CPU, etc.). The PE-LIST + option without the ORDERED qualifier will result in each proc being + bound to the complete list of CPUs. 
diff --git a/src/mca/schizo/prte/schizo_prte.c b/src/mca/schizo/prte/schizo_prte.c index 06090cc598..25b4f48f95 100644 --- a/src/mca/schizo/prte/schizo_prte.c +++ b/src/mca/schizo/prte/schizo_prte.c @@ -127,7 +127,6 @@ static struct option prteoptions[] = { PMIX_OPTION_DEFINE(PRTE_CLI_STREAM_BUF, PMIX_ARG_REQD), /* developer options */ - PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_LAUNCH, PMIX_ARG_REQD), PMIX_OPTION_DEFINE(PRTE_CLI_DISPLAY, PMIX_ARG_REQD), // deprecated options @@ -140,6 +139,7 @@ static struct option prteoptions[] = { PMIX_OPTION_DEFINE("display-devel-allocation", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("display-map", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("display-allocation", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("do-not-launch", PMIX_ARG_NONE), PMIX_OPTION_END }; @@ -232,9 +232,6 @@ static struct option prterunoptions[] = { /* display options */ PMIX_OPTION_DEFINE(PRTE_CLI_DISPLAY, PMIX_ARG_REQD), - /* developer options */ - PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_LAUNCH, PMIX_ARG_NONE), - #if PRTE_ENABLE_FT PMIX_OPTION_DEFINE(PRTE_CLI_ENABLE_RECOVERY, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_MAX_RESTARTS, PMIX_ARG_REQD), @@ -257,6 +254,24 @@ static struct option prterunoptions[] = { PMIX_OPTION_DEFINE("display-map", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("display-allocation", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("rankfile", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("nolocal", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("oversubscribe", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("nooversubscribe", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("use-hwthread-cpus", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("cpu-set", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("cpu-list", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("bind-to-core", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("bynode", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("bycore", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("byslot", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("cpus-per-proc", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("cpus-per-rank", PMIX_ARG_REQD), + 
PMIX_OPTION_DEFINE("npernode", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("pernode", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("npersocket", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("ppr", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("debug", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("do-not-launch", PMIX_ARG_NONE), PMIX_OPTION_END }; @@ -337,9 +352,6 @@ static struct option prunoptions[] = { /* display options */ PMIX_OPTION_DEFINE(PRTE_CLI_DISPLAY, PMIX_ARG_REQD), - /* developer options */ - PMIX_OPTION_DEFINE(PRTE_CLI_DO_NOT_LAUNCH, PMIX_ARG_NONE), - #if PRTE_ENABLE_FT PMIX_OPTION_DEFINE(PRTE_CLI_ENABLE_RECOVERY, PMIX_ARG_NONE), PMIX_OPTION_DEFINE(PRTE_CLI_MAX_RESTARTS, PMIX_ARG_REQD), @@ -349,7 +361,6 @@ static struct option prunoptions[] = { // deprecated options PMIX_OPTION_DEFINE("mca", PMIX_ARG_REQD), - PMIX_OPTION_DEFINE("gmca", PMIX_ARG_REQD), PMIX_OPTION_DEFINE("xml", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("tag-output", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("timestamp-output", PMIX_ARG_NONE), @@ -363,6 +374,24 @@ static struct option prunoptions[] = { PMIX_OPTION_DEFINE("display-map", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("display-allocation", PMIX_ARG_NONE), PMIX_OPTION_DEFINE("rankfile", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("nolocal", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("oversubscribe", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("nooversubscribe", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("use-hwthread-cpus", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("cpu-set", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("cpu-list", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("bind-to-core", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("bynode", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("bycore", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("byslot", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("cpus-per-proc", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("cpus-per-rank", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("npernode", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("pernode", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("npersocket", PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("ppr", 
PMIX_ARG_REQD), + PMIX_OPTION_DEFINE("debug", PMIX_ARG_NONE), + PMIX_OPTION_DEFINE("do-not-launch", PMIX_ARG_NONE), PMIX_OPTION_END }; @@ -554,7 +583,7 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, } /* --use-hwthread-cpus -> --bind-to hwthread */ else if (0 == strcmp(option, "use-hwthread-cpus")) { - rc = prte_schizo_base_add_directive(results, option, + rc = prte_schizo_base_add_qualifier(results, option, PRTE_CLI_BINDTO, PRTE_CLI_HWT, warn); PMIX_CLI_REMOVE_DEPRECATED(results, opt); @@ -761,6 +790,13 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, } PMIX_CLI_REMOVE_DEPRECATED(results, opt); } + /* --do-not-launch -> --map-by :donotlaunch */ + else if (0 == strcmp(option, "do-not-launch")) { + rc = prte_schizo_base_add_qualifier(results, option, + PRTE_CLI_MAPBY, PRTE_CLI_NOLAUNCH, + warn); + PMIX_CLI_REMOVE_DEPRECATED(results, opt); + } /* --map-by socket -> --map-by package */ else if (0 == strcmp(option, PRTE_CLI_MAPBY)) { /* check the value of the option for "socket" */ @@ -779,8 +815,9 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, pmix_asprintf(&p2, "%s %s", option, p1); pmix_asprintf(&tmp2, "%s %s", option, tmp); /* can't just call show_help as we want every instance to be reported */ - output = pmix_show_help_string("help-schizo-base.txt", "deprecated-converted", true, p2, - tmp2); + output = pmix_show_help_string("help-schizo-base.txt", + "deprecated-converted", true, + p2, tmp2); fprintf(stderr, "%s\n", output); free(output); free(p2); @@ -791,10 +828,17 @@ static int convert_deprecated_cli(pmix_cli_result_t *results, opt->values[0] = tmp; } } - /* --rank-by socket -> --rank-by package */ + /* --rank-by */ else if (0 == strcmp(option, PRTE_CLI_RANKBY)) { - /* check the value of the option for "socket" */ - if (0 == strncasecmp(opt->values[0], "socket", strlen("socket"))) { + /* check the value of the option for object-level directives - show help + * for ranking if given */ + if (0 == 
strncasecmp(opt->values[0], "socket", strlen("socket")) || + 0 == strncasecmp(opt->values[0], "l1cache", strlen("l1cache")) || + 0 == strncasecmp(opt->values[0], "l2cache", strlen("l2cache")) || + 0 == strncasecmp(opt->values[0], "l3cache", strlen("l3cache")) || + 0 == strncasecmp(opt->values[0], "numa", strlen("numa")) || + 0 == strncasecmp(opt->values[0], "hwthread", strlen("hwthread")) || + 0 == strncasecmp(opt->values[0], "core", strlen("core"))) { p1 = strdup(opt->values[0]); // save the original option /* replace "socket" with "package" */ if (NULL == (p2 = strchr(opt->values[0], ':'))) { diff --git a/src/mca/schizo/schizo.h b/src/mca/schizo/schizo.h index 4b6d15a2ae..fe66fdf351 100644 --- a/src/mca/schizo/schizo.h +++ b/src/mca/schizo/schizo.h @@ -28,6 +28,8 @@ #include "types.h" #include "src/class/pmix_list.h" +#include "src/hwloc/hwloc-internal.h" +#include "src/mca/rmaps/rmaps_types.h" #include "src/util/pmix_cmd_line.h" #include "src/mca/mca.h" @@ -38,14 +40,6 @@ BEGIN_C_DECLS typedef int (*prte_schizo_convertor_fn_t)(char *option, char ***argv, int idx); -typedef struct { - pmix_rank_t nprocs; - uint16_t cpus_per_rank; - bool use_hwthreads; - int stream; - int verbosity; -} prte_schizo_options_t; - /* * schizo module functions */ @@ -86,13 +80,13 @@ typedef void (*prte_schizo_base_module_allow_run_as_root_fn_t)(pmix_cli_result_t /* Set the default mapping policy for a job */ typedef int (*prte_schizo_base_module_set_default_mapping_fn_t)(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); typedef int (*prte_schizo_base_module_set_default_ranking_fn_t)(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); typedef int (*prte_schizo_base_module_set_default_binding_fn_t)(prte_job_t *jdata, - prte_schizo_options_t *options); + prte_rmaps_options_t *options); /* do whatever preparation work * is required to setup the app for execution. 
This is intended to be diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 8ae504e1b2..7f213322a0 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -65,80 +65,92 @@ static void ready_for_debug(int fd, short args, void *cbata); /****************** * DVM module - used when mpirun is persistent ******************/ -prte_state_base_module_t prte_state_dvm_module = {init, - finalize, - prte_state_base_activate_job_state, - prte_state_base_add_job_state, - prte_state_base_set_job_state_callback, - prte_state_base_set_job_state_priority, - prte_state_base_remove_job_state, - prte_state_base_activate_proc_state, - prte_state_base_add_proc_state, - prte_state_base_set_proc_state_callback, - prte_state_base_set_proc_state_priority, - prte_state_base_remove_proc_state}; +prte_state_base_module_t prte_state_dvm_module = { + init, + finalize, + prte_state_base_activate_job_state, + prte_state_base_add_job_state, + prte_state_base_set_job_state_callback, + prte_state_base_set_job_state_priority, + prte_state_base_remove_job_state, + prte_state_base_activate_proc_state, + prte_state_base_add_proc_state, + prte_state_base_set_proc_state_callback, + prte_state_base_set_proc_state_priority, + prte_state_base_remove_proc_state +}; static void dvm_notify(int sd, short args, void *cbdata); /* defined default state machine sequence - individual * plm's must add a state for launching daemons */ -static prte_job_state_t launch_states[] = {PRTE_JOB_STATE_INIT, - PRTE_JOB_STATE_INIT_COMPLETE, - PRTE_JOB_STATE_ALLOCATE, - PRTE_JOB_STATE_ALLOCATION_COMPLETE, - PRTE_JOB_STATE_DAEMONS_LAUNCHED, - PRTE_JOB_STATE_DAEMONS_REPORTED, - PRTE_JOB_STATE_VM_READY, - PRTE_JOB_STATE_MAP, - PRTE_JOB_STATE_MAP_COMPLETE, - PRTE_JOB_STATE_SYSTEM_PREP, - PRTE_JOB_STATE_LAUNCH_APPS, - PRTE_JOB_STATE_SEND_LAUNCH_MSG, - PRTE_JOB_STATE_STARTED, - PRTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, - PRTE_JOB_STATE_READY_FOR_DEBUG, - PRTE_JOB_STATE_RUNNING, - 
PRTE_JOB_STATE_REGISTERED, - /* termination states */ - PRTE_JOB_STATE_TERMINATED, - PRTE_JOB_STATE_NOTIFY_COMPLETED, - PRTE_JOB_STATE_NOTIFIED, - PRTE_JOB_STATE_ALL_JOBS_COMPLETE}; -static prte_state_cbfunc_t launch_callbacks[] = {prte_plm_base_setup_job, - init_complete, - prte_ras_base_allocate, - prte_plm_base_allocation_complete, - prte_plm_base_daemons_launched, - prte_plm_base_daemons_reported, - vm_ready, - prte_rmaps_base_map_job, - prte_plm_base_mapping_complete, - prte_plm_base_complete_setup, - prte_plm_base_launch_apps, - prte_plm_base_send_launch_msg, - job_started, - prte_state_base_local_launch_complete, - ready_for_debug, - prte_plm_base_post_launch, - prte_plm_base_registered, - check_complete, - dvm_notify, - cleanup_job, - prte_quit}; - -static prte_proc_state_t proc_states[] = {PRTE_PROC_STATE_RUNNING, - PRTE_PROC_STATE_READY_FOR_DEBUG, - PRTE_PROC_STATE_REGISTERED, - PRTE_PROC_STATE_IOF_COMPLETE, - PRTE_PROC_STATE_WAITPID_FIRED, - PRTE_PROC_STATE_TERMINATED}; -static prte_state_cbfunc_t proc_callbacks[] = {prte_state_base_track_procs, - prte_state_base_track_procs, - prte_state_base_track_procs, - prte_state_base_track_procs, - prte_state_base_track_procs, - prte_state_base_track_procs}; +static prte_job_state_t launch_states[] = { + PRTE_JOB_STATE_INIT, + PRTE_JOB_STATE_INIT_COMPLETE, + PRTE_JOB_STATE_ALLOCATE, + PRTE_JOB_STATE_ALLOCATION_COMPLETE, + PRTE_JOB_STATE_DAEMONS_LAUNCHED, + PRTE_JOB_STATE_DAEMONS_REPORTED, + PRTE_JOB_STATE_VM_READY, + PRTE_JOB_STATE_MAP, + PRTE_JOB_STATE_MAP_COMPLETE, + PRTE_JOB_STATE_SYSTEM_PREP, + PRTE_JOB_STATE_LAUNCH_APPS, + PRTE_JOB_STATE_SEND_LAUNCH_MSG, + PRTE_JOB_STATE_STARTED, + PRTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, + PRTE_JOB_STATE_READY_FOR_DEBUG, + PRTE_JOB_STATE_RUNNING, + PRTE_JOB_STATE_REGISTERED, + /* termination states */ + PRTE_JOB_STATE_TERMINATED, + PRTE_JOB_STATE_NOTIFY_COMPLETED, + PRTE_JOB_STATE_NOTIFIED, + PRTE_JOB_STATE_ALL_JOBS_COMPLETE +}; + +static prte_state_cbfunc_t launch_callbacks[] 
= { + prte_plm_base_setup_job, + init_complete, + prte_ras_base_allocate, + prte_plm_base_allocation_complete, + prte_plm_base_daemons_launched, + prte_plm_base_daemons_reported, + vm_ready, + prte_rmaps_base_map_job, + prte_plm_base_mapping_complete, + prte_plm_base_complete_setup, + prte_plm_base_launch_apps, + prte_plm_base_send_launch_msg, + job_started, + prte_state_base_local_launch_complete, + ready_for_debug, + prte_plm_base_post_launch, + prte_plm_base_registered, + check_complete, + dvm_notify, + cleanup_job, + prte_quit +}; + +static prte_proc_state_t proc_states[] = { + PRTE_PROC_STATE_RUNNING, + PRTE_PROC_STATE_READY_FOR_DEBUG, + PRTE_PROC_STATE_REGISTERED, + PRTE_PROC_STATE_IOF_COMPLETE, + PRTE_PROC_STATE_WAITPID_FIRED, + PRTE_PROC_STATE_TERMINATED +}; + +static prte_state_cbfunc_t proc_callbacks[] = { + prte_state_base_track_procs, + prte_state_base_track_procs, + prte_state_base_track_procs, + prte_state_base_track_procs, + prte_state_base_track_procs, + prte_state_base_track_procs +}; static void force_quit(int fd, short args, void *cbdata) { @@ -178,21 +190,19 @@ static int init(void) } } /* add the termination response */ - if (PRTE_SUCCESS - != (rc = prte_state.add_job_state(PRTE_JOB_STATE_DAEMONS_TERMINATED, prte_quit, - PRTE_SYS_PRI))) { + rc = prte_state.add_job_state(PRTE_JOB_STATE_DAEMONS_TERMINATED, prte_quit, PRTE_SYS_PRI); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } /* add a default error response */ - if (PRTE_SUCCESS - != (rc = prte_state.add_job_state(PRTE_JOB_STATE_FORCED_EXIT, force_quit, - PRTE_ERROR_PRI))) { + rc = prte_state.add_job_state(PRTE_JOB_STATE_FORCED_EXIT, force_quit, PRTE_ERROR_PRI); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } /* add callback to report progress, if requested */ - if (PRTE_SUCCESS - != (rc = prte_state.add_job_state(PRTE_JOB_STATE_REPORT_PROGRESS, - prte_state_base_report_progress, PRTE_ERROR_PRI))) { + rc = prte_state.add_job_state(PRTE_JOB_STATE_REPORT_PROGRESS, + 
prte_state_base_report_progress, PRTE_ERROR_PRI); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } if (5 < prte_output_get_verbosity(prte_state_base_framework.framework_output)) { @@ -204,8 +214,8 @@ static int init(void) */ num_states = sizeof(proc_states) / sizeof(prte_proc_state_t); for (i = 0; i < num_states; i++) { - if (PRTE_SUCCESS - != (rc = prte_state.add_proc_state(proc_states[i], proc_callbacks[i], PRTE_SYS_PRI))) { + rc = prte_state.add_proc_state(proc_states[i], proc_callbacks[i], PRTE_SYS_PRI); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); } } @@ -278,13 +288,6 @@ static void vm_ready(int fd, short args, void *cbdata) PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); return; } - /* provide the info on the capabilities of each node */ - if (PRTE_SUCCESS != (rc = prte_util_pass_node_info(&buf))) { - PRTE_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&buf); - PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT); - return; - } /* get wireup info for daemons */ jptr = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); for (v = 0; v < jptr->procs->size; v++) { @@ -513,6 +516,11 @@ static void check_complete(int fd, short args, void *cbdata) char *tmp; prte_timer_t *timer; prte_app_context_t *app; + hwloc_obj_t obj; + hwloc_obj_type_t type; + hwloc_cpuset_t boundcpus; + unsigned n; + uint16_t u16, *u16ptr = &u16; PMIX_ACQUIRE_OBJECT(caddy); jdata = caddy->jdata; @@ -712,15 +720,23 @@ static void check_complete(int fd, short args, void *cbdata) */ if (NULL != jdata->map) { map = jdata->map; + if (prte_get_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, NULL, PMIX_BOOL)) { + type = HWLOC_OBJ_PU; + } else { + type = HWLOC_OBJ_CORE; + } + boundcpus = hwloc_bitmap_alloc(); for (index = 0; index < map->nodes->size; index++) { - if (NULL == (node = (prte_node_t *) pmix_pointer_array_get_item(map->nodes, index))) { + node = (prte_node_t *) pmix_pointer_array_get_item(map->nodes, index); + if (NULL == node) { continue; } PRTE_OUTPUT_VERBOSE((2, 
prte_state_base_framework.framework_output, "%s state:dvm releasing procs from node %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, i))) { + proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, i); + if (NULL == proc) { continue; } if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { @@ -734,6 +750,26 @@ static void check_complete(int fd, short args, void *cbdata) node->num_procs--; node->next_node_rank--; } + /* release the resources held by the proc - only the first + * cpu in the proc's cpuset was used to mark usage */ + if (NULL != proc->cpuset) { + if (0 != (rc = hwloc_bitmap_list_sscanf(boundcpus, proc->cpuset))) { + prte_output(0, "hwloc_bitmap_sscanf returned %s for the string %s", + prte_strerror(rc), proc->cpuset); + continue; + } + obj = hwloc_get_obj_inside_cpuset_by_type(node->topology->topo, + boundcpus, type, 0); + if (NULL == obj) { + prte_output(0, "COULD NOT GET BOUND CPU FOR RESOURCE RELEASE"); + continue; + } +#if HWLOC_API_VERSION < 0x20000 + hwloc_bitmap_or(node->available, node->available, obj->allowed_cpuset); +#else + hwloc_bitmap_or(node->available, node->available, obj->cpuset); +#endif + } PRTE_OUTPUT_VERBOSE((2, prte_state_base_framework.framework_output, "%s state:dvm releasing proc %s from node %s", @@ -751,6 +787,7 @@ static void check_complete(int fd, short args, void *cbdata) /* flag that the node is no longer in a map */ PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); } + hwloc_bitmap_free(boundcpus); PMIX_RELEASE(map); jdata->map = NULL; } diff --git a/src/prted/pmix/pmix_server_dyn.c b/src/prted/pmix/pmix_server_dyn.c index 80198b3711..0aa4f2ea2a 100644 --- a/src/prted/pmix/pmix_server_dyn.c +++ b/src/prted/pmix/pmix_server_dyn.c @@ -347,8 +347,8 @@ static void interim(int sd, short args, void *cbdata) /*** DISPLAY MAP ***/ } else if (PMIX_CHECK_KEY(info, PMIX_DISPLAY_MAP)) { if 
(PMIX_INFO_TRUE(info)) { - prte_set_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); + prte_set_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_MAP, + PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); } /*** PPR (PROCS-PER-RESOURCE) ***/ @@ -414,8 +414,8 @@ static void interim(int sd, short args, void *cbdata) /*** CPUS/RANK ***/ } else if (PMIX_CHECK_KEY(info, PMIX_CPUS_PER_PROC)) { u16 = info->value.data.uint32; - prte_set_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, PRTE_ATTR_GLOBAL, &u16, - PMIX_UINT16); + prte_set_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, + PRTE_ATTR_GLOBAL, &u16, PMIX_UINT16); /*** NO USE LOCAL ***/ } else if (PMIX_CHECK_KEY(info, PMIX_NO_PROCS_ON_HEAD)) { diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index 7c51bfdd27..d5abf39d71 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -438,18 +438,14 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) PMIX_INFO_LIST_ADD(ret, pmap, PMIX_RANK, &pptr->name.rank, PMIX_PROC_RANK); /* location, for local procs */ - tmp = NULL; - if (prte_get_attribute(&pptr->attributes, PRTE_PROC_CPU_BITMAP, - (void **) &tmp, PMIX_STRING) - && NULL != tmp) { + if (NULL != pptr->cpuset) { /* provide the cpuset string for this proc */ - PMIX_INFO_LIST_ADD(ret, pmap, PMIX_CPUSET, tmp, PMIX_STRING); + PMIX_INFO_LIST_ADD(ret, pmap, PMIX_CPUSET, pptr->cpuset, PMIX_STRING); /* let PMIx generate the locality string */ PMIX_CPUSET_CONSTRUCT(&cpuset); cpuset.source = "hwloc"; cpuset.bitmap = hwloc_bitmap_alloc(); - hwloc_bitmap_list_sscanf(cpuset.bitmap, tmp); - free(tmp); + hwloc_bitmap_list_sscanf(cpuset.bitmap, pptr->cpuset); ret = PMIx_server_generate_locality_string(&cpuset, &tmp); if (PMIX_SUCCESS != ret) { PMIX_ERROR_LOG(ret); diff --git a/src/runtime/data_type_support/prte_dt_packing_fns.c b/src/runtime/data_type_support/prte_dt_packing_fns.c index 
77bc833b03..8ea487a8f1 100644 --- a/src/runtime/data_type_support/prte_dt_packing_fns.c +++ b/src/runtime/data_type_support/prte_dt_packing_fns.c @@ -174,18 +174,14 @@ int prte_job_pack(pmix_data_buffer_t *bkt, prte_job_t *job) } if (0 < job->num_procs) { - /* check attributes to see if this job is to be fully - * described in the launch msg */ - if (prte_get_attribute(&job->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - for (j = 0; j < job->procs->size; j++) { - if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(job->procs, j))) { - continue; - } - rc = prte_proc_pack(bkt, proc); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - return prte_pmix_convert_status(rc); - } + for (j = 0; j < job->procs->size; j++) { + if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(job->procs, j))) { + continue; + } + rc = prte_proc_pack(bkt, proc); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); } } } @@ -395,6 +391,13 @@ int prte_proc_pack(pmix_data_buffer_t *bkt, prte_proc_t *proc) return prte_pmix_convert_status(rc); } + /* pack the cpuset */ + rc = PMIx_Data_pack(NULL, bkt, (void *) &proc->cpuset, 1, PMIX_STRING); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + return prte_pmix_convert_status(rc); + } + /* pack the attributes that will go */ count = 0; PMIX_LIST_FOREACH(kv, &proc->attributes, prte_attribute_t) diff --git a/src/runtime/data_type_support/prte_dt_print_fns.c b/src/runtime/data_type_support/prte_dt_print_fns.c index fc6d3080b3..0a69728944 100644 --- a/src/runtime/data_type_support/prte_dt_print_fns.c +++ b/src/runtime/data_type_support/prte_dt_print_fns.c @@ -253,7 +253,7 @@ void prte_proc_print(char **output, prte_job_t *jdata, prte_proc_t *src) hwloc_obj_t loc = NULL; char *locale, *tmp2; hwloc_cpuset_t mycpus; - char *str, *cpu_bitmap = NULL; + char *str; bool use_hwthread_cpus; /* set default result */ @@ -280,12 +280,10 @@ void prte_proc_print(char **output, prte_job_t 
*jdata, prte_proc_t *src) } if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_DEVEL_MAP, NULL, PMIX_BOOL)) { - if (prte_get_attribute(&src->attributes, PRTE_PROC_CPU_BITMAP, (void **) &cpu_bitmap, - PMIX_STRING) - && NULL != cpu_bitmap && NULL != src->node->topology + if (NULL != src->cpuset && NULL != src->node->topology && NULL != src->node->topology->topo) { mycpus = hwloc_bitmap_alloc(); - hwloc_bitmap_list_sscanf(mycpus, cpu_bitmap); + hwloc_bitmap_list_sscanf(mycpus, src->cpuset); if (NULL == (str = prte_hwloc_base_cset2str(mycpus, use_hwthread_cpus, src->node->topology->topo))) { @@ -296,7 +294,6 @@ void prte_proc_print(char **output, prte_job_t *jdata, prte_proc_t *src) PRTE_JOBID_PRINT(src->name.nspace), (long) src->app_idx, PRTE_VPID_PRINT(src->name.rank), str); free(str); - free(cpu_bitmap); } else { /* just print a very simple output for users */ pmix_asprintf(&tmp, "\n%sProcess jobid: %s App: %ld Process rank: %s Bound: N/A", pfx2, @@ -317,22 +314,15 @@ void prte_proc_print(char **output, prte_job_t *jdata, prte_proc_t *src) free(tmp); tmp = tmp3; - if (prte_get_attribute(&src->attributes, PRTE_PROC_HWLOC_LOCALE, (void **) &loc, - PMIX_POINTER)) { - if (NULL != loc) { - locale = prte_hwloc_base_cset2str(loc->cpuset, use_hwthread_cpus, - src->node->topology->topo); - } else { - locale = strdup("UNKNOWN"); - } + if (NULL != src->obj) { + locale = prte_hwloc_base_cset2str(src->obj->cpuset, use_hwthread_cpus, + src->node->topology->topo); } else { locale = strdup("UNKNOWN"); } - if (prte_get_attribute(&src->attributes, PRTE_PROC_CPU_BITMAP, (void **) &cpu_bitmap, - PMIX_STRING) - && NULL != src->node->topology && NULL != src->node->topology->topo) { + if (NULL != src->cpuset) { mycpus = hwloc_bitmap_alloc(); - hwloc_bitmap_list_sscanf(mycpus, cpu_bitmap); + hwloc_bitmap_list_sscanf(mycpus, src->cpuset); tmp2 = prte_hwloc_base_cset2str(mycpus, use_hwthread_cpus, src->node->topology->topo); hwloc_bitmap_free(mycpus); } else { @@ -345,9 +335,6 @@ 
void prte_proc_print(char **output, prte_job_t *jdata, prte_proc_t *src) free(locale); free(tmp); free(tmp2); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } /* set the return */ *output = tmp4; diff --git a/src/runtime/data_type_support/prte_dt_unpacking_fns.c b/src/runtime/data_type_support/prte_dt_unpacking_fns.c index 47010c8254..3f8621a4f1 100644 --- a/src/runtime/data_type_support/prte_dt_unpacking_fns.c +++ b/src/runtime/data_type_support/prte_dt_unpacking_fns.c @@ -194,20 +194,16 @@ int prte_job_unpack(pmix_data_buffer_t *bkt, prte_job_t **job) } if (0 < jptr->num_procs) { - /* check attributes to see if this job was fully - * described in the launch msg */ - if (prte_get_attribute(&jptr->attributes, PRTE_JOB_FULLY_DESCRIBED, NULL, PMIX_BOOL)) { - prte_proc_t *proc; - for (j = 0; j < jptr->num_procs; j++) { - n = 1; - rc = prte_proc_unpack(bkt, &proc); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_RELEASE(jptr); - return prte_pmix_convert_status(rc); - } - pmix_pointer_array_add(jptr->procs, proc); + prte_proc_t *proc; + for (j = 0; j < jptr->num_procs; j++) { + n = 1; + rc = prte_proc_unpack(bkt, &proc); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(jptr); + return prte_pmix_convert_status(rc); } + pmix_pointer_array_add(jptr->procs, proc); } } @@ -455,6 +451,15 @@ int prte_proc_unpack(pmix_data_buffer_t *bkt, prte_proc_t **pc) return prte_pmix_convert_status(rc); } + /* unpack the cpuset */ + n = 1; + rc = PMIx_Data_unpack(NULL, bkt, &proc->cpuset, &n, PMIX_STRING); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(proc); + return prte_pmix_convert_status(rc); + } + /* unpack the attributes */ rc = PMIx_Data_unpack(NULL, bkt, &count, &n, PMIX_INT32); if (PMIX_SUCCESS != rc) { diff --git a/src/runtime/prte_globals.c b/src/runtime/prte_globals.c index ead4cd1def..4633e26238 100644 --- a/src/runtime/prte_globals.c +++ b/src/runtime/prte_globals.c @@ -475,7 +475,6 @@ static void prte_job_construct(prte_job_t 
*job) PRTE_GLOBAL_ARRAY_BLOCK_SIZE); job->map = NULL; job->bookmark = NULL; - job->bkmark_obj = UINT_MAX; // mark that we haven't assigned a bkmark yet job->state = PRTE_JOB_STATE_UNDEF; job->num_mapped = 0; @@ -599,6 +598,7 @@ static void prte_node_construct(prte_node_t *node) node->rawname = NULL; node->aliases = NULL; node->daemon = NULL; + node->available = NULL; node->num_procs = 0; node->procs = PMIX_NEW(pmix_pointer_array_t); @@ -639,7 +639,9 @@ static void prte_node_destruct(prte_node_t *node) PMIX_RELEASE(node->daemon); node->daemon = NULL; } - + if (NULL != node->available) { + hwloc_bitmap_free(node->available); + } for (i = 0; i < node->procs->size; i++) { if (NULL != (proc = (prte_proc_t *) pmix_pointer_array_get_item(node->procs, i))) { pmix_pointer_array_set_item(node->procs, i, NULL); @@ -671,6 +673,8 @@ static void prte_proc_construct(prte_proc_t *proc) proc->state = PRTE_PROC_STATE_UNDEF; proc->app_idx = 0; proc->node = NULL; + proc->obj = NULL; + proc->cpuset = NULL; proc->exit_code = 0; /* Assume we won't fail unless otherwise notified */ proc->rml_uri = NULL; proc->flags = 0; @@ -683,7 +687,10 @@ static void prte_proc_destruct(prte_proc_t *proc) PMIX_RELEASE(proc->node); proc->node = NULL; } - + if (NULL != proc->cpuset) { + free(proc->cpuset); + proc->cpuset = NULL; + } if (NULL != proc->rml_uri) { free(proc->rml_uri); proc->rml_uri = NULL; @@ -751,7 +758,7 @@ static void tcon(prte_topology_t *t) static void tdes(prte_topology_t *t) { if (NULL != t->topo) { - prte_hwloc_base_free_topology(t->topo); + hwloc_topology_destroy(t->topo); } if (NULL != t->sig) { free(t->sig); diff --git a/src/runtime/prte_globals.h b/src/runtime/prte_globals.h index 5f5d3996a6..491c200c6c 100644 --- a/src/runtime/prte_globals.h +++ b/src/runtime/prte_globals.h @@ -268,6 +268,8 @@ typedef struct { char **aliases; /* daemon on this node */ struct prte_proc_t *daemon; + /* track the unassigned cpus */ + hwloc_cpuset_t available; /** number of procs on this node */ 
prte_node_rank_t num_procs; /* array of pointers to procs on this node */ @@ -342,9 +344,6 @@ typedef struct { * indicates the node where we stopped */ prte_node_t *bookmark; - /* if we are binding, bookmark the index of the - * last object we bound to */ - unsigned int bkmark_obj; /* state of the overall job */ prte_job_state_t state; /* number of procs mapped */ @@ -425,6 +424,11 @@ struct prte_proc_t { prte_app_idx_t app_idx; /* pointer to the node where this proc is executing */ prte_node_t *node; + /* pointer to the object on that node where the + * proc is mapped */ + hwloc_obj_t obj; + /* cpuset where the proc is bound */ + char *cpuset; /* RML contact info */ char *rml_uri; /* some boolean flags */ diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index 9d7684758d..0f029570e2 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -603,14 +603,6 @@ int main(int argc, char *argv[]) goto DONE; } - opt = pmix_cmd_line_get_param(&results, PRTE_CLI_MAPBY); - if (NULL != opt) { - if (NULL != strcasestr(opt->values[0], PRTE_CLI_NOLAUNCH)) { - prte_set_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, - PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); - } - } - /* Did the user specify a prefix, or want prefix by default? 
*/ opt = pmix_cmd_line_get_param(&results, PRTE_CLI_PREFIX); if (NULL != opt || want_prefix_by_default) { diff --git a/src/tools/prun/prun.c b/src/tools/prun/prun.c index a89e7f387f..d4365e7d7b 100644 --- a/src/tools/prun/prun.c +++ b/src/tools/prun/prun.c @@ -771,9 +771,6 @@ int prun(int argc, char *argv[]) opt = pmix_cmd_line_get_param(&results, PRTE_CLI_MAPBY); if (NULL != opt) { PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING); - if (NULL != strcasestr(opt->values[0], "DONOTLAUNCH")) { - PMIX_INFO_LIST_ADD(ret, jinfo, "PRTE_JOB_DO_NOT_LAUNCH", NULL, PMIX_BOOL); - } } /* if the user specified a ranking policy, then set it */ diff --git a/src/util/attr.c b/src/util/attr.c index 831923d0ea..44c532d7fc 100644 --- a/src/util/attr.c +++ b/src/util/attr.c @@ -387,8 +387,6 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key) return "PRTE_JOB_TRANSPORT_KEY"; case PRTE_JOB_INFO_CACHE: return "PRTE_JOB_INFO_CACHE"; - case PRTE_JOB_FULLY_DESCRIBED: - return "PRTE_JOB_FULLY_DESCRIBED"; case PRTE_JOB_SILENT_TERMINATION: return "PRTE_JOB_SILENT_TERMINATION"; case PRTE_JOB_SET_ENVAR: @@ -480,12 +478,6 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key) case PRTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; - case PRTE_PROC_CPU_BITMAP: - return "PROC-CPU-BITMAP"; - case PRTE_PROC_HWLOC_LOCALE: - return "PROC-HWLOC-LOCALE"; - case PRTE_PROC_HWLOC_BOUND: - return "PROC-HWLOC-BOUND"; case PRTE_PROC_PRIOR_NODE: return "PROC-PRIOR-NODE"; case PRTE_PROC_NRESTARTS: diff --git a/src/util/attr.h b/src/util/attr.h index c6e13d7bb9..01e5d644a1 100644 --- a/src/util/attr.h +++ b/src/util/attr.h @@ -106,104 +106,105 @@ typedef uint16_t prte_job_flags_t; /*** JOB ATTRIBUTE KEYS ***/ #define PRTE_JOB_START_KEY PRTE_NODE_MAX_KEY -#define PRTE_JOB_LAUNCH_MSG_SENT (PRTE_JOB_START_KEY + 1) // timeval - time launch message was sent -#define PRTE_JOB_LAUNCH_MSG_RECVD (PRTE_JOB_START_KEY + 2) // timeval - time launch message was recvd -#define 
PRTE_JOB_MAX_LAUNCH_MSG_RECVD (PRTE_JOB_START_KEY + 3) // timeval - max time for launch msg to be received -#define PRTE_JOB_CKPT_STATE (PRTE_JOB_START_KEY + 5) // size_t - ckpt state -#define PRTE_JOB_SNAPSHOT_REF (PRTE_JOB_START_KEY + 6) // string - snapshot reference -#define PRTE_JOB_SNAPSHOT_LOC (PRTE_JOB_START_KEY + 7) // string - snapshot location -#define PRTE_JOB_SNAPC_INIT_BAR (PRTE_JOB_START_KEY + 8) // prte_grpcomm_coll_id_t - collective id -#define PRTE_JOB_SNAPC_FINI_BAR (PRTE_JOB_START_KEY + 9) // prte_grpcomm_coll_id_t - collective id -#define PRTE_JOB_NUM_NONZERO_EXIT (PRTE_JOB_START_KEY + 10) // int32 - number of procs with non-zero exit codes -#define PRTE_SPAWN_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 11) // prte_ptr (prte_timer_t*) - timer event for failure detect/response - // if fails to launch -#define PRTE_JOB_ABORTED_PROC (PRTE_JOB_START_KEY + 12) // prte_ptr (prte_proc_t*) - proc that caused abort to happen -#define PRTE_JOB_MAPPER (PRTE_JOB_START_KEY + 13) // bool - job consists of MapReduce mappers -#define PRTE_JOB_REDUCER (PRTE_JOB_START_KEY + 14) // bool - job consists of MapReduce reducers -#define PRTE_JOB_COMBINER (PRTE_JOB_START_KEY + 15) // bool - job consists of MapReduce combiners -#define PRTE_JOB_INDEX_ARGV (PRTE_JOB_START_KEY + 16) // bool - automatically index argvs -#define PRTE_JOB_NO_VM (PRTE_JOB_START_KEY + 17) // bool - do not use VM launch -#define PRTE_JOB_SPIN_FOR_DEBUG (PRTE_JOB_START_KEY + 18) // bool - job consists of continuously operating apps -#define PRTE_JOB_CONTINUOUS_OP (PRTE_JOB_START_KEY + 19) // bool - recovery policy defined for job -#define PRTE_JOB_RECOVER_DEFINED (PRTE_JOB_START_KEY + 20) // bool - recovery policy has been defined -#define PRTE_JOB_NON_PRTE_JOB (PRTE_JOB_START_KEY + 22) // bool - non-prte job -#define PRTE_JOB_STDOUT_TARGET (PRTE_JOB_START_KEY + 23) // pmix_nspace_t - job that is to receive the stdout (on its stdin) from this one -#define PRTE_JOB_POWER (PRTE_JOB_START_KEY + 24) // 
string - power setting for nodes in job -#define PRTE_JOB_MAX_FREQ (PRTE_JOB_START_KEY + 25) // string - max freq setting for nodes in job -#define PRTE_JOB_MIN_FREQ (PRTE_JOB_START_KEY + 26) // string - min freq setting for nodes in job -#define PRTE_JOB_GOVERNOR (PRTE_JOB_START_KEY + 27) // string - governor used for nodes in job -#define PRTE_JOB_FAIL_NOTIFIED (PRTE_JOB_START_KEY + 28) // bool - abnormal term of proc within job has been reported -#define PRTE_JOB_TERM_NOTIFIED (PRTE_JOB_START_KEY + 29) // bool - normal term of job has been reported -#define PRTE_JOB_PEER_MODX_ID (PRTE_JOB_START_KEY + 30) // prte_grpcomm_coll_id_t - collective id -#define PRTE_JOB_INIT_BAR_ID (PRTE_JOB_START_KEY + 31) // prte_grpcomm_coll_id_t - collective id -#define PRTE_JOB_FINI_BAR_ID (PRTE_JOB_START_KEY + 32) // prte_grpcomm_coll_id_t - collective id -#define PRTE_JOB_FWDIO_TO_TOOL (PRTE_JOB_START_KEY + 33) // Forward IO for this job to the tool requesting its spawn -#define PRTE_JOB_LAUNCHED_DAEMONS (PRTE_JOB_START_KEY + 35) // bool - Job caused new daemons to be spawned -#define PRTE_JOB_REPORT_BINDINGS (PRTE_JOB_START_KEY + 36) // bool - Report process bindings -#define PRTE_JOB_CPUSET (PRTE_JOB_START_KEY + 37) // string - "soft" cgroup envelope for the job -#define PRTE_JOB_NOTIFICATIONS (PRTE_JOB_START_KEY + 38) // string - comma-separated list of desired notifications+methods -#define PRTE_JOB_ROOM_NUM (PRTE_JOB_START_KEY + 39) // int - number of remote request's hotel room -#define PRTE_JOB_LAUNCH_PROXY (PRTE_JOB_START_KEY + 40) // pmix_proc_t - name of spawn requestor -#define PRTE_JOB_NSPACE_REGISTERED (PRTE_JOB_START_KEY + 41) // bool - job has been registered with embedded PMIx server -#define PRTE_JOB_FIXED_DVM (PRTE_JOB_START_KEY + 42) // bool - do not change the size of the DVM for this job -#define PRTE_JOB_DVM_JOB (PRTE_JOB_START_KEY + 43) // bool - job is using a DVM -#define PRTE_JOB_CANCELLED (PRTE_JOB_START_KEY + 44) // bool - job was cancelled -#define 
PRTE_JOB_OUTPUT_TO_FILE (PRTE_JOB_START_KEY + 45) // string - path to use as basename of files to which stdout/err is to be directed -#define PRTE_JOB_MERGE_STDERR_STDOUT (PRTE_JOB_START_KEY + 46) // bool - merge stderr into stdout stream -#define PRTE_JOB_TAG_OUTPUT (PRTE_JOB_START_KEY + 47) // bool - tag stdout/stderr -#define PRTE_JOB_TIMESTAMP_OUTPUT (PRTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr -#define PRTE_JOB_MULTI_DAEMON_SIM (PRTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster -#define PRTE_JOB_NOTIFY_COMPLETION (PRTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates -#define PRTE_JOB_TRANSPORT_KEY (PRTE_JOB_START_KEY + 51) // string - transport keys assigned to this job -#define PRTE_JOB_INFO_CACHE (PRTE_JOB_START_KEY + 52) // pmix_list_t - list of prte_value_t to be included in job_info -#define PRTE_JOB_FULLY_DESCRIBED (PRTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg -#define PRTE_JOB_SILENT_TERMINATION (PRTE_JOB_START_KEY + 54) // bool - do not generate an event notification when job - // normally terminates -#define PRTE_JOB_SET_ENVAR (PRTE_JOB_START_KEY + 55) // prte_envar_t - set the given envar to the specified value -#define PRTE_JOB_UNSET_ENVAR (PRTE_JOB_START_KEY + 56) // string - name of envar to unset, if present -#define PRTE_JOB_PREPEND_ENVAR (PRTE_JOB_START_KEY + 57) // prte_envar_t - prepend the specified value to the given envar -#define PRTE_JOB_APPEND_ENVAR (PRTE_JOB_START_KEY + 58) // prte_envar_t - append the specified value to the given envar -#define PRTE_JOB_ADD_ENVAR (PRTE_JOB_START_KEY + 59) // prte_envar_t - add envar, do not override pre-existing one -#define PRTE_JOB_APP_SETUP_DATA (PRTE_JOB_START_KEY + 60) // pmix_byte_object_t - blob containing app setup data -#define PRTE_JOB_OUTPUT_TO_DIRECTORY (PRTE_JOB_START_KEY + 61) // string - path of directory to which stdout/err is to be directed -#define PRTE_JOB_STOP_ON_EXEC 
(PRTE_JOB_START_KEY + 62) // pmix_rank_t of procs to stop on first instruction for debugger attach -#define PRTE_JOB_SPAWN_NOTIFIED (PRTE_JOB_START_KEY + 63) // bool - process requesting a spawn operation has been notified of result -#define PRTE_JOB_DISPLAY_MAP (PRTE_JOB_START_KEY + 64) // bool - display job map -#define PRTE_JOB_DISPLAY_DEVEL_MAP (PRTE_JOB_START_KEY + 65) // bool - display devel level job map -#define PRTE_JOB_DISPLAY_TOPO (PRTE_JOB_START_KEY + 66) // bool - display topology with job map +#define PRTE_JOB_LAUNCH_MSG_SENT (PRTE_JOB_START_KEY + 1) // timeval - time launch message was sent +#define PRTE_JOB_LAUNCH_MSG_RECVD (PRTE_JOB_START_KEY + 2) // timeval - time launch message was recvd +#define PRTE_JOB_MAX_LAUNCH_MSG_RECVD (PRTE_JOB_START_KEY + 3) // timeval - max time for launch msg to be received +#define PRTE_JOB_CKPT_STATE (PRTE_JOB_START_KEY + 5) // size_t - ckpt state +#define PRTE_JOB_SNAPSHOT_REF (PRTE_JOB_START_KEY + 6) // string - snapshot reference +#define PRTE_JOB_SNAPSHOT_LOC (PRTE_JOB_START_KEY + 7) // string - snapshot location +#define PRTE_JOB_SNAPC_INIT_BAR (PRTE_JOB_START_KEY + 8) // prte_grpcomm_coll_id_t - collective id +#define PRTE_JOB_SNAPC_FINI_BAR (PRTE_JOB_START_KEY + 9) // prte_grpcomm_coll_id_t - collective id +#define PRTE_JOB_NUM_NONZERO_EXIT (PRTE_JOB_START_KEY + 10) // int32 - number of procs with non-zero exit codes +#define PRTE_SPAWN_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 11) // prte_ptr (prte_timer_t*) - timer event for failure detect/response + // if fails to launch +#define PRTE_JOB_ABORTED_PROC (PRTE_JOB_START_KEY + 12) // prte_ptr (prte_proc_t*) - proc that caused abort to happen +#define PRTE_JOB_MAPPER (PRTE_JOB_START_KEY + 13) // bool - job consists of MapReduce mappers +#define PRTE_JOB_REDUCER (PRTE_JOB_START_KEY + 14) // bool - job consists of MapReduce reducers +#define PRTE_JOB_COMBINER (PRTE_JOB_START_KEY + 15) // bool - job consists of MapReduce combiners +#define PRTE_JOB_INDEX_ARGV 
(PRTE_JOB_START_KEY + 16) // bool - automatically index argvs +#define PRTE_JOB_NO_VM (PRTE_JOB_START_KEY + 17) // bool - do not use VM launch +#define PRTE_JOB_SPIN_FOR_DEBUG (PRTE_JOB_START_KEY + 18) // bool - job consists of continuously operating apps +#define PRTE_JOB_CONTINUOUS_OP (PRTE_JOB_START_KEY + 19) // bool - recovery policy defined for job +#define PRTE_JOB_RECOVER_DEFINED (PRTE_JOB_START_KEY + 20) // bool - recovery policy has been defined +#define PRTE_JOB_NON_PRTE_JOB (PRTE_JOB_START_KEY + 22) // bool - non-prte job +#define PRTE_JOB_STDOUT_TARGET (PRTE_JOB_START_KEY + 23) // pmix_nspace_t - job that is to receive the stdout (on its + // stdin) from this one +#define PRTE_JOB_POWER (PRTE_JOB_START_KEY + 24) // string - power setting for nodes in job +#define PRTE_JOB_MAX_FREQ (PRTE_JOB_START_KEY + 25) // string - max freq setting for nodes in job +#define PRTE_JOB_MIN_FREQ (PRTE_JOB_START_KEY + 26) // string - min freq setting for nodes in job +#define PRTE_JOB_GOVERNOR (PRTE_JOB_START_KEY + 27) // string - governor used for nodes in job +#define PRTE_JOB_FAIL_NOTIFIED (PRTE_JOB_START_KEY + 28) // bool - abnormal term of proc within job has been reported +#define PRTE_JOB_TERM_NOTIFIED (PRTE_JOB_START_KEY + 29) // bool - normal term of job has been reported +#define PRTE_JOB_PEER_MODX_ID (PRTE_JOB_START_KEY + 30) // prte_grpcomm_coll_id_t - collective id +#define PRTE_JOB_INIT_BAR_ID (PRTE_JOB_START_KEY + 31) // prte_grpcomm_coll_id_t - collective id +#define PRTE_JOB_FINI_BAR_ID (PRTE_JOB_START_KEY + 32) // prte_grpcomm_coll_id_t - collective id +#define PRTE_JOB_FWDIO_TO_TOOL (PRTE_JOB_START_KEY + 33) // Forward IO for this job to the tool requesting its spawn +#define PRTE_JOB_LAUNCHED_DAEMONS (PRTE_JOB_START_KEY + 35) // bool - Job caused new daemons to be spawned +#define PRTE_JOB_REPORT_BINDINGS (PRTE_JOB_START_KEY + 36) // bool - Report process bindings +#define PRTE_JOB_CPUSET (PRTE_JOB_START_KEY + 37) // string - "soft" cgroup envelope for 
the job +#define PRTE_JOB_NOTIFICATIONS (PRTE_JOB_START_KEY + 38) // string - comma-separated list of desired notifications+methods +#define PRTE_JOB_ROOM_NUM (PRTE_JOB_START_KEY + 39) // int - number of remote request's hotel room +#define PRTE_JOB_LAUNCH_PROXY (PRTE_JOB_START_KEY + 40) // pmix_proc_t - name of spawn requestor +#define PRTE_JOB_NSPACE_REGISTERED (PRTE_JOB_START_KEY + 41) // bool - job has been registered with embedded PMIx server +#define PRTE_JOB_FIXED_DVM (PRTE_JOB_START_KEY + 42) // bool - do not change the size of the DVM for this job +#define PRTE_JOB_DVM_JOB (PRTE_JOB_START_KEY + 43) // bool - job is using a DVM +#define PRTE_JOB_CANCELLED (PRTE_JOB_START_KEY + 44) // bool - job was cancelled +#define PRTE_JOB_OUTPUT_TO_FILE (PRTE_JOB_START_KEY + 45) // string - path to use as basename of files to which + // stdout/err is to be directed +#define PRTE_JOB_MERGE_STDERR_STDOUT (PRTE_JOB_START_KEY + 46) // bool - merge stderr into stdout stream +#define PRTE_JOB_TAG_OUTPUT (PRTE_JOB_START_KEY + 47) // bool - tag stdout/stderr +#define PRTE_JOB_TIMESTAMP_OUTPUT (PRTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr +#define PRTE_JOB_MULTI_DAEMON_SIM (PRTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster +#define PRTE_JOB_NOTIFY_COMPLETION (PRTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates +#define PRTE_JOB_TRANSPORT_KEY (PRTE_JOB_START_KEY + 51) // string - transport keys assigned to this job +#define PRTE_JOB_INFO_CACHE (PRTE_JOB_START_KEY + 52) // pmix_list_t - list of prte_value_t to be included in job_info +#define PRTE_JOB_SILENT_TERMINATION (PRTE_JOB_START_KEY + 54) // bool - do not generate an event notification when job + // normally terminates +#define PRTE_JOB_SET_ENVAR (PRTE_JOB_START_KEY + 55) // prte_envar_t - set the given envar to the specified value +#define PRTE_JOB_UNSET_ENVAR (PRTE_JOB_START_KEY + 56) // string - name of envar to unset, if present +#define 
PRTE_JOB_PREPEND_ENVAR (PRTE_JOB_START_KEY + 57) // prte_envar_t - prepend the specified value to the given envar +#define PRTE_JOB_APPEND_ENVAR (PRTE_JOB_START_KEY + 58) // prte_envar_t - append the specified value to the given envar +#define PRTE_JOB_ADD_ENVAR (PRTE_JOB_START_KEY + 59) // prte_envar_t - add envar, do not override pre-existing one +#define PRTE_JOB_APP_SETUP_DATA (PRTE_JOB_START_KEY + 60) // pmix_byte_object_t - blob containing app setup data +#define PRTE_JOB_OUTPUT_TO_DIRECTORY (PRTE_JOB_START_KEY + 61) // string - path of directory to which stdout/err is to be directed +#define PRTE_JOB_STOP_ON_EXEC (PRTE_JOB_START_KEY + 62) // pmix_rank_t of procs to stop on first instruction for debugger attach +#define PRTE_JOB_SPAWN_NOTIFIED (PRTE_JOB_START_KEY + 63) // bool - process requesting a spawn operation has been notified of result +#define PRTE_JOB_DISPLAY_MAP (PRTE_JOB_START_KEY + 64) // bool - display job map +#define PRTE_JOB_DISPLAY_DEVEL_MAP (PRTE_JOB_START_KEY + 65) // bool - display devel level job map +#define PRTE_JOB_DISPLAY_TOPO (PRTE_JOB_START_KEY + 66) // bool - display topology with job map // 67 was removed option diffable map -#define PRTE_JOB_DISPLAY_ALLOC (PRTE_JOB_START_KEY + 68) // bool - display allocation -#define PRTE_JOB_DO_NOT_LAUNCH (PRTE_JOB_START_KEY + 69) // bool - do not launch job -#define PRTE_JOB_XML_OUTPUT (PRTE_JOB_START_KEY + 70) // bool - print in xml format -#define PRTE_JOB_TIMEOUT (PRTE_JOB_START_KEY + 71) // int32 - number of seconds job can run before terminating it as timed out -#define PRTE_JOB_STACKTRACES (PRTE_JOB_START_KEY + 72) // bool - include process stack traces in timeout report -#define PRTE_JOB_REPORT_STATE (PRTE_JOB_START_KEY + 73) // bool - include process state in timeout report -#define PRTE_JOB_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 74) // prte_ptr (prte_timer_t*) - timer event for job timeout -#define PRTE_JOB_TRACE_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 75) // prte_ptr (prte_timer_t*) - timer 
event for stacktrace collection -#define PRTE_JOB_INHERIT (PRTE_JOB_START_KEY + 76) // bool - job inherits parent's mapping/ranking/binding policies -#define PRTE_JOB_PES_PER_PROC (PRTE_JOB_START_KEY + 77) // uint16_t - number of cpus to be assigned to each process -#define PRTE_JOB_DIST_DEVICE (PRTE_JOB_START_KEY + 78) // char* - device to use for dist mapping -#define PRTE_JOB_HWT_CPUS (PRTE_JOB_START_KEY + 79) // bool - job requests hwthread cpus -#define PRTE_JOB_CORE_CPUS (PRTE_JOB_START_KEY + 80) // bool - job requests core cpus -#define PRTE_JOB_PPR (PRTE_JOB_START_KEY + 81) // char* - string specifying the procs-per-resource pattern -#define PRTE_JOB_NOINHERIT (PRTE_JOB_START_KEY + 82) // bool do NOT inherit parent's mapping/ranking/binding policies -#define PRTE_JOB_FILE (PRTE_JOB_START_KEY + 83) // char* - file to use for sequential or rankfile mapping -#define PRTE_JOB_DO_NOT_RESOLVE (PRTE_JOB_START_KEY + 84) // bool - do not resolve nodes -#define PRTE_JOB_DEBUG_TARGET (PRTE_JOB_START_KEY + 85) // pmix_proc_t - application proc to co-locate daemons with -#define PRTE_JOB_DEBUG_DAEMONS_PER_NODE (PRTE_JOB_START_KEY + 86) // uint16_t - Number of debug daemons per node -#define PRTE_JOB_DEBUG_DAEMONS_PER_PROC (PRTE_JOB_START_KEY + 87) // uint16_t - Number of debug daemons per application proc -#define PRTE_JOB_STOP_IN_INIT (PRTE_JOB_START_KEY + 88) // pmix_rank_t of procs to stop -#define PRTE_JOB_STOP_IN_APP (PRTE_JOB_START_KEY + 89) // pmix_rank_t of procs to stop -#define PRTE_JOB_ENVARS_HARVESTED (PRTE_JOB_START_KEY + 90) // envars have already been harvested -#define PRTE_JOB_OUTPUT_NOCOPY (PRTE_JOB_START_KEY + 91) // bool - do not copy output to stdout/err -#define PRTE_JOB_RANK_OUTPUT (PRTE_JOB_START_KEY + 92) // bool - tag stdout/stderr with rank -#define PRTE_SPAWN_TIMEOUT (PRTE_JOB_START_KEY + 93) // int32 - number of seconds to spawn before terminating it as timed out -#define PRTE_JOB_RAW_OUTPUT (PRTE_JOB_START_KEY + 94) // bool - do not buffer 
output -#define PRTE_JOB_EXEC_AGENT (PRTE_JOB_START_KEY + 95) // char* - string specifying the cmd to use when exec'ing the local proc -#define PRTE_JOB_NOAGG_HELP (PRTE_JOB_START_KEY + 96) // bool - do not aggregate show_help messages -#define PRTE_JOB_COLOCATE_PROCS (PRTE_JOB_START_KEY + 97) // pmix_data_array_t - colocate this job's procs with the given ones -#define PRTE_JOB_COLOCATE_NPERPROC (PRTE_JOB_START_KEY + 98) // uint16_t - number of procs to colocate at each proc -#define PRTE_JOB_COLOCATE_NPERNODE (PRTE_JOB_START_KEY + 99) // uint16_t - number of procs to colocate on node of each proc +#define PRTE_JOB_DISPLAY_ALLOC (PRTE_JOB_START_KEY + 68) // bool - display allocation +#define PRTE_JOB_DO_NOT_LAUNCH (PRTE_JOB_START_KEY + 69) // bool - do not launch job +#define PRTE_JOB_XML_OUTPUT (PRTE_JOB_START_KEY + 70) // bool - print in xml format +#define PRTE_JOB_TIMEOUT (PRTE_JOB_START_KEY + 71) // int32 - number of seconds job can run before terminating it as timed out +#define PRTE_JOB_STACKTRACES (PRTE_JOB_START_KEY + 72) // bool - include process stack traces in timeout report +#define PRTE_JOB_REPORT_STATE (PRTE_JOB_START_KEY + 73) // bool - include process state in timeout report +#define PRTE_JOB_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 74) // prte_ptr (prte_timer_t*) - timer event for job timeout +#define PRTE_JOB_TRACE_TIMEOUT_EVENT (PRTE_JOB_START_KEY + 75) // prte_ptr (prte_timer_t*) - timer event for stacktrace collection +#define PRTE_JOB_INHERIT (PRTE_JOB_START_KEY + 76) // bool - job inherits parent's mapping/ranking/binding policies +#define PRTE_JOB_PES_PER_PROC (PRTE_JOB_START_KEY + 77) // uint16_t - number of cpus to be assigned to each process +#define PRTE_JOB_DIST_DEVICE (PRTE_JOB_START_KEY + 78) // char* - device to use for dist mapping +#define PRTE_JOB_HWT_CPUS (PRTE_JOB_START_KEY + 79) // bool - job requests hwthread cpus +#define PRTE_JOB_CORE_CPUS (PRTE_JOB_START_KEY + 80) // bool - job requests core cpus +#define PRTE_JOB_PPR 
(PRTE_JOB_START_KEY + 81) // char* - string specifying the procs-per-resource pattern +#define PRTE_JOB_NOINHERIT (PRTE_JOB_START_KEY + 82) // bool do NOT inherit parent's mapping/ranking/binding policies +#define PRTE_JOB_FILE (PRTE_JOB_START_KEY + 83) // char* - file to use for sequential or rankfile mapping +#define PRTE_JOB_DO_NOT_RESOLVE (PRTE_JOB_START_KEY + 84) // bool - do not resolve nodes +#define PRTE_JOB_DEBUG_TARGET (PRTE_JOB_START_KEY + 85) // pmix_proc_t - application proc to co-locate daemons with +#define PRTE_JOB_DEBUG_DAEMONS_PER_NODE (PRTE_JOB_START_KEY + 86) // uint16_t - Number of debug daemons per node +#define PRTE_JOB_DEBUG_DAEMONS_PER_PROC (PRTE_JOB_START_KEY + 87) // uint16_t - Number of debug daemons per application proc +#define PRTE_JOB_STOP_IN_INIT (PRTE_JOB_START_KEY + 88) // pmix_rank_t of procs to stop +#define PRTE_JOB_STOP_IN_APP (PRTE_JOB_START_KEY + 89) // pmix_rank_t of procs to stop +#define PRTE_JOB_ENVARS_HARVESTED (PRTE_JOB_START_KEY + 90) // envars have already been harvested +#define PRTE_JOB_OUTPUT_NOCOPY (PRTE_JOB_START_KEY + 91) // bool - do not copy output to stdout/err +#define PRTE_JOB_RANK_OUTPUT (PRTE_JOB_START_KEY + 92) // bool - tag stdout/stderr with rank +#define PRTE_SPAWN_TIMEOUT (PRTE_JOB_START_KEY + 93) // int32 - number of seconds to spawn before terminating it as timed out +#define PRTE_JOB_RAW_OUTPUT (PRTE_JOB_START_KEY + 94) // bool - do not buffer output +#define PRTE_JOB_EXEC_AGENT (PRTE_JOB_START_KEY + 95) // char* - string specifying the cmd to use when exec'ing the local proc +#define PRTE_JOB_NOAGG_HELP (PRTE_JOB_START_KEY + 96) // bool - do not aggregate show_help messages +#define PRTE_JOB_COLOCATE_PROCS (PRTE_JOB_START_KEY + 97) // pmix_data_array_t - colocate this job's procs with the given ones +#define PRTE_JOB_COLOCATE_NPERPROC (PRTE_JOB_START_KEY + 98) // uint16_t - number of procs to colocate at each proc +#define PRTE_JOB_COLOCATE_NPERNODE (PRTE_JOB_START_KEY + 99) // uint16_t - number 
of procs to colocate on node of each proc #define PRTE_JOB_MAX_KEY 300 @@ -228,9 +229,6 @@ typedef uint16_t prte_proc_flags_t; #define PRTE_PROC_START_KEY PRTE_JOB_MAX_KEY #define PRTE_PROC_NOBARRIER (PRTE_PROC_START_KEY + 1) // bool - indicates proc should not barrier in prte_init -#define PRTE_PROC_CPU_BITMAP (PRTE_PROC_START_KEY + 2) // string - string representation of cpu bindings -#define PRTE_PROC_HWLOC_LOCALE (PRTE_PROC_START_KEY + 3) // prte_ptr (hwloc_obj_t) = pointer to object where proc was mapped -#define PRTE_PROC_HWLOC_BOUND (PRTE_PROC_START_KEY + 4) // prte_ptr (hwloc_obj_t) = pointer to object where proc was bound #define PRTE_PROC_PRIOR_NODE (PRTE_PROC_START_KEY + 5) // void* - pointer to prte_node_t where this proc last executed #define PRTE_PROC_NRESTARTS (PRTE_PROC_START_KEY + 6) // int32 - number of times this process has been restarted #define PRTE_PROC_RESTART_TIME (PRTE_PROC_START_KEY + 7) // timeval - time of last restart diff --git a/src/util/nidmap.c b/src/util/nidmap.c index 6f36bccc67..1c7c37335e 100644 --- a/src/util/nidmap.c +++ b/src/util/nidmap.c @@ -431,853 +431,3 @@ int prte_util_decode_nidmap(pmix_data_buffer_t *buf) } return rc; } - -int prte_util_pass_node_info(pmix_data_buffer_t *buffer) -{ - uint16_t *slots = NULL, slot = UINT16_MAX; - uint8_t *flags = NULL, flag = UINT8_MAX; - int8_t i8; - int16_t i16; - int32_t ntopos; - int rc, m, n, nbitmap; - bool compressed, unislots = true, uniflags = true; - prte_node_t *nptr; - pmix_byte_object_t bo; - size_t sz, nslots; - pmix_data_buffer_t bucket; - prte_topology_t *t; - pmix_topology_t pt; - char **topos = NULL; - - /* make room for the number of slots on each node */ - nslots = sizeof(uint16_t) * prte_node_pool->size; - slots = (uint16_t *) malloc(nslots); - /* and for the flags for each node - only need one bit/node */ - nbitmap = (prte_node_pool->size / 8) + 1; - flags = (uint8_t *) calloc(1, nbitmap); - - /* indicate if we have hetero nodes */ - if (prte_hetero_nodes) { - i8 
= 1; - } else { - i8 = 0; - } - rc = PMIx_Data_pack(NULL, buffer, &i8, 1, PMIX_INT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - - /* we only need to send topologies if we have hetero nodes */ - if (prte_hetero_nodes) { - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - pt.source = strdup("hwloc"); - ntopos = 0; - for (n = 0; n < prte_node_topologies->size; n++) { - t = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, n); - if (NULL == t) { - continue; - } - /* pack the topology string */ - rc = PMIx_Data_pack(NULL, &bucket, &t->sig, 1, PMIX_STRING); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - free(pt.source); - goto cleanup; - } - /* track it */ - pmix_argv_append_nosize(&topos, t->sig); - /* pack the topology itself */ - pt.topology = t->topo; - rc = PMIx_Data_pack(NULL, &bucket, &pt, 1, PMIX_TOPO); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - free(pt.source); - goto cleanup; - } - ++ntopos; - } - free(pt.source); - /* pack the number of topologies */ - rc = PMIx_Data_pack(NULL, buffer, &ntopos, 1, PMIX_INT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* send them along */ - if (PMIx_Data_compress((uint8_t *) bucket.base_ptr, bucket.bytes_used, - (uint8_t **) &bo.bytes, &sz)) { - /* the data was compressed - mark that we compressed it */ - compressed = true; - rc = PMIx_Data_pack(NULL, buffer, &compressed, 1, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - bo.size = sz; - } else { - /* mark that it was not compressed */ - compressed = false; - rc = PMIx_Data_pack(NULL, buffer, &compressed, 1, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - rc = PMIx_Data_unload(&bucket, &bo); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - 
PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - } - /* pack the info */ - rc = PMIx_Data_pack(NULL, buffer, &bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - free(bo.bytes); - } - - /* construct the per-node info */ - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - for (n = 0; n < prte_node_pool->size; n++) { - if (NULL == (nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n))) { - continue; - } - /* track the topology, if required */ - if (prte_hetero_nodes && NULL != nptr->daemon) { - rc = PMIx_Data_pack(NULL, &bucket, &nptr->daemon->name.rank, 1, PMIX_PROC_RANK); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - /* find this signature in the topos */ - for (m = 0; NULL != topos && NULL != topos[m]; m++) { - if (0 == strcmp(topos[m], nptr->topology->sig)) { - rc = PMIx_Data_pack(NULL, &bucket, &m, 1, PMIX_INT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - break; - } - } - } - /* store the number of slots */ - slots[n] = nptr->slots; - if (UINT16_MAX == slot) { - slot = nptr->slots; - } else if (slot != nptr->slots) { - unislots = false; - } - /* store the flag */ - if (PRTE_FLAG_TEST(nptr, PRTE_NODE_FLAG_SLOTS_GIVEN)) { - flags[n / 8] |= (1 << (7 - (n % 8))); - if (UINT8_MAX == flag) { - flag = 1; - } else if (1 != flag) { - uniflags = false; - } - } else { - if (UINT8_MAX == flag) { - flag = 0; - } else if (0 != flag) { - uniflags = false; - } - } - } - - /* deal with the topology assignments */ - if (prte_hetero_nodes) { - if (PMIx_Data_compress((uint8_t *) bucket.base_ptr, bucket.bytes_used, - (uint8_t **) &bo.bytes, &sz)) { - /* mark that this was compressed */ - compressed = true; - bo.size = sz; - } else { - /* mark that this was not compressed */ - compressed = false; - bo.bytes = 
bucket.base_ptr; - bo.size = bucket.bytes_used; - } - /* indicate compression */ - rc = PMIx_Data_pack(NULL, buffer, &compressed, 1, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - /* add the object */ - rc = PMIx_Data_pack(NULL, buffer, &bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - } - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - - /* if we have uniform #slots, then just flag it - no - * need to pass anything */ - if (unislots) { - i16 = -1 * slot; - rc = PMIx_Data_pack(NULL, buffer, &i16, 1, PMIX_INT16); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - } else { - if (PMIx_Data_compress((uint8_t *) slots, nslots, (uint8_t **) &bo.bytes, &sz)) { - /* mark that this was compressed */ - i16 = 1; - compressed = true; - bo.size = sz; - } else { - /* mark that this was not compressed */ - i16 = 0; - compressed = false; - bo.bytes = (char *) slots; - bo.size = nslots; - } - /* indicate compression */ - rc = PMIx_Data_pack(NULL, buffer, &i16, 1, PMIX_INT16); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - /* add the object */ - rc = PMIx_Data_pack(NULL, buffer, &bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - } - - /* if we have uniform flags, then just flag it - no - * need to pass anything */ - if (uniflags) { - if (1 == flag) { - i8 = -1; - } else { - i8 = -2; - } - rc = PMIx_Data_pack(NULL, buffer, &i8, 1, PMIX_INT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - } else { - if (PMIx_Data_compress(flags, nbitmap, (uint8_t **) &bo.bytes, &sz)) { - /* mark that this was compressed */ - i8 = 2; - compressed = true; - bo.size = sz; 
- } else { - /* mark that this was not compressed */ - i8 = 3; - compressed = false; - bo.bytes = (char *) flags; - bo.size = nbitmap; - } - /* indicate compression */ - rc = PMIx_Data_pack(NULL, buffer, &i8, 1, PMIX_INT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - /* add the object */ - rc = PMIx_Data_pack(NULL, buffer, &bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - if (compressed) { - free(bo.bytes); - } - goto cleanup; - } - } - -cleanup: - if (NULL != slots) { - free(slots); - } - if (NULL != flags) { - free(flags); - } - if (NULL != topos) { - pmix_argv_free(topos); - } - return rc; -} - -int prte_util_parse_node_info(pmix_data_buffer_t *buf) -{ - int8_t i8; - int16_t i16; - int32_t ntopos; - bool compressed, found; - int rc = PRTE_SUCCESS, cnt, n, m; - prte_node_t *nptr; - size_t sz; - pmix_byte_object_t pbo; - uint16_t *slots = NULL; - uint8_t *flags = NULL; - uint8_t *bytes = NULL; - prte_topology_t *t2, *t3; - pmix_topology_t ptopo; - hwloc_topology_t topo; - char *sig; - pmix_data_buffer_t bucket; - hwloc_obj_t root; - prte_hwloc_topo_data_t *sum; - char **topos = NULL; - pmix_rank_t drk; - - /* check to see if we have uniform topologies */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &i8, &cnt, PMIX_INT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* we already defaulted to uniform topology, so only need to - * process this if it is non-uniform */ - if (0 != i8) { - prte_hetero_nodes = true; - /* get the number of topologies */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &ntopos, &cnt, PMIX_INT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* unpack the compression flag */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &compressed, &cnt, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* unpack the topology object */ - cnt = 1; - rc = 
PMIx_Data_unpack(NULL, buf, &pbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - - /* if compressed, decompress */ - if (compressed) { - if (!PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, (uint8_t **) &bytes, &sz)) { - PRTE_ERROR_LOG(PRTE_ERROR); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - rc = PRTE_ERROR; - goto cleanup; - } - } else { - bytes = (uint8_t *) pbo.bytes; - sz = pbo.size; - pbo.bytes = NULL; - pbo.size = 0; - } - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); // release pre-existing data - PMIX_BYTE_OBJECT_LOAD(&pbo, bytes, sz); - - /* setup to unpack */ - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - rc = PMIx_Data_load(&bucket, &pbo); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - - for (n = 0; n < ntopos; n++) { - /* unpack the signature */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &sig, &cnt, PMIX_STRING); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* cache it */ - pmix_argv_append_nosize(&topos, sig); - /* unpack the topology */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &ptopo, &cnt, PMIX_TOPO); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - topo = ptopo.topology; - ptopo.topology = NULL; - PMIX_TOPOLOGY_DESTRUCT(&ptopo); - /* see if we already have it - there aren't many topologies - * in a cluster, so this won't take long */ - found = false; - for (m = 0; m < prte_node_topologies->size; m++) { - t3 = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, m); - if (NULL == t3) { - continue; - } - if (0 == strcmp(sig, t3->sig)) { - found = true; - break; - } - } - if (found) { - hwloc_topology_destroy(topo); - free(sig); - } else { - /* record it */ - t2 = PMIX_NEW(prte_topology_t); - t2->sig = sig; - t2->topo = topo; - /* need to ensure the summary is setup */ - root = hwloc_get_root_obj(topo); - root->userdata = (void *) PMIX_NEW(prte_hwloc_topo_data_t); - sum = (prte_hwloc_topo_data_t *) root->userdata; - sum->available = 
prte_hwloc_base_setup_summary(topo); - t2->index = pmix_pointer_array_add(prte_node_topologies, t2); - } - } - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - - /* now get the array of assigned topologies */ - /* unpack the compression flag */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &compressed, &cnt, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* unpack the topologies object */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &pbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* if compressed, decompress */ - if (compressed) { - if (!PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, (uint8_t **) &bytes, &sz)) { - PRTE_ERROR_LOG(PRTE_ERROR); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - rc = PRTE_ERROR; - goto cleanup; - } - } else { - bytes = (uint8_t *) pbo.bytes; - sz = pbo.size; - pbo.bytes = NULL; - pbo.size = 0; - } - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); // release pre-existing data - PMIX_BYTE_OBJECT_LOAD(&pbo, bytes, sz); - - /* setup to unpack */ - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - rc = PMIx_Data_load(&bucket, &pbo); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); // release pre-existing data - - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &drk, &cnt, PMIX_PROC_RANK); - while (PMIX_SUCCESS == rc) { - nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, drk); - if (NULL == nptr) { - PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &m, &cnt, PMIX_INT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - /* the topology signature we want is in that location in - * the topos argv array */ - sig = topos[m]; - /* find that signature in our topologies - might be at a - * different location */ - for (m = 0; m < prte_node_topologies->size; m++) { - t3 = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, m); - if 
(NULL == t3) { - continue; - } - if (0 == strcmp(sig, t3->sig)) { - nptr->topology = t3; - break; - } - } - /* unpack the next daemon rank */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &drk, &cnt, PMIX_PROC_RANK); - } - if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - } - - /* check to see if we have uniform slot assignments */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &i16, &cnt, PMIX_INT16); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - - /* if so, then make every node the same */ - if (0 > i16) { - i16 = -1 * i16; - for (n = 0; n < prte_node_pool->size; n++) { - if (NULL != (nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n))) { - nptr->slots = i16; - } - } - } else { - /* unpack the slots object */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &pbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* if compressed, decompress */ - if (1 == i16) { - if (!PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, (uint8_t **) &slots, &sz)) { - PRTE_ERROR_LOG(PRTE_ERROR); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - rc = PRTE_ERROR; - goto cleanup; - } - } else { - slots = (uint16_t *) pbo.bytes; - pbo.bytes = NULL; - pbo.size = 0; - } - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - - /* cycle across the node pool and assign the values */ - for (n = 0, m = 0; n < prte_node_pool->size; n++) { - if (NULL != (nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n))) { - nptr->slots = slots[m]; - ++m; - } - } - } - - /* check to see if we have uniform flag assignments */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &i8, &cnt, PMIX_INT8); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - - /* if so, then make every node the same */ - if (0 > i8) { - i8 += 2; - for (n = 0; n < prte_node_pool->size; n++) { - if (NULL != (nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n))) { - if (i8) { - 
PRTE_FLAG_SET(nptr, PRTE_NODE_FLAG_SLOTS_GIVEN); - } else { - PRTE_FLAG_UNSET(nptr, PRTE_NODE_FLAG_SLOTS_GIVEN); - } - } - } - } else { - /* unpack the slots object */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &pbo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto cleanup; - } - /* if compressed, decompress */ - if (2 == i8) { - if (!PMIx_Data_decompress((uint8_t *) pbo.bytes, pbo.size, (uint8_t **) &flags, &sz)) { - PRTE_ERROR_LOG(PRTE_ERROR); - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - rc = PRTE_ERROR; - goto cleanup; - } - } else { - flags = (uint8_t *) pbo.bytes; - pbo.bytes = NULL; - pbo.size = 0; - } - PMIX_BYTE_OBJECT_DESTRUCT(&pbo); - - /* cycle across the node pool and assign the values */ - for (n = 0, m = 0; n < prte_node_pool->size; n++) { - if (NULL != (nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n))) { - if (flags[m]) { - PRTE_FLAG_SET(nptr, PRTE_NODE_FLAG_SLOTS_GIVEN); - } else { - PRTE_FLAG_UNSET(nptr, PRTE_NODE_FLAG_SLOTS_GIVEN); - } - ++m; - } - } - } - -cleanup: - if (NULL != slots) { - free(slots); - } - if (NULL != flags) { - free(flags); - } - if (NULL != topos) { - pmix_argv_free(topos); - } - return rc; -} - -int prte_util_generate_ppn(prte_job_t *jdata, pmix_data_buffer_t *buf) -{ - uint16_t ppn; - int rc = PRTE_SUCCESS; - prte_app_idx_t i; - int j, k; - pmix_byte_object_t bo; - bool compressed; - prte_node_t *nptr; - prte_proc_t *proc; - size_t sz; - pmix_data_buffer_t bucket; - prte_app_context_t *app; - - for (i = 0; i < jdata->num_apps; i++) { - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - /* for each app_context */ - if (NULL != (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) { - for (j = 0; j < jdata->map->num_nodes; j++) { - if (NULL - == (nptr = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, j))) { - continue; - } - if (NULL == nptr->daemon) { - continue; - } - ppn = 0; - for (k = 0; k < nptr->procs->size; k++) { - if (NULL - != (proc = 
(prte_proc_t *) pmix_pointer_array_get_item(nptr->procs, k))) { - if (PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace) - && proc->app_idx == app->idx) { - ++ppn; - } - } - } - if (0 < ppn) { - rc = PMIx_Data_pack(NULL, &bucket, &nptr->index, 1, PMIX_INT32); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - rc = PMIx_Data_pack(NULL, &bucket, &ppn, 1, PMIX_UINT16); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - } - } - } - - if (PMIx_Data_compress((uint8_t *) bucket.base_ptr, bucket.bytes_used, - (uint8_t **) &bo.bytes, &sz)) { - /* mark that this was compressed */ - compressed = true; - bo.size = sz; - } else { - /* mark that this was not compressed */ - compressed = false; - rc = PMIx_Data_unload(&bucket, &bo); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - goto cleanup; - } - } - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - /* indicate compression */ - rc = PMIx_Data_pack(NULL, buf, &compressed, 1, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - if (compressed) { - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - } - goto cleanup; - } - /* add the object */ - rc = PMIx_Data_pack(NULL, buf, &bo, 1, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - break; - } - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - } - -cleanup: - return rc; -} - -int prte_util_decode_ppn(prte_job_t *jdata, pmix_data_buffer_t *buf) -{ - int32_t index; - prte_app_idx_t n; - int cnt, rc = PRTE_SUCCESS, m; - pmix_byte_object_t bo; - bool compressed; - uint8_t *bytes; - size_t sz; - uint16_t ppn, k; - prte_node_t *node; - prte_proc_t *proc; - pmix_data_buffer_t bucket; - - /* reset any flags */ - for (m = 0; m < jdata->map->nodes->size; m++) { - if (NULL != (node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m))) { - PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); - } - } - - 
for (n = 0; n < jdata->num_apps; n++) { - /* unpack the compression flag */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &compressed, &cnt, PMIX_BOOL); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - return rc; - } - /* unpack the byte object describing this app */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, buf, &bo, &cnt, PMIX_BYTE_OBJECT); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - return rc; - } - - if (PRTE_PROC_IS_MASTER) { - /* just discard it */ - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - continue; - } - - /* decompress if required */ - if (compressed) { - if (!PMIx_Data_decompress((uint8_t *) bo.bytes, bo.size, &bytes, &sz)) { - PRTE_ERROR_LOG(PRTE_ERROR); - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - return PRTE_ERROR; - } - } else { - bytes = (uint8_t *) bo.bytes; - sz = bo.size; - bo.bytes = NULL; - bo.size = 0; - } - PMIX_BYTE_OBJECT_DESTRUCT(&bo); // release pre-existing data - PMIX_BYTE_OBJECT_LOAD(&bo, bytes, sz); - - /* setup to unpack */ - PMIX_DATA_BUFFER_CONSTRUCT(&bucket); - rc = PMIx_Data_load(&bucket, &bo); - PMIX_BYTE_OBJECT_DESTRUCT(&bo); - - /* unpack each node and its ppn */ - cnt = 1; - while (PMIX_SUCCESS == (rc = PMIx_Data_unpack(NULL, &bucket, &index, &cnt, PMIX_INT32))) { - /* get the corresponding node object */ - if (NULL - == (node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, index))) { - rc = PRTE_ERR_NOT_FOUND; - PRTE_ERROR_LOG(rc); - goto error; - } - /* add the node to the job map if not already assigned */ - if (!PRTE_FLAG_TEST(node, PRTE_NODE_FLAG_MAPPED)) { - PMIX_RETAIN(node); - pmix_pointer_array_add(jdata->map->nodes, node); - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_MAPPED); - } - /* get the ppn */ - cnt = 1; - rc = PMIx_Data_unpack(NULL, &bucket, &ppn, &cnt, PMIX_UINT16); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto error; - } - /* create a proc object for each one */ - for (k = 0; k < ppn; k++) { - proc = PMIX_NEW(prte_proc_t); - PMIX_LOAD_NSPACE(proc->name.nspace, jdata->nspace); - /* leave the vpid 
undefined as this will be determined - * later when we do the overall ranking */ - proc->app_idx = n; - proc->parent = node->daemon->name.rank; - PMIX_RETAIN(node); - proc->node = node; - /* flag the proc as ready for launch */ - proc->state = PRTE_PROC_STATE_INIT; - pmix_pointer_array_add(node->procs, proc); - node->num_procs++; - /* we will add the proc to the jdata array when we - * compute its rank */ - } - cnt = 1; - } - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - } - if (PMIX_SUCCESS != rc && PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { - PMIX_ERROR_LOG(rc); - } - - /* reset any flags */ - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL != node) { - PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); - } - } - return PRTE_SUCCESS; - -error: - PMIX_DATA_BUFFER_DESTRUCT(&bucket); - /* reset any flags */ - for (m = 0; m < jdata->map->nodes->size; m++) { - node = (prte_node_t *) pmix_pointer_array_get_item(jdata->map->nodes, m); - if (NULL != node) { - PRTE_FLAG_UNSET(node, PRTE_NODE_FLAG_MAPPED); - } - } - return rc; -} diff --git a/src/util/nidmap.h b/src/util/nidmap.h index f652b2507f..9473538398 100644 --- a/src/util/nidmap.h +++ b/src/util/nidmap.h @@ -35,14 +35,4 @@ PRTE_EXPORT int prte_util_nidmap_create(pmix_pointer_array_t *pool, pmix_data_bu PRTE_EXPORT int prte_util_decode_nidmap(pmix_data_buffer_t *buf); -/* pass topology and #slots info */ -PRTE_EXPORT int prte_util_pass_node_info(pmix_data_buffer_t *buf); - -PRTE_EXPORT int prte_util_parse_node_info(pmix_data_buffer_t *buf); - -/* pass info about node assignments for a specific job */ -PRTE_EXPORT int prte_util_generate_ppn(prte_job_t *jdata, pmix_data_buffer_t *buf); - -PRTE_EXPORT int prte_util_decode_ppn(prte_job_t *jdata, pmix_data_buffer_t *buf); - #endif /* PRTE_NIDMAP_H */ diff --git a/src/util/prte_cmd_line.h b/src/util/prte_cmd_line.h index a236ff3d8d..637e05abe3 100644 --- a/src/util/prte_cmd_line.h +++ 
b/src/util/prte_cmd_line.h @@ -171,6 +171,13 @@ BEGIN_C_DECLS #define PRTE_CLI_RANKFILE "rankfile" #define PRTE_CLI_NONE "none" #define PRTE_CLI_HWTCPUS "hwtcpus" +#define PRTE_CLI_PELIST "pe-list=" + +// Ranking directives +// PRTE_CLI_SLOT, PRTE_CLI_NODE, PRTE_CLI_SPAN reused here +#define PRTE_CLI_FILL "fill" +#define PRTE_CLI_OBJ "object" + // Output directives #define PRTE_CLI_TAG "tag" @@ -201,11 +208,9 @@ BEGIN_C_DECLS #define PRTE_CLI_DEVICE "device=" #define PRTE_CLI_INHERIT "inherit" #define PRTE_CLI_NOINHERIT "noinherit" -#define PRTE_CLI_PELIST "pe-list=" #define PRTE_CLI_QDIR "dir=" #define PRTE_CLI_QFILE "file=" #define PRTE_CLI_NOLAUNCH "donotlaunch" -#define PRTE_CLI_FILL "fill" #define PRTE_CLI_OVERLOAD "overload-allowed" #define PRTE_CLI_NOOVERLOAD "no-overload" #define PRTE_CLI_IF_SUPP "if-supported"