Fix resource usage tracking for map/bind operations
Tracking solely at the slot level doesn't adequately protect against
overlapping CPU assignments and other more complex mapping requests.
Modify the resource usage tracking to operate at the CPU level and
combine the mapping/binding operation into a single pass.

Ranking must still be done as a second pass, but restrict the options
to simplify implementation and avoid confusion. Update the help output
to reflect the changes.

Allow the DVM to also support "do-not-launch" directives for testing
purposes, and to accept simulated node/topologies.

Fix a few other minor problems along the way.

Signed-off-by: Ralph Castain <rhc@pmix.org>
rhc54 committed Jul 15, 2022
1 parent 06254c3 commit 0acc016
Showing 63 changed files with 3,099 additions and 6,719 deletions.
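The commit message above describes moving resource-usage bookkeeping from slots down to individual CPUs. The following stand-alone sketch is not code from this commit; it only illustrates the general idea using the public hwloc bitmap API: a single "used" bitmap records every CPU already handed out, so any request whose CPUs overlap that set can be rejected (or allowed only under an overload policy). The requested core indices in `want` and the file name demo.c are invented for the illustration; build with something like cc demo.c -lhwloc.

    #include <stdio.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topo;
        hwloc_bitmap_t used;           /* CPUs already assigned */
        unsigned want[] = {0, 1, 0};   /* demo requests; the repeat of core 0
                                          deliberately triggers the overlap check */
        size_t i;

        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        used = hwloc_bitmap_alloc();
        hwloc_bitmap_zero(used);

        for (i = 0; i < sizeof(want) / sizeof(want[0]); i++) {
            hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, want[i]);
            if (NULL == core) {
                continue;   /* machine has fewer cores than the request assumes */
            }
            /* CPU-level check: refuse an assignment whose CPUs overlap
             * ones already in use */
            if (hwloc_bitmap_intersects(core->cpuset, used)) {
                printf("request %zu: core %u overlaps CPUs already in use\n",
                       i, want[i]);
                continue;
            }
            hwloc_bitmap_or(used, used, core->cpuset);
            printf("request %zu: assigned core %u (%d CPUs now in use)\n",
                   i, want[i], hwloc_bitmap_weight(used));
        }

        hwloc_bitmap_free(used);
        hwloc_topology_destroy(topo);
        return 0;
    }

The commit applies this idea inside the PRRTE mapper/binder itself; the point of the sketch is simply that per-CPU bitmaps catch overlapping assignments that per-slot counting cannot.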
84 changes: 84 additions & 0 deletions src/hwloc/help-prte-hwloc-base.txt
@@ -2,6 +2,7 @@
#
# Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
# Copyright (c) 2022 Nanook Consulting. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -85,3 +86,86 @@ The binding request contains an unrecognized modifier:
Request: %s

Please check your request and try again.
#
[bind-to-option]
By default, processes are bound to individual CPUs (either COREs
or HWTHREADs, as defined by default or by user specification for
the job). On nodes that are OVERSUBSCRIBEd (i.e., where the number
of procs exceeds the number of assigned slots), the default is to
not bind the processes.

NOTE: processes from prior jobs that are already executing on a node
are not "unbound" when a new job mapping results in the node
becoming oversubscribed.

Binding is performed to the first available specified object type
within the object where the process was mapped. In other words,
binding can only be done to the mapped object or to a resource
located beneath that object.

An object is considered completely consumed when the number of
processes bound to it equals the number of CPUs within it. Unbound
processes are not considered in this computation. Additional
processes cannot be mapped to consumed objects unless the
OVERLOAD qualifier is provided via the "--bind-to" command
line option.

Note that directives and qualifiers are case-insensitive
and can be shortened to the minimum number of characters
to uniquely identify them. Thus, "L1CACHE" can be given
as "l1cache" or simply as "L1".

Supported binding directives include:

- NONE does not bind the processes

- HWTHREAD binds each process to a single hardware
thread. This requires that hwthreads be treated
as independent CPUs (i.e., that either the HWTCPUS
qualifier be provided to the "map-by" option or
that hwthreads be designated as CPUs by default).

- CORE binds each process to a single core. This
can be done whether hwthreads or cores are being
treated as independent CPUs provided that mapping
is performed at the core or higher level.

- L1CACHE binds each process to all the CPUs in
an L1 cache.

- L2CACHE binds each process to all the CPUs in
an L2 cache.

- L3CACHE binds each process to all the CPUs in
an L3 cache.

- NUMA binds each process to all the CPUs in a NUMA
region.

- PACKAGE binds each process to all the CPUs in a PACKAGE.

Any directive can include qualifiers by adding a colon (:) and any
combination of one or more of the following to the --bind-to option:

- OVERLOAD indicates that objects can have more
processes bound to them than CPUs within them.

- IF-SUPPORTED indicates that the job should continue to
be launched and executed even if binding cannot be
performed as requested.

- REPORT outputs a report on the bindings for the processes
to stderr.
#
[bind-upwards]
Binding is performed to the first available specified object type
within the object where the process was mapped. In other words,
binding can only be done to the mapped object or to a resource
located beneath that object.

The specified binding lies above the mapping object type:

Mapping level: %s
Binding level: %s

Please correct the map/bind directives and try again.
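As an illustration of the error above (again with a placeholder application name), a request that maps at the core level but binds at the package level places the binding above the mapping object and would be rejected:

    prterun --map-by core --bind-to package ./my_app

Binding at the mapped level or below (for example, --bind-to core or --bind-to hwthread) satisfies the rule stated in this message.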
50 changes: 1 addition & 49 deletions src/hwloc/hwloc-internal.h
@@ -113,35 +113,6 @@ typedef struct {
size_t mbs_len;
} prte_hwloc_base_memory_segment_t;

/* structs for storing info on objects */
typedef struct {
pmix_object_t super;
hwloc_cpuset_t available;
bool npus_calculated;
unsigned int npus;
unsigned int idx;
unsigned int num_bound;
} prte_hwloc_obj_data_t;
PMIX_CLASS_DECLARATION(prte_hwloc_obj_data_t);

typedef struct {
pmix_list_item_t super;
hwloc_obj_type_t type;
unsigned cache_level;
unsigned int num_objs;
pmix_list_t sorted_by_dist_list;
} prte_hwloc_summary_t;
PMIX_CLASS_DECLARATION(prte_hwloc_summary_t);

typedef struct {
pmix_object_t super;
hwloc_cpuset_t available;
pmix_list_t summaries;
hwloc_obj_t* numas;
unsigned num_numas;
} prte_hwloc_topo_data_t;
PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_hwloc_topo_data_t);

/* define binding policies */
typedef uint16_t prte_binding_policy_t;
#define PRTE_BINDING_POLICY PRTE_UINT16
@@ -150,9 +121,6 @@ typedef uint16_t prte_binding_policy_t;
#define PRTE_BIND_IF_SUPPORTED 0x1000
#define PRTE_BIND_ALLOW_OVERLOAD 0x2000
#define PRTE_BIND_GIVEN 0x4000
/* bind each rank to the cpu in the given
* cpu list based on its node-local-rank */
#define PRTE_BIND_ORDERED 0x8000
// overload policy was given
#define PRTE_BIND_OVERLOAD_GIVEN 0x0100

@@ -184,12 +152,10 @@ typedef uint16_t prte_binding_policy_t;
/* macro to detect if binding is forced */
#define PRTE_BIND_OVERLOAD_ALLOWED(n) (PRTE_BIND_ALLOW_OVERLOAD & (n))
#define PRTE_BIND_OVERLOAD_SET(n) (PRTE_BIND_OVERLOAD_GIVEN & (n))
#define PRTE_BIND_ORDERED_REQUESTED(n) (PRTE_BIND_ORDERED & (n))

/* some global values */
PRTE_EXPORT extern hwloc_topology_t prte_hwloc_topology;
PRTE_EXPORT extern prte_binding_policy_t prte_hwloc_default_binding_policy;
PRTE_EXPORT extern hwloc_cpuset_t prte_hwloc_my_cpuset;
PRTE_EXPORT extern hwloc_obj_type_t prte_hwloc_levels[];
PRTE_EXPORT extern char *prte_hwloc_default_cpu_list;
PRTE_EXPORT extern bool prte_hwloc_default_use_hwthread_cpus;
@@ -264,15 +230,6 @@ PRTE_EXPORT int prte_hwloc_base_set_default_binding(void *jdata,
void *options);
PRTE_EXPORT int prte_hwloc_base_set_binding_policy(void *jdata, char *spec);

/**
* Loads prte_hwloc_my_cpuset (global variable in
* src/hwloc/hwloc-internal.h) for this process. prte_hwloc_my_cpuset
* will be loaded with this process' binding, or, if the process is
* not bound, use the hwloc root object's (available and online)
* cpuset.
*/
PRTE_EXPORT void prte_hwloc_base_get_local_cpuset(void);

struct prte_rmaps_numa_node_t {
pmix_list_item_t super;
int index;
@@ -322,26 +279,21 @@ PRTE_EXPORT int prte_hwloc_base_set_topology(char *topofile);
PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_generate_cpuset(hwloc_topology_t topo,
bool use_hwthread_cpus, char *cpulist);

PRTE_EXPORT int prte_hwloc_base_filter_cpus(hwloc_topology_t topo);
PRTE_EXPORT hwloc_cpuset_t prte_hwloc_base_filter_cpus(hwloc_topology_t topo);

/**
* Free the hwloc topology.
*/
PRTE_EXPORT void prte_hwloc_base_free_topology(hwloc_topology_t topo);
PRTE_EXPORT unsigned int prte_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level);
PRTE_EXPORT void prte_hwloc_base_clear_usage(hwloc_topology_t topo);

PRTE_EXPORT hwloc_obj_t prte_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
hwloc_obj_type_t target,
unsigned cache_level,
unsigned int instance);
PRTE_EXPORT unsigned int prte_hwloc_base_get_obj_idx(hwloc_topology_t topo, hwloc_obj_t obj);

PRTE_EXPORT int prte_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char *device_name,
pmix_list_t *sorted_list);

/**
* Get the number of pu's under a given hwloc object.
*/
126 changes: 15 additions & 111 deletions src/hwloc/hwloc.c
@@ -282,19 +282,13 @@ void prte_hwloc_base_close(void)
return;
}

/* free memory */
if (NULL != prte_hwloc_my_cpuset) {
hwloc_bitmap_free(prte_hwloc_my_cpuset);
prte_hwloc_my_cpuset = NULL;
}

if (NULL != prte_hwloc_default_cpu_list) {
free(prte_hwloc_default_cpu_list);
}

/* destroy the topology */
if (NULL != prte_hwloc_topology) {
prte_hwloc_base_free_topology(prte_hwloc_topology);
hwloc_topology_destroy(prte_hwloc_topology);
prte_hwloc_topology = NULL;
}

@@ -305,7 +299,7 @@ int prte_hwloc_base_set_default_binding(void *jd, void *opt)
int prte_hwloc_base_set_default_binding(void *jd, void *opt)
{
prte_job_t *jdata = (prte_job_t*)jd;
prte_schizo_options_t *options = (prte_schizo_options_t*)opt;
prte_rmaps_options_t *options = (prte_rmaps_options_t*)opt;
prte_mapping_policy_t mpol;

if (prte_get_attribute(&jdata->attributes, PRTE_JOB_PES_PER_PROC, NULL, PMIX_UINT16)) {
@@ -358,73 +352,34 @@ int prte_hwloc_base_set_default_binding(void *jd, void *opt)
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE);
} else {
/* we are mapping by node or some other non-object method */
if (options->nprocs <= 2) {
if (options->use_hwthreads) {
/* if we are using hwthread cpus, then bind to those */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using byhwthread", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding,
PRTE_BIND_TO_HWTHREAD);
} else {
/* for performance, bind to core */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bycore", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding,
PRTE_BIND_TO_CORE);
}
if (options->use_hwthreads) {
/* if we are using hwthread cpus, then bind to those */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using byhwthread", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding,
PRTE_BIND_TO_HWTHREAD);
} else {
/* bind to numa (if present), or by package (if numa isn't present and package is) */
if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) {
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bynuma", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NUMA);
} else if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) {
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bypackage", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE);
} else {
/* if we have neither, then just don't bind */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given and no NUMA "
"or packages - not binding",
__LINE__);
PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NONE);
}
/* otherwise bind to core */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bycore", __LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding,
PRTE_BIND_TO_CORE);
}
}
} else if (options->nprocs <= 2) {
} else {
if (options->use_hwthreads) {
/* if we are using hwthread cpus, then bind to those */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using byhwthread",
__LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_HWTHREAD);
} else {
/* for performance, bind to core */
/* otherwise bind to core */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bycore",
__LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_CORE);
}
} else {
/* for performance, bind to numa, if available, else try package */
if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_NUMANODE, 0)) {
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bynuma",
__LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NUMA);
} else if (NULL != hwloc_get_obj_by_type(prte_hwloc_topology, HWLOC_OBJ_PACKAGE, 0)) {
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given - using bypackage",
__LINE__);
PRTE_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_PACKAGE);
} else {
/* just don't bind */
prte_output_verbose(options->verbosity, options->stream,
"setdefaultbinding[%d] binding not given and no packages - not binding",
__LINE__);
PRTE_SET_BINDING_POLICY(jdata->map->binding, PRTE_BIND_TO_NONE);
}
}
}
/* they might have set the overload-allowed flag while wanting PRRTE
@@ -566,55 +521,6 @@ char *prte_hwloc_base_print_locality(prte_hwloc_locality_t locality)
return ptr->buffers[ptr->cntr];
}

static void obj_data_const(prte_hwloc_obj_data_t *ptr)
{
ptr->npus_calculated = false;
ptr->npus = 0;
ptr->idx = UINT_MAX;
ptr->num_bound = 0;
}
PMIX_CLASS_INSTANCE(prte_hwloc_obj_data_t, pmix_object_t, obj_data_const, NULL);

static void sum_const(prte_hwloc_summary_t *ptr)
{
ptr->num_objs = 0;
PMIX_CONSTRUCT(&ptr->sorted_by_dist_list, pmix_list_t);
}
static void sum_dest(prte_hwloc_summary_t *ptr)
{
pmix_list_item_t *item;
while (NULL != (item = pmix_list_remove_first(&ptr->sorted_by_dist_list))) {
PMIX_RELEASE(item);
}
PMIX_DESTRUCT(&ptr->sorted_by_dist_list);
}
PMIX_CLASS_INSTANCE(prte_hwloc_summary_t, pmix_list_item_t, sum_const, sum_dest);
static void topo_data_const(prte_hwloc_topo_data_t *ptr)
{
ptr->available = NULL;
PMIX_CONSTRUCT(&ptr->summaries, pmix_list_t);
ptr->numas = NULL;
ptr->num_numas = 0;
}
static void topo_data_dest(prte_hwloc_topo_data_t *ptr)
{
pmix_list_item_t *item;

if (NULL != ptr->available) {
hwloc_bitmap_free(ptr->available);
}
while (NULL != (item = pmix_list_remove_first(&ptr->summaries))) {
PMIX_RELEASE(item);
}
PMIX_DESTRUCT(&ptr->summaries);
if (NULL != ptr->numas) {
free(ptr->numas);
}
}
PMIX_CLASS_INSTANCE(prte_hwloc_topo_data_t, pmix_object_t, topo_data_const, topo_data_dest);

PMIX_CLASS_INSTANCE(prte_rmaps_numa_node_t, pmix_list_item_t, NULL, NULL);

int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
{
int i;
@@ -647,8 +553,6 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
} else if (0 == strcasecmp(quals[i], "no-overload")) {
tmp = (tmp & ~PRTE_BIND_ALLOW_OVERLOAD);
tmp |= PRTE_BIND_OVERLOAD_GIVEN;
} else if (0 == strcasecmp(quals[i], "ordered")) {
tmp |= PRTE_BIND_ORDERED;
} else if (0 == strcasecmp(quals[i], "REPORT")) {
if (NULL == jdata) {
pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true,