Skip to content

Commit

Permalink
Fix add-hostfile and add-host operations
Browse files Browse the repository at this point in the history
When in a managed allocation, allow addition of nodes
provided that either (a) all the nodes in the managed
portion of the allocation have the same #slots in them
so we can infer how many are in the new nodes, or (b)
the user specifies the #slots for the new nodes.

Check node names as well as aliases for match.

Add a new ras component to simulate a managed allocation.

Signed-off-by: Ralph Castain <rhc@pmix.org>
(cherry picked from commit bffbc40)
  • Loading branch information
rhc54 committed Nov 19, 2023
1 parent d683772 commit 47a66cf
Show file tree
Hide file tree
Showing 7 changed files with 318 additions and 20 deletions.
15 changes: 15 additions & 0 deletions src/mca/ras/base/help-ras-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,18 @@ file could not be opened for reading:
File: %s

Please check the filename and try again.
#
[ras-base:nonuniform-slots]
A request was made to add hosts from a hostfile while operating
in a managed allocation. In this case, either the slots must be
specified in the given hostfile, or the number of slots assigned
by the resource manager on the existing nodes must be uniform.

The current allocation does not conform to that requirement:

Base number of slots: %d
Node: %s
Number of slots: %d

Please assign a number of slots for each node to be added to the
allocation.
107 changes: 88 additions & 19 deletions src/mca/ras/base/ras_base_allocate.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,13 +753,14 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
{
int rc;
pmix_list_t nodes;
int i, k, n, slots;
int i, k, m, n, slots;
prte_app_context_t *app;
prte_node_t *node, *next, *nptr;
char *hosts, *line, *cptr, *ptr, **hostfiles;
char *hosts, *line, *cptr, *ptr, **hostfiles, *nm;
FILE *fp;
bool addslots, found;
bool extend = false;
int default_slots = -1;

PMIX_CONSTRUCT(&nodes, pmix_list_t);

Expand All @@ -776,6 +777,32 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
* can be present
*/

/* if we are in a managed allocation, the best we can do for nodes
* that do not include a specific slot assignment is to (a) check
* to see if there is a uniform assignment on existing nodes and
* use that, or (b) generate an error as we cannot know what the
* host environment might have set
*/
if (prte_managed_allocation) {
for (n = 0; n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
}
if (-1 == default_slots) {
default_slots = nptr->slots;
continue;
}
if (default_slots != nptr->slots) {
// generate an error message
pmix_show_help("help-ras-base.txt", "ras-base:nonuniform-slots", true,
default_slots, nptr->name, nptr->slots);
PMIX_LIST_DESTRUCT(&nodes);
return PRTE_ERR_SILENT;
}
}
}

for (i = 0; i < jdata->apps->size; i++) {
if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(jdata->apps, i))) {
continue;
Expand Down Expand Up @@ -819,28 +846,35 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
free(line);
continue;
}

addslots = false;
// because there can be arbitrary whitespace around keywords,
// we manually parse the line to get the directives
ptr = cptr;
while (NULL != ptr && !isspace(*ptr)) {
while ('\0' != *ptr && !isspace(*ptr)) {
++ptr;
}
*ptr = '\0';
if ('\0' == *ptr) {
// end of the line - just the node name was given
slots = default_slots;
goto process;
}
*ptr = '\0'; // terminate the name
// find the '=' sign
++ptr;
ptr = strchr(ptr, '=');
if (NULL == ptr) {
// didn't specify slots - autodetect them
slots = -1;
while ('\0' != *ptr && ('=' != *ptr || isspace(*ptr))) {
++ptr;
}
if ('\0' == *ptr) {
// didn't specify slots - use the default value
slots = default_slots;
goto process;
}
// find the value
++ptr;
while (NULL != ptr && '\0' != *ptr && isspace(*ptr)) {
while ('\0' != *ptr && isspace(*ptr)) {
++ptr;
}
if (NULL == ptr || '\0' == *ptr) {
if ('\0' == *ptr) {
// bad syntax
PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM);
fclose(fp);
Expand All @@ -851,7 +885,6 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}
// if it is a '+' or '-', then we are adjusting
// the #slots
addslots = false;
if ('+' == *ptr || '-' == *ptr) {
addslots = true;
}
Expand All @@ -860,23 +893,42 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
process:
// see if we have this node
found = false;
for (n = 0; n < prte_node_pool->size; n++) {
// does the name refer to me?
if (prte_check_host_is_local(cptr)) {
nm = prte_process_info.nodename;
} else {
nm = cptr;
}

for (n = 0; !found && n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
}
if (0 == strcmp(cptr, nptr->name)) {
if (0 == strcmp(nm, nptr->name)) {
// we have the node
if (addslots) {
nptr->slots += slots;
if (0 > nptr->slots) {
nptr->slots = 0;
}
} else {
nptr->slots = slots;
}
found = true;
break;
} else if (NULL != nptr->aliases) {
/* no choice but an exhaustive search - fortunately, these lists are short! */
for (m = 0; NULL != nptr->aliases[m]; m++) {
if (0 == strcmp(cptr, nptr->aliases[m])) {
if (addslots) {
nptr->slots += slots;
if (0 > nptr->slots) {
nptr->slots = 0;
}
}
found = true;
break;
}
}
}
}
if (!found) {
Expand Down Expand Up @@ -942,7 +994,8 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
PMIX_LIST_FOREACH_SAFE(node, next, &nodes, prte_node_t)
{
node->state = PRTE_NODE_STATE_ADDED;
for (n = 0; n < prte_node_pool->size; n++) {
found = false;
for (n = 0; !found && n < prte_node_pool->size; n++) {
nptr = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n);
if (NULL == nptr) {
continue;
Expand All @@ -956,7 +1009,22 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}
pmix_list_remove_item(&nodes, &node->super);
PMIX_RELEASE(node);
break;
found = true;
} else if (NULL != nptr->aliases) {
/* no choice but an exhaustive search - fortunately, these lists are short! */
for (m = 0; !found && NULL != nptr->aliases[m]; m++) {
if (0 == strcmp(node->name, nptr->aliases[m])) {
if (prte_get_attribute(&node->attributes, PRTE_NODE_ADD_SLOTS, NULL, PMIX_BOOL)) {
nptr->slots += node->slots;
prte_remove_attribute(&node->attributes, PRTE_NODE_ADD_SLOTS);
} else {
nptr->slots = node->slots;
}
pmix_list_remove_item(&nodes, &node->super);
PMIX_RELEASE(node);
found = true;
}
}
}
}
}
Expand All @@ -981,7 +1049,8 @@ int prte_ras_base_add_hosts(prte_job_t *jdata)
}

/* shall we display the results? */
if (0 < pmix_output_get_verbosity(prte_ras_base_framework.framework_output)) {
if (0 < pmix_output_get_verbosity(prte_ras_base_framework.framework_output) ||
prte_get_attribute(&jdata->attributes, PRTE_JOB_DISPLAY_ALLOC, NULL, PMIX_BOOL)) {
prte_ras_base_display_alloc(jdata);
}

Expand Down
42 changes: 42 additions & 0 deletions src/mca/ras/testrm/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
# Copyright (c) 2022-2023 Nanook Consulting. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

sources = \
ras_testrm.h \
ras_testrm_component.c \
ras_testrm.c

# Make the output library in this directory, and name it either
# prte_mca_<type>_<name>.la (for DSO builds) or libprtemca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_prte_ras_testrm_DSO
lib =
lib_sources =
component = prte_mca_ras_testrm.la
component_sources = $(sources)
else
lib = libprtemca_ras_testrm.la
lib_sources = $(sources)
component =
component_sources =
endif

mcacomponentdir = $(prtelibdir)
mcacomponent_LTLIBRARIES = $(component)
prte_mca_ras_testrm_la_SOURCES = $(component_sources)
prte_mca_ras_testrm_la_LDFLAGS = -module -avoid-version
prte_mca_ras_testrm_la_LIBADD = $(top_builddir)/src/libprrte.la

noinst_LTLIBRARIES = $(lib)
libprtemca_ras_testrm_la_SOURCES = $(lib_sources)
libprtemca_ras_testrm_la_LDFLAGS = -module -avoid-version
54 changes: 54 additions & 0 deletions src/mca/ras/testrm/ras_testrm.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
* Copyright (c) 2015-2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
*
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "prte_config.h"
#include "constants.h"
#include "types.h"

#include "src/class/pmix_list.h"
#include "src/runtime/prte_globals.h"
#include "src/util/hostfile/hostfile.h"
#include "ras_testrm.h"

/*
* Local functions
*/
static int allocate(prte_job_t *jdata, pmix_list_t *nodes);
static int finalize(void);

/*
 * Module table exported by the testrm RAS component. Only the
 * allocate and finalize entry points are supplied; init and
 * deallocate are explicitly left NULL.
 */
prte_ras_base_module_t prte_ras_testrm_module = {
    /* entry points implemented in this file */
    .allocate = allocate,
    .finalize = finalize,
    /* entry points this module does not provide */
    .init = NULL,
    .deallocate = NULL
};

/*
 * Module allocate entry point.
 *
 * jdata - job for which the allocation is requested; not referenced here
 * nodes - list passed through to prte_util_add_hostfile_nodes()
 *         (presumably filled with the nodes found in the hostfile -
 *         confirm against that function's contract)
 *
 * Returns the status code reported by prte_util_add_hostfile_nodes()
 * for the hostfile stored in the testrm component.
 */
static int allocate(prte_job_t *jdata, pmix_list_t *nodes)
{
    (void) jdata; /* the job object is not needed to read the hostfile */

    return prte_util_add_hostfile_nodes(nodes, prte_mca_ras_testrm_component.hostfile);
}

/*
 * Module finalize entry point. Nothing needs to be done here,
 * so it unconditionally reports success.
 */
static int finalize(void)
{
    const int status = PRTE_SUCCESS;

    return status;
}
34 changes: 34 additions & 0 deletions src/mca/ras/testrm/ras_testrm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#ifndef PRTE_RAS_TESTRM_H
#define PRTE_RAS_TESTRM_H

#include "prte_config.h"
#include "src/mca/ras/base/base.h"
#include "src/mca/ras/ras.h"

BEGIN_C_DECLS

/*
 * The testrm component: the base RAS component extended with the
 * hostfile used by the module.
 */
typedef struct prte_ras_testrm_component_t {
    /* base RAS component; kept as the first member */
    prte_ras_base_component_t super;
    /* hostfile value the module's allocate function hands to
     * prte_util_add_hostfile_nodes() */
    char *hostfile;
} prte_ras_testrm_component_t;

PRTE_EXPORT extern prte_ras_testrm_component_t prte_mca_ras_testrm_component;
PRTE_EXPORT extern prte_ras_base_module_t prte_ras_testrm_module;

END_C_DECLS

#endif
Loading

0 comments on commit 47a66cf

Please sign in to comment.