From d5f6e860568bcad0d50da78ae50e781cbae41e26 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 25 Jun 2017 07:29:32 -0700 Subject: [PATCH 1/2] Update to track PMIx v2.0.1 Signed-off-by: Ralph Castain (cherry picked from commit ed85512a7c3440b1a3efad15406e7774f932a4ab) --- opal/mca/pmix/pmix2x/pmix/AUTHORS | 15 +-- opal/mca/pmix/pmix2x/pmix/INSTALL | 4 +- opal/mca/pmix/pmix2x/pmix/NEWS | 59 --------- opal/mca/pmix/pmix2x/pmix/VERSION | 10 +- .../pmix/pmix2x/pmix/include/pmix_common.h | 2 + .../pmix/pmix2x/pmix/src/buffer_ops/copy.c | 2 +- .../pmix/pmix2x/pmix/src/buffer_ops/unpack.c | 1 + .../pmix2x/pmix/src/client/pmix_client_get.c | 66 ++++++---- .../pmix/pmix2x/pmix/src/server/pmix_server.c | 113 +++++++++++------- opal/mca/pmix/pmix2x/pmix/src/util/hash.c | 34 +++++- .../pmix/pmix2x/pmix/test/simple/simpclient.c | 50 ++++++-- opal/mca/pmix/pmix2x/pmix/test/test_common.c | 6 +- 12 files changed, 206 insertions(+), 156 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix/AUTHORS b/opal/mca/pmix/pmix2x/pmix/AUTHORS index 581a22ec73a..c429d324c00 100644 --- a/opal/mca/pmix/pmix2x/pmix/AUTHORS +++ b/opal/mca/pmix/pmix2x/pmix/AUTHORS @@ -9,31 +9,22 @@ Email Name Affiliation(s) alinask Elena Shipunova Mellanox annu13 Annapurna Dasari Intel artpol84 Artem Polyakov Mellanox -ashleypittman Ashley Pittman Intel dsolt Dave Solt IBM -garlick Jim Garlick LLNL ggouaillardet Gilles Gouaillardet RIST hjelmn Nathan Hjelm LANL igor-ivanov Igor Ivanov Mellanox jladd-mlnx Joshua Ladd Mellanox -jjhursey Joshua Hursey IBM -jsquyres Jeff Squyres Cisco -karasevb Boris Karasev Mellanox -kawashima-fj Takahiro Kawashima Fujitsu +jsquyres Jeff Squyres Cisco, IU nkogteva Nadezhda Kogteva Mellanox -nysal Nysal Jan KA IBM -PHHargrove Paul Hargrove LBNL -rhc54 Ralph Castain Intel +rhc54 Ralph Castain LANL, Cisco, Intel ------------------------------- --------------------------- ------------------- Affiliation abbreviations: -------------------------- Cisco = Cisco Systems, Inc. -Fujitsu = Fujitsu IBM = International Business Machines, Inc. Intel = Intel, Inc. +IU = Indiana University LANL = Los Alamos National Laboratory -LBNL = Lawrence Berkeley National Laboratory -LLNL = Lawrence Livermore National Laboratory Mellanox = Mellanox RIST = Research Organization for Information Science and Technology diff --git a/opal/mca/pmix/pmix2x/pmix/INSTALL b/opal/mca/pmix/pmix2x/pmix/INSTALL index e1fc5e3f6db..6bdd1c1c502 100644 --- a/opal/mca/pmix/pmix2x/pmix/INSTALL +++ b/opal/mca/pmix/pmix2x/pmix/INSTALL @@ -24,7 +24,7 @@ This file is a *very* short overview of building and installing the PMIx library. Much more information is available on the PMIx web site (e.g., see the FAQ section): - http://pmix.github.io/pmix/pmix + http://pmix.github.io/pmix/master Developer Builds @@ -34,7 +34,7 @@ If you have checked out a DEVELOPER'S COPY of PMIx (i.e., you checked out from Git), you should read the HACKING file before attempting to build PMIx. You must then run: -shell$ ./autogen.pl +shell$ ./autogen.sh You will need very recent versions of GNU Autoconf, Automake, and Libtool. If autogen.sh fails, read the HACKING file. If anything diff --git a/opal/mca/pmix/pmix2x/pmix/NEWS b/opal/mca/pmix/pmix2x/pmix/NEWS index 4df8ad3aae6..86f4438f1bb 100644 --- a/opal/mca/pmix/pmix2x/pmix/NEWS +++ b/opal/mca/pmix/pmix2x/pmix/NEWS @@ -24,65 +24,6 @@ current release as well as the "stable" bug fix release branch. Master (not on release branches yet) ------------------------------------ - -2.0.0 ------- -**** NOTE: This release implements the complete PMIX v2.0 Standard -**** and therefore includes a number of new APIs and features. These -**** can be tracked by their RFC's in the RFC repository at: -**** https://github.com/pmix/RFCs. A formal standards document will -**** be included in a later v2.x release. Some of the changes are -**** identified below. -- Added the Modular Component Architecture (MCA) plugin manager and - converted a number of operations to plugins, thereby allowing easy - customization and extension (including proprietary offerings) -- Added support for TCP sockets instead of Unix domain sockets for - client-server communications -- Added support for on-the-fly Allocation requests, including requests - for additional resources, extension of time for currently allocated - resources, and return of identified allocated resources to the scheduler - (RFC 0005 - https://github.com/pmix/RFCs/blob/master/RFC0005.md) -- Tightened rules on the processing of PMIx_Get requests, including - reservation of the "pmix" prefix for attribute keys and specifying - behaviors associated with the PMIX_RANK_WILDCARD value - (RFC 0009 - https://github.com/pmix/RFCs/blob/master/RFC0009.md) -- Extended support for tool interactions with a PMIx server aimed at - meeting the needs of debuggers and other tools. Includes support - for rendezvousing with a system-level PMIx server for interacting - with the system management stack (SMS) outside of an allocated - session, and adds two new APIs: - - PMIx_Query: request general information such as the process - table for a specified job, and available SMS capabilities - - PMIx_Log: log messages (e.g., application progress) to a - system-hosted persistent store - (RFC 0010 - https://github.com/pmix/RFCs/blob/master/RFC0010.md) -- Added support for fabric/network interactions associated with - "instant on" application startup - (RFC 0012 - https://github.com/pmix/RFCs/blob/master/RFC0012.md) -- Added an attribute to support getting the time remaining in an - allocation via the PMIx_Query interface - (RFC 0013 - https://github.com/pmix/RFCs/blob/master/RFC0013.md) -- Added interfaces to support job control and monitoring requests, - including heartbeat and file monitors to detect stalled applications. - Job control interface supports standard signal-related operations - (pause, kill, resume, etc.) as well as checkpoint/restart requests. - The interface can also be used by an application to indicate it is - willing to be pre-empted, with the host RM providing an event - notification when the preemption is desired. - (RFC 0015 - https://github.com/pmix/RFCs/blob/master/RFC0015.md) -- Extended the event notification system to support notifications - across threads in the same process, and the ability to direct - ordering of notifications when registering event handlers. - (RFC 0018 - https://github.com/pmix/RFCs/blob/master/RFC0018.md) -- Expose the buffer manipulation functions via a new set of APIs - to support heterogeneous data transfers within the host RM - environment - (RFC 0020 - https://github.com/pmix/RFCs/blob/master/RFC0020.md) -- Fix a number of race condition issues that arose at scale -- Enable PMIx servers to generate notifications to the host RM - and to themselves - - 1.2.2 -- 21 March 2017 ---------------------- - Compiler fix for Sun/Oracle CC (PR #322) diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index c3dd7d08258..f597e9f5e3b 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -13,7 +13,7 @@ # major, minor, and release are generally combined in the form # ... -major=2 +major=3 minor=0 release=0 @@ -23,14 +23,14 @@ release=0 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek= +greek=a1 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git6fb501d +repo_rev=git4c2c8d0 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Jun 19, 2017" +date="Jun 25, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library @@ -75,4 +75,4 @@ date="Jun 19, 2017" # Version numbers are described in the Libtool current:revision:age # format. -libpmix_so_version=3:0:1 +libpmix_so_version=0:0:0 diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index e2cc36d8a3f..00ef3e79620 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -124,6 +124,8 @@ typedef uint32_t pmix_rank_t; #define PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first #define PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data #define PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server +#define PMIX_SERVER_NSPACE "pmix.srv.nspace" // (char*) Name of the nspace to use for this server +#define PMIX_SERVER_RANK "pmix.srv.rank" // (pmix_rank_t) Rank of this server /* identification attributes */ diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/copy.c b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/copy.c index 756d3c92818..b65d6944b41 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/copy.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/copy.c @@ -425,7 +425,7 @@ PMIX_EXPORT pmix_status_t pmix_value_xfer(pmix_value_t *p, pmix_value_t *src) break; } /* allocate space and do the copy */ - switch (src->type) { + switch (src->data.darray->type) { case PMIX_UINT8: case PMIX_INT8: case PMIX_BYTE: diff --git a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c index 53e73ac1c9b..8e488457aa2 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c +++ b/opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c @@ -768,6 +768,7 @@ pmix_status_t pmix_bfrop_unpack_info(pmix_buffer_t *buffer, void *dest, return ret; } if (NULL == tmp) { + PMIX_ERROR_LOG(PMIX_ERROR); return PMIX_ERROR; } (void)strncpy(ptr[i].key, tmp, PMIX_MAX_KEYLEN); diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c index e0932889707..928eb721f51 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c @@ -111,7 +111,7 @@ PMIX_EXPORT pmix_status_t PMIx_Get(const pmix_proc_t *proc, const char key[], PMIX_RELEASE(cb); pmix_output_verbose(2, pmix_globals.debug_output, - "pmix:client get completed"); + "pmix:client get completed %d", rc); return rc; } @@ -464,7 +464,7 @@ static pmix_status_t process_val(pmix_value_t *val, } nvals = 0; for (n=0; n < nsize; n++) { - if (PMIX_SUCCESS != (rc = pmix_pointer_array_add(results, &info[n]))) { + if (0 > (rc = pmix_pointer_array_add(results, &info[n]))) { return rc; } ++nvals; @@ -536,25 +536,45 @@ static void _getnbfn(int fd, short flags, void *cbdata) /* if the rank is WILDCARD, then they want all the job-level info, * so no need to check the modex */ if (PMIX_RANK_WILDCARD != cb->rank) { + rc = PMIX_ERR_NOT_FOUND; #if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) - if (PMIX_SUCCESS == (rc = pmix_dstore_fetch(nptr->nspace, cb->rank, NULL, &val))) { -#else - if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->modex, cb->rank, NULL, &val))) { + /* my own data is in the hash table, so don't bother looking + * in the dstore if that is what they want */ + if (pmix_globals.myid.rank != cb->rank) { + if (PMIX_SUCCESS == (rc = pmix_dstore_fetch(nptr->nspace, cb->rank, NULL, &val))) { + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix_get[%d]: value retrieved from dstore", __LINE__); + if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) { + cb->value_cbfunc(rc, NULL, cb->cbdata); + /* cleanup */ + if (NULL != val) { + PMIX_VALUE_RELEASE(val); + } + PMIX_RELEASE(cb); + return; + } + } + } #endif /* PMIX_ENABLE_DSTORE */ - pmix_output_verbose(2, pmix_globals.debug_output, - "pmix_get[%d]: value retrieved from dstore", __LINE__); - if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) { - cb->value_cbfunc(rc, NULL, cb->cbdata); - /* cleanup */ - if (NULL != val) { - PMIX_VALUE_RELEASE(val); + if (PMIX_SUCCESS != rc) { + /* if the user was asking about themselves, or we aren't using the dstore, + * then we need to check the hash table */ + if (PMIX_SUCCESS == (rc = pmix_hash_fetch(&nptr->modex, cb->rank, NULL, &val))) { + pmix_output_verbose(2, pmix_globals.debug_output, + "pmix_get[%d]: value retrieved from hash", __LINE__); + if (PMIX_SUCCESS != (rc = process_val(val, &nvals, &results))) { + cb->value_cbfunc(rc, NULL, cb->cbdata); + /* cleanup */ + if (NULL != val) { + PMIX_VALUE_RELEASE(val); + } + PMIX_RELEASE(cb); + return; } - PMIX_RELEASE(cb); - return; + PMIX_VALUE_RELEASE(val); } - /* cleanup */ - PMIX_VALUE_RELEASE(val); - } else { + } + if (PMIX_SUCCESS != rc) { /* if we didn't find a modex for this rank, then we need * to go get it. Thus, the caller wants -all- information for * the specified rank, not just the job-level info. */ @@ -572,12 +592,17 @@ static void _getnbfn(int fd, short flags, void *cbdata) PMIX_RELEASE(cb); return; } - /* cleanup */ PMIX_VALUE_RELEASE(val); } /* now let's package up the results */ PMIX_VALUE_CREATE(val, 1); val->type = PMIX_DATA_ARRAY; + val->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); + if (NULL == val->data.darray) { + PMIX_VALUE_RELEASE(val); + cb->value_cbfunc(PMIX_ERR_NOMEM, NULL, cb->cbdata); + return; + } val->data.darray->type = PMIX_INFO; val->data.darray->size = nvals; PMIX_INFO_CREATE(iptr, nvals); @@ -597,14 +622,13 @@ static void _getnbfn(int fd, short flags, void *cbdata) } else { pmix_value_xfer(&iptr[n].value, &info->value); } - PMIX_INFO_FREE(info, 1); + PMIX_INFO_DESTRUCT(info); } } /* done with results array */ PMIX_DESTRUCT(&results); - /* return the result to the caller */ + /* return the result to the caller - they are responsible for releasing it */ cb->value_cbfunc(PMIX_SUCCESS, val, cb->cbdata); - PMIX_VALUE_FREE(val, 1); PMIX_RELEASE(cb); return; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c index 94bc36c4fe1..ca22d7c708d 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c @@ -87,24 +87,6 @@ static inline int _my_client(const char *nspace, pmix_rank_t rank); static pmix_status_t initialize_server_base(pmix_server_module_t *module) { - char *evar; - - /* look for our namespace, if one was given */ - if (NULL == (evar = getenv("PMIX_SERVER_NAMESPACE"))) { - /* use a fake namespace */ - (void)strncpy(pmix_globals.myid.nspace, "pmix-server", PMIX_MAX_NSLEN); - } else { - (void)strncpy(pmix_globals.myid.nspace, evar, PMIX_MAX_NSLEN); - } - /* look for our rank, if one was given */ - mypid = getpid(); - if (NULL == (evar = getenv("PMIX_SERVER_RANK"))) { - /* use our pid */ - pmix_globals.myid.rank = mypid; - } else { - pmix_globals.myid.rank = strtol(evar, NULL, 10); - } - /* setup the server-specific globals */ PMIX_CONSTRUCT(&pmix_server_globals.clients, pmix_pointer_array_t); pmix_pointer_array_init(&pmix_server_globals.clients, 1, INT_MAX, 1); @@ -131,7 +113,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, pmix_status_t rc; size_t n, m; pmix_kval_t kv; - bool protect; + bool protect, nspace_given = false, rank_given = false; char *protected[] = { PMIX_USERID, PMIX_GRPID, @@ -140,6 +122,8 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, PMIX_SERVER_SYSTEM_SUPPORT, NULL }; + char *evar; + pmix_rank_info_t *rinfo; PMIX_ACQUIRE_THREAD(&pmix_global_lock); @@ -159,31 +143,22 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, return rc; } -#if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) - if (PMIX_SUCCESS != (rc = pmix_dstore_init(info, ninfo))) { - PMIX_RELEASE_THREAD(&pmix_global_lock); - return rc; - } -#endif /* PMIX_ENABLE_DSTORE */ - - /* setup the wildcard recv for inbound messages from clients */ - req = PMIX_NEW(pmix_ptl_posted_recv_t); - req->tag = UINT32_MAX; - req->cbfunc = server_message_handler; - /* add it to the end of the list of recvs */ - pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super); - - if (PMIX_SUCCESS != pmix_ptl_base_start_listening(info, ninfo)) { - pmix_show_help("help-pmix-server.txt", "listener-thread-start", true); - PMIX_RELEASE_THREAD(&pmix_global_lock); - return PMIX_ERR_INIT; - } - /* check the info keys for info we - * need to provide to every client */ + * need to provide to every client and + * directives aimed at us */ if (NULL != info) { PMIX_CONSTRUCT(&kv, pmix_kval_t); for (n=0; n < ninfo; n++) { + if (0 == strncmp(info[n].key, PMIX_SERVER_NSPACE, PMIX_MAX_KEYLEN)) { + (void)strncpy(pmix_globals.myid.nspace, info[n].value.data.string, PMIX_MAX_NSLEN); + nspace_given = true; + continue; + } + if (0 == strncmp(info[n].key, PMIX_SERVER_RANK, PMIX_MAX_KEYLEN)) { + pmix_globals.myid.rank = info[n].value.data.rank; + rank_given = true; + continue; + } /* check the list of protected keys */ protect = false; for (m=0; NULL != protected[m]; m++) { @@ -215,6 +190,64 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module, PMIX_DESTRUCT(&kv); } + if (!nspace_given) { + /* look for our namespace, if one was given */ + if (NULL == (evar = getenv("PMIX_SERVER_NAMESPACE"))) { + /* use a fake namespace */ + (void)strncpy(pmix_globals.myid.nspace, "pmix-server", PMIX_MAX_NSLEN); + } else { + (void)strncpy(pmix_globals.myid.nspace, evar, PMIX_MAX_NSLEN); + } + } + if (!rank_given) { + /* look for our rank, if one was given */ + mypid = getpid(); + if (NULL == (evar = getenv("PMIX_SERVER_RANK"))) { + /* use our pid */ + pmix_globals.myid.rank = mypid; + } else { + pmix_globals.myid.rank = strtol(evar, NULL, 10); + } + } + + /* copy it into mypeer entries */ + if (NULL == pmix_globals.mypeer->info) { + rinfo = PMIX_NEW(pmix_rank_info_t); + pmix_globals.mypeer->info = rinfo; + } else { + rinfo = pmix_globals.mypeer->info; + } + if (NULL == rinfo->nptr) { + rinfo->nptr = PMIX_NEW(pmix_nspace_t); + /* ensure our own nspace is first on the list */ + PMIX_RETAIN(rinfo->nptr); + rinfo->nptr->server = PMIX_NEW(pmix_server_nspace_t); + pmix_list_prepend(&pmix_globals.nspaces, &rinfo->nptr->super); + } + (void)strncpy(rinfo->nptr->nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN); + rinfo->rank = pmix_globals.myid.rank; + + +#if defined(PMIX_ENABLE_DSTORE) && (PMIX_ENABLE_DSTORE == 1) + if (PMIX_SUCCESS != (rc = pmix_dstore_init(info, ninfo))) { + PMIX_RELEASE_THREAD(&pmix_global_lock); + return rc; + } +#endif /* PMIX_ENABLE_DSTORE */ + + /* setup the wildcard recv for inbound messages from clients */ + req = PMIX_NEW(pmix_ptl_posted_recv_t); + req->tag = UINT32_MAX; + req->cbfunc = server_message_handler; + /* add it to the end of the list of recvs */ + pmix_list_append(&pmix_ptl_globals.posted_recvs, &req->super); + + if (PMIX_SUCCESS != pmix_ptl_base_start_listening(info, ninfo)) { + pmix_show_help("help-pmix-server.txt", "listener-thread-start", true); + PMIX_RELEASE_THREAD(&pmix_global_lock); + return PMIX_ERR_INIT; + } + /* get our available security modules */ security_mode = pmix_psec.get_available_modules(); diff --git a/opal/mca/pmix/pmix2x/pmix/src/util/hash.c b/opal/mca/pmix/pmix2x/pmix/src/util/hash.c index d76a45ac4a3..fe31dd28ab6 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/util/hash.c +++ b/opal/mca/pmix/pmix2x/pmix/src/util/hash.c @@ -106,6 +106,9 @@ pmix_status_t pmix_hash_fetch(pmix_hash_table_t *table, pmix_rank_t rank, pmix_kval_t *hv; uint64_t id; char *node; + pmix_info_t *info; + size_t ninfo, n; + pmix_value_t *val; pmix_output_verbose(10, pmix_globals.debug_output, "HASH:FETCH rank %d key %s", @@ -143,7 +146,36 @@ pmix_status_t pmix_hash_fetch(pmix_hash_table_t *table, pmix_rank_t rank, if (NULL == key) { /* we will return the data as an array of pmix_info_t * in the kvs pmix_value_t */ - + val = (pmix_value_t*)malloc(sizeof(pmix_value_t)); + if (NULL == val) { + return PMIX_ERR_NOMEM; + } + val->type = PMIX_DATA_ARRAY; + val->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); + if (NULL == val->data.darray) { + PMIX_VALUE_RELEASE(val); + return PMIX_ERR_NOMEM; + } + val->data.darray->type = PMIX_INFO; + val->data.darray->size = 0; + val->data.darray->array = NULL; + ninfo = pmix_list_get_size(&proc_data->data); + PMIX_INFO_CREATE(info, ninfo); + if (NULL == info) { + PMIX_VALUE_RELEASE(val); + return PMIX_ERR_NOMEM; + } + /* copy the list elements */ + n=0; + PMIX_LIST_FOREACH(hv, &proc_data->data, pmix_kval_t) { + (void)strncpy(info[n].key, hv->key, PMIX_MAX_KEYLEN); + pmix_value_xfer(&info[n].value, hv->value); + ++n; + } + val->data.darray->size = ninfo; + val->data.darray->array = info; + *kvs = val; + return PMIX_SUCCESS; } else { /* find the value from within this proc_data object */ hv = lookup_keyval(&proc_data->data, key); diff --git a/opal/mca/pmix/pmix2x/pmix/test/simple/simpclient.c b/opal/mca/pmix/pmix2x/pmix/test/simple/simpclient.c index df50881b5c9..cd58ee5ff43 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/simple/simpclient.c +++ b/opal/mca/pmix/pmix2x/pmix/test/simple/simpclient.c @@ -269,21 +269,51 @@ int main(int argc, char **argv) PMIX_VALUE_RELEASE(val); free(tmp); - (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j); - if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { - /* this data should _not_ be found as we are on the same node - * and the data was "put" with a PMIX_REMOTE scope */ - pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); - continue; + if (n != myproc.rank) { + (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j); + if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { + /* this data should _not_ be found as we are on the same node + * and the data was "put" with a PMIX_REMOTE scope */ + pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); + continue; + } + pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc", + myproc.nspace, myproc.rank, j, tmp); + PMIX_VALUE_RELEASE(val); + free(tmp); } - pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc", - myproc.nspace, myproc.rank, j, tmp); - PMIX_VALUE_RELEASE(val); - free(tmp); } } } + /* now get the data blob for myself */ + pmix_output(0, "Client ns %s rank %d testing internal modex blob", + myproc.nspace, myproc.rank); + if (PMIX_SUCCESS == (rc = PMIx_Get(&myproc, NULL, NULL, 0, &val))) { + if (PMIX_DATA_ARRAY != val->type) { + pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob", + myproc.nspace, myproc.rank); + PMIX_VALUE_RELEASE(val); + } else if (PMIX_INFO != val->data.darray->type) { + pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO", + myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type)); + PMIX_VALUE_RELEASE(val); + } else if (0 == val->data.darray->size) { + pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length", + myproc.nspace, myproc.rank); + PMIX_VALUE_RELEASE(val); + } else { + pmix_info_t *iptr = (pmix_info_t*)val->data.darray->array; + for (n=0; n < val->data.darray->size; n++) { + pmix_output(0, "\tKey: %s", iptr[n].key); + } + PMIX_VALUE_RELEASE(val); + } + } else { + pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)", + myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc); + } + /* log something */ PMIX_INFO_CONSTRUCT(&info); (void)strncpy(info.key, "foobar", PMIX_MAX_KEYLEN); diff --git a/opal/mca/pmix/pmix2x/pmix/test/test_common.c b/opal/mca/pmix/pmix2x/pmix/test/test_common.c index 8692a1be176..5d9ba374416 100644 --- a/opal/mca/pmix/pmix2x/pmix/test/test_common.c +++ b/opal/mca/pmix/pmix2x/pmix/test/test_common.c @@ -226,10 +226,7 @@ void parse_cmd(int argc, char **argv, test_params *params) } // Fix rank if running under SLURM -#if 0 - /* the following "if" statement can never be true as rank is - * an unsigned 32-bit int */ - if( 0 > params->rank ){ + if( PMIX_RANK_UNDEF == params->rank ){ char *ranklist = getenv("SLURM_GTIDS"); char *rankno = getenv("SLURM_LOCALID"); if( NULL != ranklist && NULL != rankno ){ @@ -246,7 +243,6 @@ void parse_cmd(int argc, char **argv, test_params *params) pmix_argv_free(argv); } } -#endif // Fix namespace if running under SLURM if( NULL == params->nspace ){ From 5f5a2d7308f2748d67ac71d1b0075762f031502e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 26 Jun 2017 09:34:57 -0700 Subject: [PATCH 2/2] Track PMIx v2.0.1 Signed-off-by: Ralph Castain (cherry picked from commit e6c2a8d34679dec4ca2d80af7e701cbade4a4d37) --- opal/mca/pmix/pmix2x/pmix/AUTHORS | 15 ++++- opal/mca/pmix/pmix2x/pmix/INSTALL | 4 +- opal/mca/pmix/pmix2x/pmix/NEWS | 59 +++++++++++++++++++ opal/mca/pmix/pmix2x/pmix/VERSION | 6 +- opal/mca/pmix/pmix2x/pmix/autogen.pl | 2 +- .../pmix/pmix2x/pmix/include/pmix_common.h | 20 +++---- 6 files changed, 87 insertions(+), 19 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix/AUTHORS b/opal/mca/pmix/pmix2x/pmix/AUTHORS index c429d324c00..581a22ec73a 100644 --- a/opal/mca/pmix/pmix2x/pmix/AUTHORS +++ b/opal/mca/pmix/pmix2x/pmix/AUTHORS @@ -9,22 +9,31 @@ Email Name Affiliation(s) alinask Elena Shipunova Mellanox annu13 Annapurna Dasari Intel artpol84 Artem Polyakov Mellanox +ashleypittman Ashley Pittman Intel dsolt Dave Solt IBM +garlick Jim Garlick LLNL ggouaillardet Gilles Gouaillardet RIST hjelmn Nathan Hjelm LANL igor-ivanov Igor Ivanov Mellanox jladd-mlnx Joshua Ladd Mellanox -jsquyres Jeff Squyres Cisco, IU +jjhursey Joshua Hursey IBM +jsquyres Jeff Squyres Cisco +karasevb Boris Karasev Mellanox +kawashima-fj Takahiro Kawashima Fujitsu nkogteva Nadezhda Kogteva Mellanox -rhc54 Ralph Castain LANL, Cisco, Intel +nysal Nysal Jan KA IBM +PHHargrove Paul Hargrove LBNL +rhc54 Ralph Castain Intel ------------------------------- --------------------------- ------------------- Affiliation abbreviations: -------------------------- Cisco = Cisco Systems, Inc. +Fujitsu = Fujitsu IBM = International Business Machines, Inc. Intel = Intel, Inc. -IU = Indiana University LANL = Los Alamos National Laboratory +LBNL = Lawrence Berkeley National Laboratory +LLNL = Lawrence Livermore National Laboratory Mellanox = Mellanox RIST = Research Organization for Information Science and Technology diff --git a/opal/mca/pmix/pmix2x/pmix/INSTALL b/opal/mca/pmix/pmix2x/pmix/INSTALL index 6bdd1c1c502..e1fc5e3f6db 100644 --- a/opal/mca/pmix/pmix2x/pmix/INSTALL +++ b/opal/mca/pmix/pmix2x/pmix/INSTALL @@ -24,7 +24,7 @@ This file is a *very* short overview of building and installing the PMIx library. Much more information is available on the PMIx web site (e.g., see the FAQ section): - http://pmix.github.io/pmix/master + http://pmix.github.io/pmix/pmix Developer Builds @@ -34,7 +34,7 @@ If you have checked out a DEVELOPER'S COPY of PMIx (i.e., you checked out from Git), you should read the HACKING file before attempting to build PMIx. You must then run: -shell$ ./autogen.sh +shell$ ./autogen.pl You will need very recent versions of GNU Autoconf, Automake, and Libtool. If autogen.sh fails, read the HACKING file. If anything diff --git a/opal/mca/pmix/pmix2x/pmix/NEWS b/opal/mca/pmix/pmix2x/pmix/NEWS index 86f4438f1bb..4df8ad3aae6 100644 --- a/opal/mca/pmix/pmix2x/pmix/NEWS +++ b/opal/mca/pmix/pmix2x/pmix/NEWS @@ -24,6 +24,65 @@ current release as well as the "stable" bug fix release branch. Master (not on release branches yet) ------------------------------------ + +2.0.0 +------ +**** NOTE: This release implements the complete PMIX v2.0 Standard +**** and therefore includes a number of new APIs and features. These +**** can be tracked by their RFC's in the RFC repository at: +**** https://github.com/pmix/RFCs. A formal standards document will +**** be included in a later v2.x release. Some of the changes are +**** identified below. +- Added the Modular Component Architecture (MCA) plugin manager and + converted a number of operations to plugins, thereby allowing easy + customization and extension (including proprietary offerings) +- Added support for TCP sockets instead of Unix domain sockets for + client-server communications +- Added support for on-the-fly Allocation requests, including requests + for additional resources, extension of time for currently allocated + resources, and return of identified allocated resources to the scheduler + (RFC 0005 - https://github.com/pmix/RFCs/blob/master/RFC0005.md) +- Tightened rules on the processing of PMIx_Get requests, including + reservation of the "pmix" prefix for attribute keys and specifying + behaviors associated with the PMIX_RANK_WILDCARD value + (RFC 0009 - https://github.com/pmix/RFCs/blob/master/RFC0009.md) +- Extended support for tool interactions with a PMIx server aimed at + meeting the needs of debuggers and other tools. Includes support + for rendezvousing with a system-level PMIx server for interacting + with the system management stack (SMS) outside of an allocated + session, and adds two new APIs: + - PMIx_Query: request general information such as the process + table for a specified job, and available SMS capabilities + - PMIx_Log: log messages (e.g., application progress) to a + system-hosted persistent store + (RFC 0010 - https://github.com/pmix/RFCs/blob/master/RFC0010.md) +- Added support for fabric/network interactions associated with + "instant on" application startup + (RFC 0012 - https://github.com/pmix/RFCs/blob/master/RFC0012.md) +- Added an attribute to support getting the time remaining in an + allocation via the PMIx_Query interface + (RFC 0013 - https://github.com/pmix/RFCs/blob/master/RFC0013.md) +- Added interfaces to support job control and monitoring requests, + including heartbeat and file monitors to detect stalled applications. + Job control interface supports standard signal-related operations + (pause, kill, resume, etc.) as well as checkpoint/restart requests. + The interface can also be used by an application to indicate it is + willing to be pre-empted, with the host RM providing an event + notification when the preemption is desired. + (RFC 0015 - https://github.com/pmix/RFCs/blob/master/RFC0015.md) +- Extended the event notification system to support notifications + across threads in the same process, and the ability to direct + ordering of notifications when registering event handlers. + (RFC 0018 - https://github.com/pmix/RFCs/blob/master/RFC0018.md) +- Expose the buffer manipulation functions via a new set of APIs + to support heterogeneous data transfers within the host RM + environment + (RFC 0020 - https://github.com/pmix/RFCs/blob/master/RFC0020.md) +- Fix a number of race condition issues that arose at scale +- Enable PMIx servers to generate notifications to the host RM + and to themselves + + 1.2.2 -- 21 March 2017 ---------------------- - Compiler fix for Sun/Oracle CC (PR #322) diff --git a/opal/mca/pmix/pmix2x/pmix/VERSION b/opal/mca/pmix/pmix2x/pmix/VERSION index f597e9f5e3b..15236cc64b2 100644 --- a/opal/mca/pmix/pmix2x/pmix/VERSION +++ b/opal/mca/pmix/pmix2x/pmix/VERSION @@ -23,14 +23,14 @@ release=0 # The only requirement is that it must be entirely printable ASCII # characters and have no white space. -greek=a1 +greek= # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=git4c2c8d0 +repo_rev=gitaa26b56 # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Jun 25, 2017" +date="Jun 26, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix2x/pmix/autogen.pl b/opal/mca/pmix/pmix2x/pmix/autogen.pl index 2f86eaf9613..be66633f6cc 100755 --- a/opal/mca/pmix/pmix2x/pmix/autogen.pl +++ b/opal/mca/pmix/pmix2x/pmix/autogen.pl @@ -191,7 +191,7 @@ sub mca_process_framework { $mca_found->{$framework}->{found} = 1; opendir(DIR, $dir) || my_die "Can't open $dir directory"; - foreach my $d (readdir(DIR)) { + foreach my $d (sort(readdir(DIR))) { # Skip any non-directory, "base", or any dir that # begins with "." next diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index 00ef3e79620..f74862c995b 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -938,20 +938,20 @@ typedef struct pmix_value { PMIX_PROC_INFO_DESTRUCT(_info[_n].value.data.pinfo); \ } \ } \ - } \ - } else if (PMIX_BYTE_OBJECT == (m)->data.darray->type) { \ - pmix_byte_object_t *_obj = \ - (pmix_byte_object_t*)(m)->data.darray->array; \ - for (_n=0; _n < (m)->data.darray->size; _n++) { \ - if (NULL != _obj[_n].bytes) { \ - free(_obj[_n].bytes); \ + } else if (PMIX_BYTE_OBJECT == (m)->data.darray->type) { \ + pmix_byte_object_t *_obj = \ + (pmix_byte_object_t*)(m)->data.darray->array; \ + for (_n=0; _n < (m)->data.darray->size; _n++) { \ + if (NULL != _obj[_n].bytes) { \ + free(_obj[_n].bytes); \ + } \ } \ } \ - } \ - if (NULL != (m)->data.darray->array) { \ free((m)->data.darray->array); \ } \ - free((m)->data.darray); \ + if (NULL != (m)->data.darray) { \ + free((m)->data.darray); \ + } \ /**** DEPRECATED ****/ \ } else if (PMIX_INFO_ARRAY == (m)->type) { \ pmix_info_t *_p = (pmix_info_t*)((m)->data.array->array); \