Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple commits #1556

Merged
merged 10 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ examples/debugger/stdincheck
examples/debugger/mpihello
examples/legacy
examples/colocate
examples/pset

src/sys/powerpc/atomic-32.s
src/sys/powerpc/atomic-64.s
Expand Down
3 changes: 2 additions & 1 deletion examples/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ EXAMPLES = \
launcher \
showkeys \
legacy \
colocate
colocate \
pset

all: $(EXAMPLES)

Expand Down
3 changes: 2 additions & 1 deletion examples/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,5 @@ EXTRA_DIST += \
examples/showkeys.c \
examples/target.c \
examples/tool.c \
examples/colocate.c
examples/colocate.c \
examples/pset.c
11 changes: 4 additions & 7 deletions examples/debugger/direct-multi.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2021 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -305,9 +305,8 @@ int parse_command_line(int argc, char **argv)
printf("$ prte --daemonize\n");
printf("$ %s [OPTIONS]\n", argv[0]);
printf("\n");
printf(" --stop-in-init Stop application in init (Default)\n");
printf(" --stop-in-init Stop application in PMIx_Init (Default)\n");
printf(" --stop-on-exec Stop application on exec\n");
printf(" --stop-in-init Stop application in init (Default)\n");
printf(" --app-npernode Number of processes per node (Default: 2)\n");
printf(" --app-np Number of total processes. Must be multiple of "
"--app-npernode (Default: 2)\n");
Expand Down Expand Up @@ -559,11 +558,9 @@ static pmix_status_t spawn_app(void)
/* Provide job-level directives so the apps do what the user requested */
PMIX_INFO_LIST_START(tinfo);
if (stop_on_exec) {
PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_ON_EXEC, &all_ranks,
PMIX_PROC_RANK); // All procs stop at first instruction
PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL); // All procs stop at first instruction
} else {
PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_IN_INIT, &all_ranks,
PMIX_PROC_RANK); // All procs stop in PMIx_Init
PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); // All procs stop in PMIx_Init
}
sprintf(map_str, "ppr:%d:node", app_npernode);
PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_MAPBY, map_str, PMIX_STRING); // app procs/node
Expand Down
17 changes: 6 additions & 11 deletions examples/debugger/direct.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2021 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -370,12 +370,10 @@ static int cospawn_launch(myrel_t *myrel)
n = 0;
if (stop_on_exec) {
/* Stop application at first instruction */
PMIX_INFO_LOAD(&app[n].info[0], PMIX_DEBUG_STOP_ON_EXEC, &all_ranks,
PMIX_PROC_RANK);
PMIX_INFO_LOAD(&app[n].info[0], PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL);
} else if (stop_in_init) {
/* Stop application in PMIx_Init */
PMIX_INFO_LOAD(&app[n].info[0], PMIX_DEBUG_STOP_IN_INIT, &all_ranks,
PMIX_PROC_RANK);
PMIX_INFO_LOAD(&app[n].info[0], PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL);
}
} else {
app[0].ninfo = 0;
Expand Down Expand Up @@ -583,9 +581,8 @@ int main(int argc, char **argv)
printf("$ %s [OPTIONS]\n", argv[0]);
printf("\n");
printf(" -c | --cospawn Test Cospawn\n");
printf(" --stop-in-init Stop application in init (Default)\n");
printf(" --stop-in-init Stop application in PMIx_Init (Default)\n");
printf(" --stop-on-exec Stop application on exec\n");
printf(" --stop-in-init Stop application in init (Default)\n");
printf(" --app-npernode Number of processes per node (Default: 2)\n");
printf(" --app-np Number of total processes. Must be multiple of "
"--app-npernode (Default: 2)\n");
Expand Down Expand Up @@ -816,12 +813,10 @@ int main(int argc, char **argv)
PMIX_INFO_LIST_START(dirs);
if (stop_on_exec) {
// procs are to stop on first instruction
PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_ON_EXEC, &all_ranks,
PMIX_PROC_RANK);
PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_ON_EXEC, NULL, PMIX_BOOL);
} else {
// procs are to pause in PMIx_Init for debugger attach
PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_IN_INIT, &all_ranks,
PMIX_PROC_RANK);
PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL);
}
sprintf(map_str, "ppr:%d:node", app_npernode);
PMIX_INFO_LIST_ADD(rc, dirs, PMIX_MAPBY, map_str, PMIX_STRING); // 1 per node
Expand Down
2 changes: 1 addition & 1 deletion examples/debugger/indirect-multi.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ static pmix_status_t spawn_app(char *myuri, int argc, char **argv,
* to do with the app it is going to spawn for us */
PMIX_INFO_LIST_START(linfo);
rank = PMIX_RANK_WILDCARD;
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_INIT, &rank, PMIX_PROC_RANK); // stop all procs in init
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); // stop all procs in PMIx_Init
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_NOTIFY_JOB_EVENTS, NULL, PMIX_BOOL);
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_FWD_STDERR, NULL, PMIX_BOOL); // forward stderr to me
Expand Down
4 changes: 2 additions & 2 deletions examples/debugger/indirect.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ int main(int argc, char **argv)
PMIX_INFO_LIST_START(linfo);
rank = PMIX_RANK_WILDCARD;
if (NULL != strstr(argv[1], "mpi")) {
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_APP, &rank, PMIX_PROC_RANK); // stop all procs in MPI_Init
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_APP, NULL, PMIX_BOOL); // stop all procs in MPI_Init
} else {
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_INIT, &rank, PMIX_PROC_RANK); // stop all procs in PMIx_Init
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); // stop all procs in PMIx_Init
}
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_NOTIFY_JOB_EVENTS, NULL, PMIX_BOOL);
PMIX_INFO_LIST_ADD(rc, linfo, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me
Expand Down
89 changes: 89 additions & 0 deletions examples/pset.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/

#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#include <pmix.h>

int main(int argc, char **argv)
{
int rc;
size_t n;
pmix_value_t value;
pmix_value_t *val = &value;
pmix_proc_t proc, *pptr;
pid_t pid;
pmix_proc_t myproc;

pid = getpid();

if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank,
rc);
exit(0);
}
fprintf(stderr, "Client ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank,
(unsigned long) pid);

/* get our pset name */
PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_PSET_NAME, NULL, 0, &val))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Get pset name failed: %s\n",
myproc.nspace, myproc.rank, PMIx_Error_string(rc));
goto done;
}
fprintf(stderr, "Client %s:%d pset name %s\n",
myproc.nspace, myproc.rank, val->data.string);
PMIX_VALUE_FREE(val, 1);

/* since this is our pset, get our membership */
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_PSET_MEMBERS, NULL, 0, &val))) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Get of pset members failed: %s\n",
myproc.nspace, myproc.rank, PMIx_Error_string(rc));
goto done;
}
/* must return a pmix_data_array_t of members */
if (PMIX_DATA_ARRAY != val->type) {
fprintf(stderr, "Client ns %s rank %d: PMIx_Get of pset members returned incorrect data type: %s\n",
myproc.nspace, myproc.rank, PMIx_Data_type_string(val->type));
goto done;
}
fprintf(stderr, "Client %s:%d PMIx_Get returned %d members\n", myproc.nspace, myproc.rank,
val->data.darray->size);
pptr = (pmix_proc_t*)val->data.darray->array;
for (n=0; n < val->data.darray->size; n++) {
fprintf(stderr, "\t%s:%d\n", pptr[n].nspace, pptr[n].rank);
}
PMIX_VALUE_FREE(val, 1);

done:
return (rc);
}
8 changes: 2 additions & 6 deletions src/mca/errmgr/dvm/errmgr_dvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ static void proc_errors(int fd, short args, void *cbdata)
prte_state_caddy_t *caddy = (prte_state_caddy_t *) cbdata;
prte_job_t *jdata;
prte_proc_t *pptr, *proct;
pmix_proc_t *proc = &caddy->name, parent;
pmix_proc_t *proc = &caddy->name;
prte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
Expand Down Expand Up @@ -580,18 +580,15 @@ static void check_send_notification(prte_job_t *jdata,
pmix_proc_t target;
pmix_data_buffer_t pbkt;
pmix_data_range_t range = PMIX_RANGE_CUSTOM;
pmix_status_t cret;

// pmix_output_verbose(5, prte_state_base_framework.framework_output,
pmix_output(0,
pmix_output_verbose(5, prte_state_base_framework.framework_output,
"%s errmgr:dvm:sending notification %s affected proc %s",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
PMIx_Error_string(event),
PRTE_NAME_PRINT(&proc->name));

if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_NOTIFY_ERRORS, NULL, PMIX_BOOL) ||
prte_dvm_abort_ordered) {
pmix_output(0, "NOT NOTIFYING");
return;
}
/* we checked for termination due to the specific error we encountered, but
Expand All @@ -601,7 +598,6 @@ static void check_send_notification(prte_job_t *jdata,
if (PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_ABORTED)) {
/* this job has already been aborted, so we don't need to notify
* about the fate of any proc within it */
pmix_output(0, "JOB ABORTED");
return;
}
/* notify the other procs of the termination */
Expand Down
1 change: 0 additions & 1 deletion src/mca/ess/base/ess_base_std_prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ int prte_ess_base_prted_setup(void)
prte_job_t *jdata;
prte_proc_t *proc;
prte_app_context_t *app;
char *param;
hwloc_obj_t obj;
unsigned i, j;
prte_topology_t *t;
Expand Down
1 change: 1 addition & 0 deletions src/mca/ess/env/ess_env_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ static int rte_init(int argc, char **argv)
{
int ret;
char *error = NULL;
PRTE_HIDE_UNUSED_PARAMS(argc, argv);

/* run the prolog */
if (PRTE_SUCCESS != (ret = prte_ess_base_std_prolog())) {
Expand Down
5 changes: 3 additions & 2 deletions src/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,11 @@ static int rte_init(int argc, char **argv)
prte_node_t *node;
prte_proc_t *proc;
prte_app_context_t *app;
char *coprocessors, **sns;
uint32_t h;
int idx;
prte_topology_t *t;
pmix_value_t pval;
pmix_status_t pret;
PRTE_HIDE_UNUSED_PARAMS(argc);

/* run the prolog */
if (PRTE_SUCCESS != (ret = prte_ess_base_std_prolog())) {
Expand Down Expand Up @@ -536,6 +535,8 @@ static int rte_finalize(void)

static void rte_abort(int status, bool report)
{
PRTE_HIDE_UNUSED_PARAMS(report);

pmix_output(0, "ABORT");
/* do NOT do a normal finalize as this will very likely
* hang the process. We are aborting due to an abnormal condition
Expand Down
1 change: 1 addition & 0 deletions src/mca/ess/slurm/ess_slurm_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ static int rte_init(int argc, char **argv)
{
int ret;
char *error = NULL;
PRTE_HIDE_UNUSED_PARAMS(argc, argv);

/* run the prolog */
if (PRTE_SUCCESS != (ret = prte_ess_base_std_prolog())) {
Expand Down
4 changes: 4 additions & 0 deletions src/mca/filem/raw/filem_raw_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ static void recv_ack(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer
prte_filem_raw_xfer_t *xfer;
char *file;
int st, n, rc;
PRTE_HIDE_UNUSED_PARAMS(status, tag, cbdata);

/* unpack the file */
n = 1;
Expand Down Expand Up @@ -732,6 +733,7 @@ static void send_chunk(int xxx, short argc, void *cbdata)
int rc;
pmix_data_buffer_t chunk;
prte_grpcomm_signature_t *sig;
PRTE_HIDE_UNUSED_PARAMS(xxx, argc);

PMIX_ACQUIRE_OBJECT(rev);

Expand Down Expand Up @@ -930,6 +932,7 @@ static void recv_files(int status, pmix_proc_t *sender, pmix_data_buffer_t *buff
pmix_list_item_t *item;
int32_t type;
char *cptr;
PRTE_HIDE_UNUSED_PARAMS(status, sender, tag, cbdata);

/* unpack the data */
n = 1;
Expand Down Expand Up @@ -1086,6 +1089,7 @@ static void write_handler(int fd, short event, void *cbdata)
char *dirname, *cmd;
char homedir[MAXPATHLEN];
int rc;
PRTE_HIDE_UNUSED_PARAMS(fd, event);

PMIX_ACQUIRE_OBJECT(sink);

Expand Down
7 changes: 0 additions & 7 deletions src/mca/grpcomm/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,9 @@ PRTE_EXPORT int prte_grpcomm_API_xcast(prte_grpcomm_signature_t *sig, prte_rml_t
PRTE_EXPORT int prte_grpcomm_API_allgather(prte_grpcomm_signature_t *sig, pmix_data_buffer_t *buf,
int mode, pmix_status_t local_status,
prte_grpcomm_cbfunc_t cbfunc, void *cbdata);
/* reliable broadcast API */
PRTE_EXPORT int prte_grpcomm_API_rbcast(prte_grpcomm_signature_t *sig, prte_rml_tag_t tag,
pmix_data_buffer_t *buf);
PRTE_EXPORT int prte_grpcomm_API_register_cb(prte_grpcomm_rbcast_cb_t callback);

PRTE_EXPORT prte_grpcomm_coll_t *prte_grpcomm_base_get_tracker(prte_grpcomm_signature_t *sig,
bool create);
PRTE_EXPORT void prte_grpcomm_base_mark_distance_recv(prte_grpcomm_coll_t *coll, uint32_t distance);
PRTE_EXPORT unsigned int prte_grpcomm_base_check_distance_recv(prte_grpcomm_coll_t *coll,
uint32_t distance);

END_C_DECLS
#endif
7 changes: 1 addition & 6 deletions src/mca/grpcomm/base/grpcomm_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,7 @@ prte_grpcomm_base_t prte_grpcomm_base = {

prte_grpcomm_API_module_t prte_grpcomm = {
.xcast = prte_grpcomm_API_xcast,
.allgather = prte_grpcomm_API_allgather,
.rbcast = prte_grpcomm_API_rbcast,
.register_cb = prte_grpcomm_API_register_cb,
.unregister_cb = NULL
.allgather = prte_grpcomm_API_allgather
};

static int base_register(pmix_mca_base_register_flag_t flags)
Expand Down Expand Up @@ -143,7 +140,6 @@ static void ccon(prte_grpcomm_coll_t *p)
p->sig = NULL;
p->status = PMIX_SUCCESS;
PMIX_DATA_BUFFER_CONSTRUCT(&p->bucket);
PMIX_CONSTRUCT(&p->distance_mask_recv, pmix_bitmap_t);
p->dmns = NULL;
p->ndmns = 0;
p->nexpected = 0;
Expand All @@ -158,7 +154,6 @@ static void cdes(prte_grpcomm_coll_t *p)
PMIX_RELEASE(p->sig);
}
PMIX_DATA_BUFFER_DESTRUCT(&p->bucket);
PMIX_DESTRUCT(&p->distance_mask_recv);
free(p->dmns);
free(p->buffers);
}
Expand Down
Loading