From 19acfa7ab8f4f3cf62b77c165cc7c914ec74fd1f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 19 Nov 2023 15:06:17 -0700 Subject: [PATCH] Remove the event base param to prte_wait_cb Reduce potential for mistakes by locking the wait callbacks to occur in the main event thread. Signed-off-by: Ralph Castain (cherry picked from commit ad63c0f30ea8499fd25a5591a2bd4f7e1f134b4b) --- src/mca/errmgr/prted/errmgr_prted.c | 3 +-- src/mca/odls/base/odls_base_default_fns.c | 4 ++-- src/mca/plm/alps/plm_alps_module.c | 2 +- src/mca/plm/slurm/plm_slurm_module.c | 2 +- src/mca/plm/ssh/plm_ssh_module.c | 2 +- src/runtime/prte_wait.c | 18 ++++++++++-------- src/runtime/prte_wait.h | 6 +++--- 7 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/mca/errmgr/prted/errmgr_prted.c b/src/mca/errmgr/prted/errmgr_prted.c index 8b1d7671f0..ca228a8e4e 100644 --- a/src/mca/errmgr/prted/errmgr_prted.c +++ b/src/mca/errmgr/prted/errmgr_prted.c @@ -410,8 +410,7 @@ static void proc_errors(int fd, short args, void *cbdata) t2 = PMIX_NEW(prte_wait_tracker_t); PMIX_RETAIN(child); // protect against race conditions t2->child = child; - t2->evb = prte_event_base; - prte_event_set(t2->evb, &t2->ev, -1, PRTE_EV_WRITE, + prte_event_set(prte_event_base, &t2->ev, -1, PRTE_EV_WRITE, prte_odls_base_default_wait_local_proc, t2); prte_event_active(&t2->ev, PRTE_EV_WRITE, 1); goto cleanup; diff --git a/src/mca/odls/base/odls_base_default_fns.c b/src/mca/odls/base/odls_base_default_fns.c index 8ae51218b4..fcb04929cd 100644 --- a/src/mca/odls/base/odls_base_default_fns.c +++ b/src/mca/odls/base/odls_base_default_fns.c @@ -1397,7 +1397,7 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata) /* set the waitpid callback here for thread protection and * to ensure we can capture the callback on shortlived apps */ PRTE_FLAG_SET(child, PRTE_PROC_FLAG_ALIVE); - prte_wait_cb(child, prte_odls_base_default_wait_local_proc, prte_event_base, NULL); + prte_wait_cb(child, prte_odls_base_default_wait_local_proc, NULL); /* dispatch this child to the next available launch thread */ cd = PMIX_NEW(prte_odls_spawn_caddy_t); @@ -2053,7 +2053,7 @@ int prte_odls_base_default_restart_proc(prte_proc_t *child, prte_odls_globals.next_base = 0; } evb = prte_odls_globals.ev_bases[prte_odls_globals.next_base]; - prte_wait_cb(child, prte_odls_base_default_wait_local_proc, prte_event_base, NULL); + prte_wait_cb(child, prte_odls_base_default_wait_local_proc, NULL); PMIX_OUTPUT_VERBOSE((5, prte_odls_base_framework.framework_output, "%s restarting app %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), app->app)); diff --git a/src/mca/plm/alps/plm_alps_module.c b/src/mca/plm/alps/plm_alps_module.c index 477ed6f64f..c031537ab7 100644 --- a/src/mca/plm/alps/plm_alps_module.c +++ b/src/mca/plm/alps/plm_alps_module.c @@ -558,7 +558,7 @@ static int plm_alps_start_proc(int argc, char **argv, char **env, char *prefix) /* be sure to mark it as alive so we don't instantly fire */ PRTE_FLAG_SET(alpsrun, PRTE_PROC_FLAG_ALIVE); /* setup the waitpid so we can find out if alps succeeds! */ - prte_wait_cb(alpsrun, alps_wait_cb, prte_event_base, NULL); + prte_wait_cb(alpsrun, alps_wait_cb, NULL); if (0 == alps_pid) { /* child */ char *bin_base = NULL, *lib_base = NULL; diff --git a/src/mca/plm/slurm/plm_slurm_module.c b/src/mca/plm/slurm/plm_slurm_module.c index f822513b7d..2edae0312a 100644 --- a/src/mca/plm/slurm/plm_slurm_module.c +++ b/src/mca/plm/slurm/plm_slurm_module.c @@ -654,7 +654,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char *prefix) /* be sure to mark it as alive so we don't instantly fire */ PRTE_FLAG_SET(dummy, PRTE_PROC_FLAG_ALIVE); /* setup the waitpid so we can find out if srun succeeds! */ - prte_wait_cb(dummy, srun_wait_cb, prte_event_base, NULL); + prte_wait_cb(dummy, srun_wait_cb, NULL); if (0 == srun_pid) { /* child */ char *bin_base = NULL, *lib_base = NULL; diff --git a/src/mca/plm/ssh/plm_ssh_module.c b/src/mca/plm/ssh/plm_ssh_module.c index 51fe10d15e..3608ae64a7 100644 --- a/src/mca/plm/ssh/plm_ssh_module.c +++ b/src/mca/plm/ssh/plm_ssh_module.c @@ -932,7 +932,7 @@ static void process_launch_list(int fd, short args, void *cbdata) caddy = (prte_plm_ssh_caddy_t *) item; /* register the sigchild callback */ PRTE_FLAG_SET(caddy->daemon, PRTE_PROC_FLAG_ALIVE); - prte_wait_cb(caddy->daemon, ssh_wait_daemon, prte_event_base, (void *) caddy); + prte_wait_cb(caddy->daemon, ssh_wait_daemon, (void *) caddy); /* fork a child to exec the ssh/ssh session */ pid = fork(); diff --git a/src/runtime/prte_wait.c b/src/runtime/prte_wait.c index c82c9f43e1..e90129f2b1 100644 --- a/src/runtime/prte_wait.c +++ b/src/runtime/prte_wait.c @@ -75,7 +75,8 @@ static void timer_dest(prte_timer_t *tm) { prte_event_free(tm->ev); } -PMIX_CLASS_INSTANCE(prte_timer_t, pmix_object_t, timer_const, timer_dest); +PMIX_CLASS_INSTANCE(prte_timer_t, pmix_object_t, + timer_const, timer_dest); static void wccon(prte_wait_tracker_t *p) { @@ -89,7 +90,8 @@ static void wcdes(prte_wait_tracker_t *p) PMIX_RELEASE(p->child); } } -PMIX_CLASS_INSTANCE(prte_wait_tracker_t, pmix_list_item_t, wccon, wcdes); +PMIX_CLASS_INSTANCE(prte_wait_tracker_t, pmix_list_item_t, + wccon, wcdes); /* Local Variables */ static prte_event_t handler; @@ -114,7 +116,8 @@ int prte_wait_init(void) { PMIX_CONSTRUCT(&pending_cbs, pmix_list_t); - prte_event_set(prte_event_base, &handler, SIGCHLD, PRTE_EV_SIGNAL | PRTE_EV_PERSIST, + prte_event_set(prte_event_base, &handler, SIGCHLD, + PRTE_EV_SIGNAL | PRTE_EV_PERSIST, wait_signal_callback, &handler); prte_event_add(&handler, NULL); @@ -133,7 +136,8 @@ int prte_wait_finalize(void) /* this function *must* always be called from * within an event in the prte_event_base */ -void prte_wait_cb(prte_proc_t *child, prte_wait_cbfunc_t callback, prte_event_base_t *evb, +void prte_wait_cb(prte_proc_t *child, + prte_wait_cbfunc_t callback, void *data) { prte_wait_tracker_t *t2; @@ -151,10 +155,9 @@ void prte_wait_cb(prte_proc_t *child, prte_wait_cbfunc_t callback, prte_event_ba t2 = PMIX_NEW(prte_wait_tracker_t); PMIX_RETAIN(child); // protect against race conditions t2->child = child; - t2->evb = evb; t2->cbfunc = callback; t2->cbdata = data; - prte_event_set(t2->evb, &t2->ev, -1, PRTE_EV_WRITE, t2->cbfunc, t2); + prte_event_set(prte_event_base, &t2->ev, -1, PRTE_EV_WRITE, t2->cbfunc, t2); prte_event_active(&t2->ev, PRTE_EV_WRITE, 1); } return; @@ -173,7 +176,6 @@ void prte_wait_cb(prte_proc_t *child, prte_wait_cbfunc_t callback, prte_event_ba t2 = PMIX_NEW(prte_wait_tracker_t); PMIX_RETAIN(child); // protect against race conditions t2->child = child; - t2->evb = evb; t2->cbfunc = callback; t2->cbdata = data; pmix_list_append(&pending_cbs, &t2->super); @@ -254,7 +256,7 @@ static void wait_signal_callback(int fd, short event, void *arg) t2->child->exit_code = status; pmix_list_remove_item(&pending_cbs, &t2->super); if (NULL != t2->cbfunc) { - prte_event_set(t2->evb, &t2->ev, -1, PRTE_EV_WRITE, t2->cbfunc, t2); + prte_event_set(prte_event_base, &t2->ev, -1, PRTE_EV_WRITE, t2->cbfunc, t2); prte_event_active(&t2->ev, PRTE_EV_WRITE, 1); } else { PMIX_RELEASE(t2); diff --git a/src/runtime/prte_wait.h b/src/runtime/prte_wait.h index 6c560ffea5..9cbfc2ef2c 100644 --- a/src/runtime/prte_wait.h +++ b/src/runtime/prte_wait.h @@ -63,7 +63,6 @@ typedef void (*prte_wait_cbfunc_t)(int fd, short args, void *cb); typedef struct { pmix_list_item_t super; prte_event_t ev; - prte_event_base_t *evb; prte_proc_t *child; prte_wait_cbfunc_t cbfunc; void *cbdata; @@ -86,8 +85,9 @@ PRTE_EXPORT void prte_wait_disable(void); * \c waitpid() will have already been called on the process at this * time. */ -PRTE_EXPORT void prte_wait_cb(prte_proc_t *proc, prte_wait_cbfunc_t callback, - prte_event_base_t *evb, void *data); +PRTE_EXPORT void prte_wait_cb(prte_proc_t *proc, + prte_wait_cbfunc_t callback, + void *data); PRTE_EXPORT void prte_wait_cb_cancel(prte_proc_t *proc);