Skip to content

Commit

Permalink
Merge pull request #1232 from rhc54/topic/stg3
Browse files Browse the repository at this point in the history
RML Rework: Stage 3 - remove routed framework
  • Loading branch information
rhc54 authored Mar 2, 2022
2 parents 88831e3 + 17368b1 commit dbbcb66
Show file tree
Hide file tree
Showing 61 changed files with 472 additions and 1,597 deletions.
5 changes: 1 addition & 4 deletions src/mca/errmgr/base/errmgr_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@

#include "src/mca/base/base.h"
#include "src/mca/mca.h"

#include "src/util/pmix_argv.h"
#include "src/util/pmix_basename.h"
#include "src/util/pmix_os_dirpath.h"
#include "src/util/output.h"
#include "src/util/pmix_printf.h"

#include "src/util/name_fns.h"
#include "src/util/proc_info.h"
#include "src/util/session_dir.h"
Expand All @@ -69,9 +69,6 @@
#include "src/mca/ess/ess.h"
#include "src/mca/odls/odls.h"
#include "src/mca/plm/plm.h"
#include "src/rml/rml.h"
#include "src/rml/rml_types.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"

#include "src/mca/errmgr/base/base.h"
Expand Down
7 changes: 3 additions & 4 deletions src/mca/errmgr/detector/errmgr_detector.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#include "src/mca/plm/plm.h"
#include "src/mca/rmaps/rmaps_types.h"
#include "src/rml/rml.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"
#include "src/threads/pmix_threads.h"

Expand Down Expand Up @@ -213,7 +212,7 @@ static void error_notify_cbfunc(size_t evhdlr_registration_id, pmix_status_t sta
}

/* send this process's info to hnp */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM, NULL);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM, NULL);
if (PRTE_SUCCESS != rc) {
PRTE_OUTPUT_VERBOSE((5, prte_errmgr_base_framework.framework_output,
"%s errmgr:detector: send to hnp failed",
Expand Down Expand Up @@ -414,7 +413,7 @@ static int pmix_fd_heartbeat_request(prte_errmgr_detector_t *detector)
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
}
PRTE_RML_SEND(rc, &daemon, buffer, PRTE_RML_TAG_HEARTBEAT_REQUEST, NULL);
PRTE_RML_SEND(rc, daemon.rank, buffer, PRTE_RML_TAG_HEARTBEAT_REQUEST, NULL);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(buffer);
Expand Down Expand Up @@ -542,7 +541,7 @@ static void pmix_fd_heartbeat_send(prte_errmgr_detector_t *detector)
PMIX_ERROR_LOG(rc);
}
/* send the heartbeat with eager send */
PRTE_RML_SEND(rc, &daemon, buffer, PRTE_RML_TAG_HEARTBEAT, NULL);
PRTE_RML_SEND(rc, daemon.rank, buffer, PRTE_RML_TAG_HEARTBEAT, NULL);
if (PRTE_SUCCESS != rc) {
PRTE_OUTPUT_VERBOSE((5, prte_errmgr_base_framework.framework_output,
"errmgr:detector:failed to send heartbeat to %s",
Expand Down
22 changes: 10 additions & 12 deletions src/mca/errmgr/dvm/errmgr_dvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
#include "src/mca/plm/plm.h"
#include "src/mca/rmaps/rmaps_types.h"
#include "src/rml/rml.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"

#include "src/threads/pmix_threads.h"
Expand Down Expand Up @@ -199,7 +198,7 @@ static void error_notify_cbfunc(size_t evhdlr_registration_id, pmix_status_t sta
}

/* send this process's info to hnp */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM, NULL);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM, NULL);
if (PRTE_SUCCESS != rc) {
PRTE_OUTPUT_VERBOSE((5, prte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: send to hnp failed",
Expand Down Expand Up @@ -433,15 +432,14 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s Comm failure: daemons terminating - recording daemon %s as gone",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(proc)));
/* remove from dependent routes, if it is one */
prte_routed.route_lost(proc);
prte_rml_route_lost(proc->rank);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == prte_routed.num_routes()) {
if (0 == pmix_list_get_size(&prte_rml_base.children)) {
for (i = 0; i < prte_local_children->size; i++) {
if (NULL
!= (proct = (prte_proc_t *)
pmix_pointer_array_get_item(prte_local_children, i))
&& PRTE_FLAG_TEST(pptr, PRTE_PROC_FLAG_ALIVE)
&& proct->state < PRTE_PROC_STATE_UNTERMINATED) {
proct = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i);
if (NULL != proct &&
PRTE_FLAG_TEST(pptr, PRTE_PROC_FLAG_ALIVE) &&
proct->state < PRTE_PROC_STATE_UNTERMINATED) {
/* at least one is still alive */
PRTE_OUTPUT_VERBOSE((5, prte_errmgr_base_framework.framework_output,
"%s Comm failure: at least one proc (%s) still alive",
Expand All @@ -459,7 +457,7 @@ static void proc_errors(int fd, short args, void *cbdata)
PRTE_OUTPUT_VERBOSE((5, prte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
(int) prte_routed.num_routes()));
(int) pmix_list_get_size(&prte_rml_base.children)));
}
goto cleanup;
}
Expand Down Expand Up @@ -512,7 +510,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == prte_routed.num_routes()) {
if (0 == pmix_list_get_size(&prte_rml_base.children)) {
PRTE_OUTPUT_VERBOSE((2, prte_errmgr_base_framework.framework_output,
"%s errmgr:default:dvm all routes gone - exiting",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
Expand Down Expand Up @@ -725,7 +723,7 @@ static void proc_errors(int fd, short args, void *cbdata)
_terminate_job(jdata->nspace);
}
/* remove from dependent routes, if it is one */
prte_routed.route_lost(proc);
prte_rml_route_lost(proc->rank);
break;

case PRTE_PROC_STATE_UNABLE_TO_SEND_MSG:
Expand Down
17 changes: 8 additions & 9 deletions src/mca/errmgr/prted/errmgr_prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
#include "src/mca/odls/odls.h"
#include "src/mca/plm/plm_types.h"
#include "src/rml/rml.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"

#include "src/runtime/prte_globals.h"
Expand Down Expand Up @@ -213,7 +212,7 @@ static void prted_abort(int error_code, char *fmt, ...)
}

/* send it */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_RELEASE(alert);
Expand Down Expand Up @@ -306,7 +305,7 @@ static void job_errors(int fd, short args, void *cbdata)
goto cleanup;
}
/* send it */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_RELEASE(alert);
Expand Down Expand Up @@ -442,7 +441,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == prte_routed.num_routes()) {
if (0 == pmix_list_get_size(&prte_rml_base.children)) {
PRTE_OUTPUT_VERBOSE((2, prte_errmgr_base_framework.framework_output,
"%s errmgr:default:prted all routes gone - exiting",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
Expand All @@ -451,7 +450,7 @@ static void proc_errors(int fd, short args, void *cbdata)
PRTE_OUTPUT_VERBOSE((2, prte_errmgr_base_framework.framework_output,
"%s errmgr:default:prted not exiting, num_routes() == %d",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
(int) prte_routed.num_routes()));
(int) pmix_list_get_size(&prte_rml_base.children)));
}
}
/* if not, then we can continue */
Expand Down Expand Up @@ -514,7 +513,7 @@ static void proc_errors(int fd, short args, void *cbdata)
"non-zero status (local procs = %d)",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&child->name),
jdata->num_local_procs));
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_RELEASE(alert);
Expand Down Expand Up @@ -580,7 +579,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == prte_routed.num_routes()) {
if (0 == pmix_list_get_size(&prte_rml_base.children)) {
PRTE_OUTPUT_VERBOSE((2, prte_errmgr_base_framework.framework_output,
"%s errmgr:default:prted all routes gone - exiting",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));
Expand Down Expand Up @@ -627,7 +626,7 @@ static void proc_errors(int fd, short args, void *cbdata)
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&child->name),
jdata->num_local_procs));
/* send it */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(alert);
Expand Down Expand Up @@ -686,7 +685,7 @@ static void proc_errors(int fd, short args, void *cbdata)
PMIX_RELEASE(jdata);

/* send it */
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, alert, PRTE_RML_TAG_PLM);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, alert, PRTE_RML_TAG_PLM);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(alert);
Expand Down
18 changes: 1 addition & 17 deletions src/mca/ess/base/ess_base_std_prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@
#include "src/mca/plm/base/base.h"
#include "src/mca/prtereachable/base/base.h"
#include "src/mca/rmaps/base/base.h"
#include "src/mca/routed/base/base.h"
#include "src/mca/routed/routed.h"
#include "src/mca/rtc/base/base.h"
#include "src/mca/schizo/base/base.h"
#include "src/mca/state/base/base.h"
Expand Down Expand Up @@ -329,19 +327,6 @@ int prte_ess_base_prted_setup(void)
}

/* Setup the communication infrastructure */
/* Routed system */
if (PRTE_SUCCESS
!= (ret = prte_mca_base_framework_open(&prte_routed_base_framework,
PRTE_MCA_BASE_OPEN_DEFAULT))) {
PRTE_ERROR_LOG(ret);
error = "prte_routed_base_open";
goto error;
}
if (PRTE_SUCCESS != (ret = prte_routed_base_select())) {
PRTE_ERROR_LOG(ret);
error = "prte_routed_base_select";
goto error;
}
if (PRTE_SUCCESS
!= (ret = prte_mca_base_framework_open(&prte_prtereachable_base_framework,
PRTE_MCA_BASE_OPEN_DEFAULT))) {
Expand Down Expand Up @@ -596,7 +581,6 @@ int prte_ess_base_prted_finalize(void)
prte_odls.kill_local_procs(NULL);
(void) prte_mca_base_framework_close(&prte_rtc_base_framework);
(void) prte_mca_base_framework_close(&prte_odls_base_framework);
(void) prte_mca_base_framework_close(&prte_routed_base_framework);
(void) prte_mca_base_framework_close(&prte_errmgr_base_framework);
prte_rml_close();
(void) prte_mca_base_framework_close(&prte_oob_base_framework);
Expand Down Expand Up @@ -672,7 +656,7 @@ static void signal_forward_callback(int fd, short event, void *arg)
}

/* send it to ourselves */
PRTE_RML_SEND(rc, PRTE_PROC_MY_NAME, cmd, PRTE_RML_TAG_DAEMON);
PRTE_RML_SEND(rc, PRTE_PROC_MY_NAME->rank, cmd, PRTE_RML_TAG_DAEMON);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(cmd);
Expand Down
2 changes: 0 additions & 2 deletions src/mca/ess/env/ess_env_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@
#include "src/mca/plm/base/base.h"
#include "src/mca/ras/base/base.h"
#include "src/rml/rml.h"
#include "src/mca/routed/base/base.h"
#include "src/mca/routed/routed.h"

#include "src/mca/filem/base/base.h"
#include "src/mca/rmaps/base/base.h"
Expand Down
18 changes: 0 additions & 18 deletions src/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@
#include "src/mca/prtereachable/base/base.h"
#include "src/mca/ras/base/base.h"
#include "src/mca/rmaps/base/base.h"
#include "src/mca/routed/base/base.h"
#include "src/mca/routed/routed.h"
#include "src/mca/rtc/base/base.h"
#include "src/mca/schizo/base/base.h"
#include "src/mca/state/base/base.h"
Expand Down Expand Up @@ -220,21 +218,6 @@ static int rte_init(int argc, char **argv)
goto error;
}
/* Setup the communication infrastructure */
/*
* Routed system
*/
if (PRTE_SUCCESS
!= (ret = prte_mca_base_framework_open(&prte_routed_base_framework,
PRTE_MCA_BASE_OPEN_DEFAULT))) {
PRTE_ERROR_LOG(ret);
error = "prte_routed_base_open";
goto error;
}
if (PRTE_SUCCESS != (ret = prte_routed_base_select())) {
PRTE_ERROR_LOG(ret);
error = "prte_routed_base_select";
goto error;
}
if (PRTE_SUCCESS
!= (ret = prte_mca_base_framework_open(&prte_prtereachable_base_framework,
PRTE_MCA_BASE_OPEN_DEFAULT))) {
Expand Down Expand Up @@ -598,7 +581,6 @@ static int rte_finalize(void)
prte_odls.kill_local_procs(NULL);
(void) prte_mca_base_framework_close(&prte_rtc_base_framework);
(void) prte_mca_base_framework_close(&prte_odls_base_framework);
(void) prte_mca_base_framework_close(&prte_routed_base_framework);
prte_rml_close();
(void) prte_mca_base_framework_close(&prte_oob_base_framework);
(void) prte_mca_base_framework_close(&prte_prtereachable_base_framework);
Expand Down
4 changes: 2 additions & 2 deletions src/mca/filem/base/filem_base_receive.c
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ static void filem_base_process_get_proc_node_name_cmd(pmix_proc_t *sender,
return;
}

PRTE_RML_SEND(rc, sender, answer, PRTE_RML_TAG_FILEM_BASE_RESP);
PRTE_RML_SEND(rc, sender->rank, answer, PRTE_RML_TAG_FILEM_BASE_RESP);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT);
Expand Down Expand Up @@ -294,7 +294,7 @@ static void filem_base_process_get_remote_path_cmd(pmix_proc_t *sender, pmix_dat
goto CLEANUP;
}

PRTE_RML_SEND(rc, sender, answer, PRTE_RML_TAG_FILEM_BASE_RESP);
PRTE_RML_SEND(rc, sender->rank, answer, PRTE_RML_TAG_FILEM_BASE_RESP);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PRTE_ACTIVATE_JOB_STATE(NULL, PRTE_JOB_STATE_FORCED_EXIT);
Expand Down
2 changes: 1 addition & 1 deletion src/mca/filem/raw/filem_raw_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ static void send_complete(char *file, int status)
PMIX_DATA_BUFFER_RELEASE(buf);
return;
}
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP, buf, PRTE_RML_TAG_FILEM_BASE_RESP);
PRTE_RML_SEND(rc, PRTE_PROC_MY_HNP->rank, buf, PRTE_RML_TAG_FILEM_BASE_RESP);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
PMIX_RELEASE(buf);
Expand Down
3 changes: 1 addition & 2 deletions src/mca/grpcomm/base/grpcomm_base_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#include "src/mca/odls/base/base.h"
#include "src/mca/rmaps/rmaps_types.h"
#include "src/rml/rml.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"
#include "src/pmix/pmix-internal.h"
#include "src/runtime/prte_globals.h"
Expand Down Expand Up @@ -331,7 +330,7 @@ prte_grpcomm_coll_t *prte_grpcomm_base_get_tracker(prte_grpcomm_signature_t *sig
}

/* count the number of contributions we should get */
coll->nexpected = prte_routed.get_num_contributors(coll->dmns, coll->ndmns);
coll->nexpected = prte_rml_get_num_contributors(coll->dmns, coll->ndmns);

/* see if I am in the array of participants - note that I may
* be in the rollup tree even though I'm not participating
Expand Down
2 changes: 0 additions & 2 deletions src/mca/grpcomm/bmg/grpcomm_bmg_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@
#include "src/mca/errmgr/errmgr.h"
#include "src/mca/grpcomm/base/base.h"
#include "src/rml/rml.h"
#include "src/mca/routed/base/base.h"
#include "src/mca/routed/routed.h"
#include "src/mca/state/state.h"
#include "src/util/name_fns.h"
#include "src/util/nidmap.h"
Expand Down
Loading

0 comments on commit dbbcb66

Please sign in to comment.