Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add accelerator and part frameworks to mca lists #10949

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions ompi/instance/instance.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,15 @@ static void ompi_instance_destruct(ompi_instance_t *instance)

OBJ_CLASS_INSTANCE(ompi_instance_t, opal_infosubscriber_t, ompi_instance_construct, ompi_instance_destruct);

/* NTH: frameworks needed by MPI */
/* OMPI MCA frameworks needed by MPI. New frameworks need to be added to this list */
static mca_base_framework_t *ompi_framework_dependencies[] = {
&ompi_hook_base_framework, &ompi_op_base_framework,
&opal_allocator_base_framework, &opal_rcache_base_framework, &opal_mpool_base_framework, &opal_smsc_base_framework,
&ompi_bml_base_framework, &ompi_pml_base_framework, &ompi_coll_base_framework,
&ompi_osc_base_framework, NULL,
&ompi_osc_base_framework, &ompi_part_base_framework, NULL,
};

/* OMPI MCA frameworks that can be opened multiple times need to be added to this list */
static mca_base_framework_t *ompi_lazy_frameworks[] = {
&ompi_io_base_framework, &ompi_topo_base_framework, NULL,
};
Expand Down Expand Up @@ -642,11 +643,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
return ompi_instance_print_error ("ompi_win_init() failed", ret);
}

/* initialize partcomm */
if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_part_base_framework, 0))) {
return ompi_instance_print_error ("mca_part_base_select() failed", ret);
}

/* select partcomm */
if (OMPI_SUCCESS != (ret = mca_part_base_select (true, true))) {
return ompi_instance_print_error ("mca_part_base_select() failed", ret);
}
Expand Down
5 changes: 4 additions & 1 deletion opal/mca/btl/smcuda/btl_smcuda_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ static int accelerator_event_max = 400;
static int accelerator_event_ipc_most = 0;
static bool smcuda_accelerator_initialized = false;

static void mca_btl_smcuda_accelerator_fini(void);

int mca_btl_smcuda_accelerator_init(void)
{
int rc = OPAL_SUCCESS;
Expand Down Expand Up @@ -79,6 +81,7 @@ int mca_btl_smcuda_accelerator_init(void)
goto cleanup_and_error;
}

opal_finalize_register_cleanup(mca_btl_smcuda_accelerator_fini);
smcuda_accelerator_initialized = true;

cleanup_and_error:
Expand All @@ -103,7 +106,7 @@ int mca_btl_smcuda_accelerator_init(void)
return rc;
}

void mca_btl_smcuda_accelerator_fini(void)
static void mca_btl_smcuda_accelerator_fini(void)
{
int i;

Expand Down
1 change: 0 additions & 1 deletion opal/mca/btl/smcuda/btl_smcuda_accelerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "opal/mca/btl/btl.h"

OPAL_DECLSPEC int mca_btl_smcuda_accelerator_init(void);
OPAL_DECLSPEC void mca_btl_smcuda_accelerator_fini(void);
OPAL_DECLSPEC int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **frag);
OPAL_DECLSPEC int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
struct mca_btl_base_descriptor_t *frag);
Expand Down
27 changes: 17 additions & 10 deletions opal/mca/btl/smcuda/btl_smcuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2022 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -68,6 +70,7 @@ static int mca_btl_smcuda_component_close(void);
static int smcuda_register(void);
static mca_btl_base_module_t **
mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool enable_mpi_threads);
static void mca_btl_smcuda_component_fini(void);

typedef enum {
MCA_BTL_SM_RNDV_MOD_SM = 0,
Expand Down Expand Up @@ -260,6 +263,9 @@ static int mca_btl_smcuda_component_open(void)
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, opal_free_list_t);
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t);
OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);

opal_finalize_register_cleanup(mca_btl_smcuda_component_fini);

return OPAL_SUCCESS;
}

Expand All @@ -269,7 +275,12 @@ static int mca_btl_smcuda_component_open(void)

static int mca_btl_smcuda_component_close(void)
{
int return_value = OPAL_SUCCESS;
return OPAL_SUCCESS;
}

static void mca_btl_smcuda_component_fini(void)
{
int rc;

OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
/**
Expand All @@ -282,11 +293,11 @@ static int mca_btl_smcuda_component_close(void)

/* unmap the shared memory control structure */
if (mca_btl_smcuda_component.sm_seg != NULL) {
return_value = mca_common_sm_fini(mca_btl_smcuda_component.sm_seg);
if (OPAL_SUCCESS != return_value) {
return_value = OPAL_ERROR;
rc = mca_common_sm_fini(mca_btl_smcuda_component.sm_seg);
if (OPAL_SUCCESS != rc) {
rc = OPAL_ERROR;
opal_output(0, " mca_common_sm_fini failed\n");
goto CLEANUP;
return;
}

/* unlink file, so that it will be deleted when all references
Expand All @@ -311,12 +322,8 @@ static int mca_btl_smcuda_component_close(void)
}
#endif

CLEANUP:

mca_btl_smcuda_accelerator_fini();

/* return */
return return_value;
return;
}

/*
Expand Down
18 changes: 13 additions & 5 deletions opal/runtime/opal_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* All Rights reserved.
* Copyright (c) 2018 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2018-2019 Triad National Security, LLC. All rights
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
Expand Down Expand Up @@ -120,6 +120,8 @@ int opal_init_psm(void)
return OPAL_SUCCESS;
}

/* MCA frameworks needed by OPAL. New frameworks need to be added to this list */

/* the memcpy component should be one of the first who get
* loaded in order to make sure we have all the available
* versions of memcpy correctly configured.
Expand All @@ -130,7 +132,7 @@ static mca_base_framework_t *opal_init_frameworks[] = {
&opal_memcpy_base_framework, &opal_memchecker_base_framework,
&opal_backtrace_base_framework, &opal_timer_base_framework,
&opal_shmem_base_framework, &opal_reachable_base_framework,
&opal_pmix_base_framework,
&opal_pmix_base_framework, &opal_accelerator_base_framework,
NULL,
};

Expand Down Expand Up @@ -160,6 +162,11 @@ int opal_init(int *pargc, char ***pargv)
return ret;
}

/*
* Trap direct usage of mca_base_framework_open throughout the rest of this file.
*/
#define mca_base_framework_open "add new frameworks to the opal_init_frameworks list"

OPAL_TIMING_ENV_NEXT(otmng, "opal_if_init");

/* register PMIx cleanup function for output streams */
Expand Down Expand Up @@ -194,13 +201,14 @@ int opal_init(int *pargc, char ***pargv)
return opal_init_error("opal_init framework open", ret);
}

/* Intitialize Accelerator framework
/* select Accelerator framework component
* The datatype convertor code has a dependency on the accelerator framework
* being initialized. */
ret = mca_base_framework_open(&opal_accelerator_base_framework, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how this change fixes anything. mca_base_framework_open_list() is literally just a for loop around the framework list calling mca_base_framework_open() on each item in the list. It has the same flag argument, so there's no change there. I don't think this was the important change.

I thought it might be a different behavior in the registered cleanup argument, but reading the code for mca_base_framework_close_list, it is just calling mca_base_framework_close(). I think the real problem is not any of the list usage stuff, but that the registered cleanup function was/is calling the finalize function directly rather than calling mca_base_framework_close, which was a pretty silly mistake on our part in the review. It kind of makes me wonder where @wckzhang found that cleanup example and if we don't have other issues lurking elsewhere.

Either way, this patch is wrong. Either the opal_finalize_register_cleanup on line 203 needs to be removed or the whole patch could become changing that to opal_finalize_register_cleanup_arg(mca_base_framework_close, opal_accelerator_base_framework). The first obviously seems simpler, but one of the two needs to happen.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before the patch, here is the dlopen/dlclose activity related to the accelerator framework:


calling MPI_Init
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null

calling MPI_Finalize
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null

calling MPI_Session_init
This hits a segfault in the accelerator framework selection function, because there are pointers to memory that has already been munmapped.

behavior with the patch

MPI_Init
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null
MPI_Finalize
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null
MPI_Session_init
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlopening /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null
MPI_Session_finalize
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_rocm
dlclosing /usr/projects/artab/users/hpp/ompi3/test_install/lib/openmpi/mca_accelerator_null

A framework has to be closed in opal_finalize when using enable-mca-dso; otherwise, a reopen of the framework is a no-op (owing to a check at line 173 of mca_base_framework.c), even though the component *.so's were unloaded as part of MPI_Finalize. Without the patch, the accelerator framework was not being closed.

Copy link
Member

@bwbarrett bwbarrett Oct 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, like I said, the issue is not the use / non-use of the list. The issue is that the code was registering a call to the finalize call directly, rather than component close. If line 203 was opal_finalize_register_cleanup_arg(mca_base_framework_close, opal_accelerator_base_framework), the existing code would have worked just as well.

I have no problem with the list change, but in opal, that was not the root cause of the problem. Deregisters are called in reverse order, so I think what's happening is that accelerator_base_selected_component.accelerator_finalize() is called and then shortly later mca_base_framework_close(opal_accelerator_base_framework) is called, which calls accelerator_base_selected_component.accelerator_finalize() again and then closes the framework correctly. But we document that finalize will be called once per open, so we're getting lucky that the component doesn't do anything bad in that second call to finalize.

Hence, for the patch to be correct, line 203 (opal_finalize_register_cleanup(accelerator_base_selected_component.accelerator_finalize);) needs to be removed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep I agree about removing the accelerator finalize call.
@wckzhang do you recall what motivated having the accelerator component finalize handled in a separate function rather than in the close of the component? Is there some cuda behavior that motivated this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just thought that was how we do cleanup

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hppritcha I don't understand that. mca_base_framework_close() is still the right answer; calling <framework>.finalize() directly leaves the base framework in a not-clean state.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, the accelerator framework was mostly doing the right thing. As described in section III-B of DOI 10.1109/CLUSTER.2019.8891002, the complex interlinking of framework dependencies that existed prior to sessions was replaced with a cleanup callback framework.

@hjelmn can provide more details if needed.

What is the not-clean state that the accelerator framework is being left in?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code calls:

opal_finalize_register_cleanup(opal_accelerator_base_selected_component.accelerator_finalize);

instead of the correct:

opal_finalize_register_cleanup_arg(mca_base_framework_close, opal_accelerator_base_framework);

So while the component gets cleaned up, the framework itself does not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would opal_finalize_register_cleanup_arg(mca_base_framework_close, opal_accelerator_base_framework) be needed? If it is in the list of frameworks (which it should be if you want it always opened) then it will get called automatically when the last call to finalize is made.

I plan to take a closer look at this PR and the existing code tomorrow. Will probably have more comments then.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hjelmn we've gone around so long that it doesn't matter. But the current PR is wrong and needs to be fixed. Today, on line 212, it calls opal_finalize_register_cleanup(opal_accelerator_base_selected_component.accelerator_finalize);. This line just needs to be removed.

I was trying to make the point to Howard that adding the accelerator framework to the list of frameworks wasn't strictly necessary to fixing the issue, because I was pushing on how we missed this issue in the original CR. That just overcomplicated things. If the patch is extended to remove line 212, I'm happy to ship this patch. Until then, the patch is wrong, because we're calling the component's accelerator_finalize() function twice: once from the base cleanup function that is invoked by adding the framework to the opal framework list, and once explicitly on line 212.

if (OPAL_SUCCESS == ret && OPAL_SUCCESS != (ret = opal_accelerator_base_select())) {
if (OPAL_SUCCESS != (ret = opal_accelerator_base_select())) {
return opal_init_error("opal_accelerator_base_select", ret);
}

/* register accelerator cleanup function */
opal_finalize_register_cleanup(opal_accelerator_base_selected_component.accelerator_finalize);

/* initialize the datatype engine */
Expand Down