From d77a8ffb24ac69b31781be3c18adfe8c88d8bc96 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Fri, 24 Feb 2023 15:41:15 -0700 Subject: [PATCH] smcuda: fixes when using enable-mca-dso related to #11354 Signed-off-by: Howard Pritchard --- opal/mca/btl/smcuda/btl_smcuda_accelerator.c | 14 ++++++++- opal/mca/btl/smcuda/btl_smcuda_accelerator.h | 3 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 33 +++++++++++--------- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c index d9116cb8340..07c3c4d9a55 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c +++ b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c @@ -1,6 +1,8 @@ /* * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,6 +37,8 @@ static int accelerator_event_max = 400; static int accelerator_event_ipc_most = 0; static bool smcuda_accelerator_initialized = false; +static void mca_btl_smcuda_accelerator_fini(void); + int mca_btl_smcuda_accelerator_init(void) { int rc = OPAL_SUCCESS; @@ -79,6 +83,14 @@ int mca_btl_smcuda_accelerator_init(void) goto cleanup_and_error; } + /* + * add smcuda accelerator fini code to opal's list of cleanup functions. + * Cleanups are called before all the MCA frameworks are closed, so by + * adding this function to the callback list, we avoid issues with ordering + * of the closing of the BTL framework with the accelerator framework, etc. 
+ */ + opal_finalize_register_cleanup(mca_btl_smcuda_accelerator_fini); + smcuda_accelerator_initialized = true; cleanup_and_error: @@ -103,7 +115,7 @@ int mca_btl_smcuda_accelerator_init(void) return rc; } -void mca_btl_smcuda_accelerator_fini(void) +static void mca_btl_smcuda_accelerator_fini(void) { int i; diff --git a/opal/mca/btl/smcuda/btl_smcuda_accelerator.h b/opal/mca/btl/smcuda/btl_smcuda_accelerator.h index 7b039381a2e..2395f710705 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_accelerator.h +++ b/opal/mca/btl/smcuda/btl_smcuda_accelerator.h @@ -1,6 +1,8 @@ /* * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights * reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,7 +19,6 @@ #include "opal/mca/btl/btl.h" OPAL_DECLSPEC int mca_btl_smcuda_accelerator_init(void); -OPAL_DECLSPEC void mca_btl_smcuda_accelerator_fini(void); OPAL_DECLSPEC int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **frag); OPAL_DECLSPEC int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg, struct mca_btl_base_descriptor_t *frag); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 92c23286c0c..3e72bea5a4c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -18,6 +18,8 @@ * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -69,6 +71,8 @@ static int smcuda_register(void); static mca_btl_base_module_t ** mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool enable_mpi_threads); +static void mca_btl_smcuda_component_fini(void); + typedef enum { MCA_BTL_SM_RNDV_MOD_SM = 0, MCA_BTL_SM_RNDV_MOD_MPOOL @@ -214,7 +218,7 @@ static int smcuda_register(void) if (0 == mca_btl_smcuda.super.btl_accelerator_eager_limit) { mca_btl_smcuda.super.btl_accelerator_eager_limit = SIZE_MAX; /* magic number */ } -#endif +#endif /* OPAL_CUDA_SUPPORT */ return mca_btl_smcuda_component_verify(); } @@ -260,6 +264,14 @@ static int mca_btl_smcuda_component_open(void) OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, opal_free_list_t); OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t); OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t); + + opal_finalize_register_cleanup(mca_btl_smcuda_component_fini); + + return OPAL_SUCCESS; +} + +static int mca_btl_smcuda_component_close(void) +{ return OPAL_SUCCESS; } @@ -267,9 +279,9 @@ static int mca_btl_smcuda_component_open(void) * component cleanup - sanity checking of queue lengths */ -static int mca_btl_smcuda_component_close(void) +static void mca_btl_smcuda_component_fini(void) { - int return_value = OPAL_SUCCESS; + int rc; OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock); /** @@ -282,11 +294,10 @@ static int mca_btl_smcuda_component_close(void) /* unmap the shared memory control structure */ if (mca_btl_smcuda_component.sm_seg != NULL) { - return_value = mca_common_sm_fini(mca_btl_smcuda_component.sm_seg); - if (OPAL_SUCCESS != return_value) { - return_value = OPAL_ERROR; + rc = mca_common_sm_fini(mca_btl_smcuda_component.sm_seg); + if (OPAL_SUCCESS != rc) { opal_output(0, " mca_common_sm_fini failed\n"); - goto CLEANUP; + return; } /* unlink file, so that it will be deleted when all references @@ -310,13 +321,7 @@ static int 
mca_btl_smcuda_component_close(void) unlink(mca_btl_smcuda_component.sm_fifo_path); } #endif - -CLEANUP: - - mca_btl_smcuda_accelerator_fini(); - - /* return */ - return return_value; + return; } /*