Skip to content

Commit

Permalink
accelerator/cuda: Add delayed initialization logic
Browse files Browse the repository at this point in the history
The current implementation requires the application to
do cudaInit before calling MPI_Init. Added delayed
initialization logic to wait as long as possible
before creating resources requiring a cuContext.

Signed-off-by: William Zhang <wilzhang@amazon.com>
  • Loading branch information
wckzhang committed Jan 10, 2023
1 parent 4e8bc42 commit b751060
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 15 deletions.
58 changes: 56 additions & 2 deletions opal/mca/accelerator/cuda/accelerator_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
return 0;
}
}

/* First access on a device pointer finalizes CUDA support initialization. */
opal_accelerator_cuda_delayed_init();
return 1;
}

static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}
*stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
if (NULL == *stream) {
return OPAL_ERR_OUT_OF_RESOURCE;
Expand Down Expand Up @@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event)
{
CUresult result;
int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

*event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t);
if (NULL == *event) {
Expand Down Expand Up @@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == stream || NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
{
CUresult result;

/* Finish any deferred CUDA initialization before touching the driver;
 * a non-zero status is handed straight back to the caller. The local
 * must be declared here: nothing earlier in this function defines it. */
int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
    return delayed_init;
}

if (NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
CUdeviceptr tmp;
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr || 0 == size) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr || NULL == base || NULL == size) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
{
CUresult result;
int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr && size > 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
CUdevice cuDev;
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == dev_id) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == access) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -554,6 +602,12 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
{
CUresult result;
int enable = 1;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
Expand Down
2 changes: 2 additions & 0 deletions opal/mca/accelerator/cuda/accelerator_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp

OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;

/* Runs the CUDA initialization steps that are deferred until they are
 * needed. Returns OPAL_SUCCESS once initialization has completed (on this
 * or an earlier call) and a non-zero value otherwise; callers treat any
 * non-zero return as a failure and propagate it. */
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);

END_C_DECLS

#endif /* MCA_ACCELERATOR_CUDA_H */
54 changes: 41 additions & 13 deletions opal/mca/accelerator/cuda/accelerator_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,16 @@
#include "opal/util/printf.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"

#include "opal/sys/atomic.h"

/* Define global variables, used in accelerator_cuda.c */
CUstream opal_accelerator_cuda_memcpy_stream = NULL;
opal_mutex_t opal_accelerator_cuda_stream_lock = {0};

/* Initialization lock for delayed cuda initialization */
static opal_mutex_t accelerator_cuda_init_lock;
static bool accelerator_cuda_init_complete = false;

#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

Expand Down Expand Up @@ -115,30 +119,34 @@ static int accelerator_cuda_component_register(void)
return OPAL_SUCCESS;
}

static opal_accelerator_base_module_t* accelerator_cuda_init(void)
int opal_accelerator_cuda_delayed_init()
{
int retval, i, j;
CUresult result;
int result = OPAL_SUCCESS;
CUcontext cuContext;

OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
/* Double checked locking to avoid having to
* grab locks post lazy-initialization. */
opal_atomic_rmb();
if (true == accelerator_cuda_init_complete) {
return OPAL_SUCCESS;
}
OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);

/* First check if the support is enabled. In the case that the user has
* turned it off, we do not need to continue with any CUDA specific
* initialization. Do this after MCA parameter registration. */
if (!opal_cuda_support) {
return NULL;
/* If already initialized, just exit */
if (true == accelerator_cuda_init_complete) {
goto out;
}

/* Check to see if this process is running in a CUDA context. If
* so, all is good. If not, then disable registration of memory. */
result = cuCtxGetCurrent(&cuContext);
if (CUDA_SUCCESS != result) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
return NULL;
goto out;
} else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
return NULL;
result = OPAL_ERROR;
goto out;
} else {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
}
Expand All @@ -148,7 +156,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return NULL;
goto out;
}

result = cuMemHostRegister(&checkmem, sizeof(int), 0);
Expand All @@ -162,7 +170,26 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
"CUDA: cuMemHostRegister OK on test region");
}
result = OPAL_SUCCESS;
opal_atomic_wmb();
accelerator_cuda_init_complete = true;
out:
OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
return result;
}

/**
 * Component init hook: set up the locks and hand back the CUDA module.
 *
 * The stream lock and the delayed-init lock are both constructed
 * unconditionally. When CUDA support is turned off (opal_cuda_support is
 * false) no module is offered and NULL is returned. Otherwise a first
 * attempt at the delayed CUDA initialization is made; its status is not
 * inspected here, and the module is returned either way.
 *
 * @return pointer to the CUDA accelerator module, or NULL when CUDA
 *         support is disabled.
 */
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
{
    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);

    /* Only attempt CUDA setup when the user has left support enabled. */
    if (opal_cuda_support) {
        (void) opal_accelerator_cuda_delayed_init();
        return &opal_accelerator_cuda_module;
    }

    return NULL;
}

Expand All @@ -183,5 +210,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
}

OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
OBJ_DESTRUCT(&accelerator_cuda_init_lock);
return;
}

0 comments on commit b751060

Please sign in to comment.