Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v5.0.x accelerator/cuda: Add delayed initialization logic #11297

Merged
merged 2 commits into from
Jan 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 64 additions & 10 deletions opal/mca/accelerator/cuda/accelerator_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
return 0;
}
}

/* First access on a device pointer finalizes CUDA support initialization. */
opal_accelerator_cuda_delayed_init();
return 1;
}

static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}
*stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_cuda_stream_t);
if (NULL == *stream) {
return OPAL_ERR_OUT_OF_RESOURCE;
Expand Down Expand Up @@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **event)
{
CUresult result;
int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

*event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_cuda_event_t);
if (NULL == *event) {
Expand Down Expand Up @@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == stream || NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
CUdeviceptr tmp;
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == dest || NULL == src || size <= 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand Down Expand Up @@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr || 0 == size) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -434,7 +462,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return result;
return OPAL_ERROR;
}
}
return 0;
Expand All @@ -448,7 +476,7 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr)
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return result;
return OPAL_ERROR;
}
}
return 0;
Expand All @@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr || NULL == base || NULL == size) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
{
CUresult result;
int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == ptr && size > 0) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -487,7 +525,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister failed", true,
ptr, size, OPAL_PROC_MY_HOSTNAME, result);
return result;
return OPAL_ERROR;
}

return OPAL_SUCCESS;
Expand All @@ -501,7 +539,7 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr)
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuMemHostUnregister failed", true,
ptr, OPAL_PROC_MY_HOSTNAME, result);
return result;
return OPAL_ERROR;
}
}
return OPAL_SUCCESS;
Expand All @@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
CUdevice cuDev;
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == dev_id) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -520,7 +563,7 @@ static int accelerator_cuda_get_device(int *dev_id)
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuCtxGetDevice failed", true,
result);
return result;
return OPAL_ERROR;
}
*dev_id = cuDev;
return 0;
Expand All @@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
{
CUresult result;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

if (NULL == access) {
return OPAL_ERR_BAD_PARAM;
}
Expand All @@ -538,7 +586,7 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuDeviceCanAccessPeer failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return result;
return OPAL_ERROR;
}
return 0;
}
Expand All @@ -554,18 +602,24 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
{
CUresult result;
int enable = 1;

int delayed_init = opal_accelerator_cuda_delayed_init();
if (OPAL_UNLIKELY(0 != delayed_init)) {
return delayed_init;
}

result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
result);
return result;
return OPAL_ERROR;
}
result = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr) addr);
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true,
OPAL_PROC_MY_HOSTNAME, result, addr);
return result;
return OPAL_ERROR;
}
return OPAL_SUCCESS;
}
2 changes: 2 additions & 0 deletions opal/mca/accelerator/cuda/accelerator_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp

OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;

/**
 * Finish the CUDA-dependent part of the accelerator component's
 * initialization, if it has not completed yet.
 *
 * The module entry points call this before doing CUDA work and return its
 * value to their caller when it is non-zero. Once initialization has
 * completed, later calls return OPAL_SUCCESS immediately. The initialization
 * itself is performed while holding a component-internal mutex.
 *
 * NOTE(review): only part of the definition was visible when this comment
 * was written; confirm the exact set of failure codes against the
 * implementation in accelerator_cuda_component.c.
 *
 * @return OPAL_SUCCESS when initialization is complete, non-zero otherwise.
 */
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);

END_C_DECLS

#endif /* MCA_ACCELERATOR_CUDA_H */
54 changes: 41 additions & 13 deletions opal/mca/accelerator/cuda/accelerator_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,16 @@
#include "opal/util/printf.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"

#include "opal/sys/atomic.h"

/* Define global variables, used in accelerator_cuda.c */
CUstream opal_accelerator_cuda_memcpy_stream = NULL;
opal_mutex_t opal_accelerator_cuda_stream_lock = {0};

/* Initialization lock for delayed cuda initialization */
static opal_mutex_t accelerator_cuda_init_lock;
static bool accelerator_cuda_init_complete = false;

#define STRINGIFY2(x) #x
#define STRINGIFY(x) STRINGIFY2(x)

Expand Down Expand Up @@ -115,30 +119,34 @@ static int accelerator_cuda_component_register(void)
return OPAL_SUCCESS;
}

static opal_accelerator_base_module_t* accelerator_cuda_init(void)
int opal_accelerator_cuda_delayed_init()
{
int retval, i, j;
CUresult result;
int result = OPAL_SUCCESS;
CUcontext cuContext;

OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
/* Double checked locking to avoid having to
* grab locks post lazy-initialization. */
opal_atomic_rmb();
if (true == accelerator_cuda_init_complete) {
return OPAL_SUCCESS;
}
OPAL_THREAD_LOCK(&accelerator_cuda_init_lock);

/* First check if the support is enabled. In the case that the user has
* turned it off, we do not need to continue with any CUDA specific
* initialization. Do this after MCA parameter registration. */
if (!opal_cuda_support) {
return NULL;
/* If already initialized, just exit */
if (true == accelerator_cuda_init_complete) {
goto out;
}

/* Check to see if this process is running in a CUDA context. If
* so, all is good. If not, then disable registration of memory. */
result = cuCtxGetCurrent(&cuContext);
if (CUDA_SUCCESS != result) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent failed");
return NULL;
goto out;
} else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
return NULL;
result = OPAL_ERROR;
goto out;
} else {
opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
}
Expand All @@ -148,7 +156,7 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
OPAL_PROC_MY_HOSTNAME, result);
return NULL;
goto out;
}

result = cuMemHostRegister(&checkmem, sizeof(int), 0);
Expand All @@ -162,7 +170,26 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
"CUDA: cuMemHostRegister OK on test region");
}
result = OPAL_SUCCESS;
opal_atomic_wmb();
accelerator_cuda_init_complete = true;
out:
OPAL_THREAD_UNLOCK(&accelerator_cuda_init_lock);
return result;
}

/*
 * Construct the locks used by the CUDA accelerator component and select the
 * CUDA module when support is enabled.
 *
 * Returns a pointer to opal_accelerator_cuda_module, or NULL when
 * opal_cuda_support is false.
 */
static opal_accelerator_base_module_t* accelerator_cuda_init(void)
{
    /* Both mutexes are constructed unconditionally, before the support
     * check below. */
    OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);

    if (opal_cuda_support) {
        /* Try to complete the CUDA setup right away. The result is not
         * checked here: the module entry points call
         * opal_accelerator_cuda_delayed_init() again before doing any
         * CUDA work. */
        (void) opal_accelerator_cuda_delayed_init();
        return &opal_accelerator_cuda_module;
    }

    /* Support was turned off by the user, so no CUDA specific
     * initialization is needed. This check runs after MCA parameter
     * registration. */
    return NULL;
}

Expand All @@ -183,5 +210,6 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
}

OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
OBJ_DESTRUCT(&accelerator_cuda_init_lock);
return;
}