diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.c b/src/uct/cuda/cuda_copy/cuda_copy_ep.c index 05b13a2f959..46cab05d84a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_ep.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.c @@ -55,15 +55,13 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t return UCS_ERR_NO_MEMORY; } - status = CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream)); + status = UCT_CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream)); if (UCS_OK != status) { - ucs_error("cudaMemcpyAsync Failed "); return UCS_ERR_IO_ERROR; } - status = CUDA_FUNC(cudaEventRecord(cuda_event->event, stream)); + status = UCT_CUDA_FUNC(cudaEventRecord(cuda_event->event, stream)); if (UCS_OK != status) { - ucs_error("cudaEventRecord Failed "); return UCS_ERR_IO_ERROR; } ucs_queue_push(outstanding_queue, &cuda_event->queue); @@ -81,6 +79,14 @@ ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; + if (iface->stream_d2h == 0) { + status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_d2h, + cudaStreamNonBlocking)); + if (UCS_OK != status) { + return UCS_ERR_IO_ERROR; + } + } + status = uct_cuda_copy_post_cuda_async_copy(tl_ep, iov[0].buffer, (void *)remote_addr, iov[0].length, cudaMemcpyDeviceToHost, iface->stream_d2h, @@ -101,6 +107,14 @@ ucs_status_t uct_cuda_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); ucs_status_t status; + if (iface->stream_h2d == 0) { + status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_h2d, + cudaStreamNonBlocking)); + if (UCS_OK != status) { + return UCS_ERR_IO_ERROR; + } + } + status = uct_cuda_copy_post_cuda_async_copy(tl_ep, (void *)remote_addr, iov[0].buffer, iov[0].length, cudaMemcpyHostToDevice, iface->stream_h2d, diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c index ca09b1498d8..f5e393b5828 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c @@ -177,7 +177,8 @@ static void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chun ucs_status_t status; memset(base, 0 , sizeof(*base)); - status = CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), cudaEventDisableTiming)); + status = UCT_CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), + cudaEventDisableTiming)); if (UCS_OK != status) { ucs_error("cudaEventCreateWithFlags Failed"); } @@ -188,7 +189,7 @@ static void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj) uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; ucs_status_t status; - status = CUDA_FUNC(cudaEventDestroy(base->event)); + status = UCT_CUDA_FUNC(cudaEventDestroy(base->event)); if (UCS_OK != status) { ucs_error("cudaEventDestroy Failed"); } @@ -215,7 +216,7 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work if (strncmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME, strlen(UCT_CUDA_DEV_NAME)) != 0) { - ucs_error("No device was found: %s", params->mode.device.dev_name); + ucs_error("no device was found: %s", params->mode.device.dev_name); return UCS_ERR_NO_DEVICE; } @@ -232,20 +233,12 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work "CUDA EVENT objects"); if (UCS_OK != status) { - ucs_error("Mpool creation failed"); + ucs_error("mpool creation failed"); return UCS_ERR_IO_ERROR; } - status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_d2h, cudaStreamNonBlocking)); - if (UCS_OK != status) { - ucs_error("cudaStreamCreateWithFlags creation failed"); - return UCS_ERR_IO_ERROR; - } - status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_h2d, cudaStreamNonBlocking)); - if (UCS_OK != status) { - ucs_error("cudaStreamCreateWithFlags creation failed"); - return UCS_ERR_IO_ERROR; - } + self->stream_d2h = 0; + self->stream_h2d = 0; ucs_queue_head_init(&self->outstanding_d2h_cuda_event_q); ucs_queue_head_init(&self->outstanding_h2d_cuda_event_q); @@ -257,6 +250,14 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_iface_t) { uct_base_iface_progress_disable(&self->super.super, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + if (self->stream_h2d != 0) { + UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_h2d)); + } + + if (self->stream_d2h != 0) { + UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_d2h)); + } + ucs_mpool_cleanup(&self->cuda_event_desc, 1); } diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h index c5b718deb67..47ce350c856 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -15,13 +16,13 @@ #define UCT_CUDA_COPY_TL_NAME "cuda_copy" #define UCT_CUDA_DEV_NAME "cudacopy0" -#define CUDA_FUNC(func) ({ \ +#define UCT_CUDA_FUNC(_func) ({ \ ucs_status_t _status = UCS_OK; \ do { \ - cudaError_t _result = (func); \ + cudaError_t _result = (_func); \ if (cudaSuccess != _result) { \ - ucs_error("[%s:%d] cuda failed with %d \n", \ - __FILE__, __LINE__,_result); \ + ucs_error("%s failed with %d \n", \ + UCS_PP_MAKE_STRING(_func), _result); \ _status = UCS_ERR_IO_ERROR; \ } \ } while (0); \