UCM/CUDA/TEST: Install memory hooks for async Cuda allocations #7204

Closed
Changes from all commits
13 changes: 10 additions & 3 deletions config/m4/cuda.m4
@@ -54,13 +54,20 @@ AS_IF([test "x$cuda_checked" != "xyes"],

LDFLAGS="$save_LDFLAGS"

# Check for cuda static library
have_cuda_static="no"
AS_IF([test "x$cuda_happy" = "xyes"],
[AC_CHECK_LIB([cudart_static], [cudaGetDeviceCount],
[
# Check for cuda static library
AC_CHECK_LIB([cudart_static], [cudaGetDeviceCount],
[CUDA_STATIC_LIBS="$CUDA_STATIC_LIBS -lcudart_static"
have_cuda_static="yes"],
[], [-ldl -lrt -lpthread])])
[], [-ldl -lrt -lpthread])
# Check for async allocation APIs
AC_CHECK_DECLS([cuMemAllocAsync, cuMemFreeAsync], [], [],
[[#include <cuda.h>]])
AC_CHECK_DECLS([cudaMallocAsync, cudaFreeAsync], [], [],
[[#include <cuda_runtime.h>]])
])
Contributor:

Does this mean that HAVE_CUDA is not set if the *Async APIs aren't detected at configure time? That would disallow CUDA for slightly older versions of CUDA, wouldn't it?

I'm probably missing the commit that defines HAVE_DECL_CUMEMALLOCASYNC/HAVE_DECL_CUMEMFREEASYNC.

Contributor Author:

It does not affect HAVE_CUDA; it sets a different set of macros, specific to the async APIs.
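
For reference, a minimal sketch of what these AC_CHECK_DECLS checks produce (the macro names follow autoconf's HAVE_DECL_* convention; the guarded block is illustrative, not taken from this patch):

    /* config.h (generated): AC_CHECK_DECLS defines each HAVE_DECL_* macro
     * to 1 when the declaration is found and to 0 otherwise, so the
     * correct test is #if, not #ifdef. */
    #define HAVE_DECL_CUMEMALLOCASYNC 1
    #define HAVE_DECL_CUMEMFREEASYNC  1

    #if HAVE_DECL_CUMEMALLOCASYNC && HAVE_DECL_CUMEMFREEASYNC
    /* async-API hooks are compiled only when the toolkit declares them;
     * HAVE_CUDA itself is unaffected by these checks */
    #endif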


CPPFLAGS="$save_CPPFLAGS"
LDFLAGS="$save_LDFLAGS"
44 changes: 33 additions & 11 deletions src/ucm/cuda/cudamem.c
@@ -46,15 +46,15 @@
}

/* Create a body of CUDA memory release replacement function */
#define UCM_CUDA_FREE_FUNC(_name, _retval, _ptr_type, _mem_type) \
_retval ucm_##_name(_ptr_type ptr) \
#define UCM_CUDA_FREE_FUNC(_name, _retval, _mem_type, ...) \
Contributor:

Technically, the memory is only freed when the stream moves past the FreeAsync operation. That may not yet have happened when the API itself returns, so in that sense it is not exactly right to change the attributes of the memory range, or to remove the range from the pointer cache, at this point. But since we get no callback when the free actually occurs, this should be OK for now: users are very unlikely to issue UCP transfer operations after FreeAsync, knowing the memory may not have been freed yet.

Contributor Author:

Yes, I guess once this call is submitted it is no longer legal to issue a data transfer from the CPU. Do you know at which exact point the GPU can map new physical memory to the same virtual address?

Contributor (@Akshay-Venkatesh, Aug 9, 2021):

It would have to be at the next cu*Alloc* call. Since we intercept all of those, I guess we don't have to worry about stream semantics on FreeAsync.
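
A minimal illustration of the hazard being discussed, as hypothetical user code (not part of the patch):

    cuMemFreeAsync(dptr, stream); /* the hook fires here, but the stream
                                     frees the memory only later */
    /* a UCP transfer from dptr issued at this point races with the
     * deferred free; per the above, the next cu*Alloc* call is the
     * earliest point where the same virtual address can be remapped */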

_retval ucm_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
{ \
_retval ret; \
\
ucm_event_enter(); \
ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)ptr); \
ucm_cuda_dispatch_mem_free((CUdeviceptr)ptr, _mem_type, #_name); \
ret = ucm_orig_##_name(ptr); \
ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)arg0); \
ucm_cuda_dispatch_mem_free((CUdeviceptr)arg0, _mem_type, #_name); \
ret = ucm_orig_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
ucm_event_leave(); \
return ret; \
}
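
To make the variadic form concrete, here is a hand-written sketch of what UCM_CUDA_FREE_FUNC(cuMemFreeAsync, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr, CUstream) expands to, assuming UCM_FUNC_DEFINE_ARGS names the parameters arg0, arg1, ... in declaration order (the real expansion is produced by the preprocessor):

    CUresult ucm_cuMemFreeAsync(CUdeviceptr arg0, CUstream arg1)
    {
        CUresult ret;

        ucm_event_enter();
        ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)arg0);
        ucm_cuda_dispatch_mem_free((CUdeviceptr)arg0, UCS_MEMORY_TYPE_CUDA,
                                   "cuMemFreeAsync");
        ret = ucm_orig_cuMemFreeAsync(arg0, arg1);
        ucm_event_leave();
        return ret;
    }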
@@ -75,6 +75,8 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr*,
size_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc_v2, CUresult, -1, CUdeviceptr*,
size_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocAsync, CUresult, -1, CUdeviceptr*,
size_t, CUstream)
Contributor:

@yosefe I think we should also intercept cuMemAllocFromPoolAsync.

Contributor Author:

OK, will add.
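
For reference, a sketch of what that addition could look like, mirroring the cuMemAllocAsync entries in this diff (the signature matches the CUDA driver API, but the entry itself is hypothetical and not part of this change):

    UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocFromPoolAsync, CUresult, -1,
                                      CUdeviceptr*, size_t, CUmemoryPool,
                                      CUstream)

A matching UCM_CUDA_ALLOC_FUNC definition and UCM_CUDA_FUNC_ENTRY table entry would be needed as well.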

UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr*,
size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr*,
@@ -84,13 +86,19 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch_v2, CUresult, -1,
unsigned int)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeAsync, CUresult, -1, CUdeviceptr,
CUstream)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*)

/* Runtime API */
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeAsync, cudaError_t, -1, void*,
cudaStream_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocAsync, cudaError_t, -1, void**,
size_t, cudaStream_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**,
size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**,
Expand Down Expand Up @@ -156,6 +164,9 @@ UCM_CUDA_ALLOC_FUNC(cuMemAlloc, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS,
arg0, CUdeviceptr, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cuMemAlloc_v2, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS,
arg0, CUdeviceptr, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cuMemAllocAsync, UCS_MEMORY_TYPE_CUDA, CUresult,
Contributor:

For now this is fine, because cuMemAllocAsync can only allocate pinned memory, but setting the default memory pool to a user-created pool can alter that behavior in the future, when other memory types are supported.

Eventually it would be better to get the memory pool associated with the current device and examine its allocation properties to decide the memory type, instead of hard-coding UCS_MEMORY_TYPE_CUDA, as the same API may be used for other memory types as well. I don't see an API for getting the properties back from a memory pool yet, so we'll need to intercept the MemPoolCreate/Destroy APIs for this.

Contributor Author:

Or we could just set the memory type to UNKNOWN, like we planned to anyway?
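
A rough sketch of the interception suggested above: record each pool's properties when it is created, so that a later allocation from the pool can be mapped to a UCS memory type. Everything here except the CUDA types is hypothetical:

    /* hypothetical registry mapping each pool to its creation properties */
    static void ucm_cuda_pool_register(CUmemoryPool pool,
                                       const CUmemPoolProps *props);

    CUresult ucm_cuMemPoolCreate(CUmemoryPool *pool,
                                 const CUmemPoolProps *props)
    {
        CUresult ret = ucm_orig_cuMemPoolCreate(pool, props);
        if (ret == CUDA_SUCCESS) {
            /* props->location records where the pool's memory resides;
             * today only device-located pools exist, hence
             * UCS_MEMORY_TYPE_CUDA */
            ucm_cuda_pool_register(*pool, props);
        }
        return ret;
    }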

CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu stream=%p",
size_t, CUstream)
UCM_CUDA_ALLOC_FUNC(cuMemAllocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult,
CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu flags=0x%x",
size_t, unsigned)
@@ -167,19 +178,23 @@ UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch_v2, UCS_MEMORY_TYPE_CUDA, CUresult,
CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr,
"pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t,
size_t, unsigned)
UCM_CUDA_FREE_FUNC(cuMemFree, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA)
UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA)
UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST)
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST)
UCM_CUDA_FREE_FUNC(cuMemFree, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr)
UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr)
UCM_CUDA_FREE_FUNC(cuMemFreeAsync, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr,
CUstream)
UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, UCS_MEMORY_TYPE_HOST, void*)
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, UCS_MEMORY_TYPE_HOST, void*)

static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cuMemAlloc),
UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2),
UCM_CUDA_FUNC_ENTRY(cuMemAllocAsync),
UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFree),
UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFreeAsync),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
{{NULL}, NULL}
@@ -188,19 +203,26 @@ static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
/* Runtime API replacements */
UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess,
arg0, void*, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocAsync, UCS_MEMORY_TYPE_CUDA, cudaError_t,
cudaSuccess, arg0, void*, "size=%zu stream=%p", size_t,
cudaStream_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED,
cudaError_t, cudaSuccess, arg0, void*,
"size=%zu flags=0x%x", size_t, unsigned)
UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t,
cudaSuccess, (size_t)arg1 * arg2, void*,
"pitch=%p width=%zu height=%zu", size_t*, size_t, size_t)
UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA)
UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST)
UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, UCS_MEMORY_TYPE_CUDA, void*)
UCM_CUDA_FREE_FUNC(cudaFreeAsync, cudaError_t, UCS_MEMORY_TYPE_CUDA, void*,
cudaStream_t)
UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, UCS_MEMORY_TYPE_HOST, void*)

static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cudaFree),
UCM_CUDA_FUNC_ENTRY(cudaFreeAsync),
UCM_CUDA_FUNC_ENTRY(cudaFreeHost),
UCM_CUDA_FUNC_ENTRY(cudaMalloc),
UCM_CUDA_FUNC_ENTRY(cudaMallocAsync),
UCM_CUDA_FUNC_ENTRY(cudaMallocManaged),
UCM_CUDA_FUNC_ENTRY(cudaMallocPitch),
{{NULL}, NULL}
4 changes: 4 additions & 0 deletions src/ucm/cuda/cudamem.h
@@ -13,6 +13,7 @@

CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size);
CUresult ucm_cuMemAlloc_v2(CUdeviceptr *dptr, size_t size);
CUresult ucm_cuMemAllocAsync(CUdeviceptr *dptr, size_t size, CUstream hstream);
CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags);
CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch,
size_t WidthInBytes, size_t Height,
@@ -22,12 +23,15 @@ CUresult ucm_cuMemAllocPitch_v2(CUdeviceptr *dptr, size_t *pPitch,
unsigned int ElementSizeBytes);
CUresult ucm_cuMemFree(CUdeviceptr dptr);
CUresult ucm_cuMemFree_v2(CUdeviceptr dptr);
CUresult ucm_cuMemFreeAsync(CUdeviceptr dptr, CUstream stream);
CUresult ucm_cuMemFreeHost(void *p);
CUresult ucm_cuMemFreeHost_v2(void *p);

cudaError_t ucm_cudaFree(void *devPtr);
cudaError_t ucm_cudaFreeAsync(void *devPtr, cudaStream_t stream);
cudaError_t ucm_cudaFreeHost(void *ptr);
cudaError_t ucm_cudaMalloc(void **devPtr, size_t size);
cudaError_t ucm_cudaMallocAsync(void **devPtr, size_t size, cudaStream_t stream);
cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch,
size_t width, size_t height);
90 changes: 64 additions & 26 deletions test/gtest/ucm/cuda_hooks.cc
@@ -96,56 +96,75 @@ class cuda_hooks : public ucs::test {
CUcontext context;
};

UCS_TEST_F(cuda_hooks, test_cuMem_Alloc_Free) {
UCS_TEST_F(cuda_hooks, test_cuMemAllocFree) {
CUresult ret;
CUdeviceptr dptr, dptr1;

/* small allocation */
ret = cuMemAlloc(&dptr, 64);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr, 64);

ret = cuMemFree(dptr);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr, 64);

/* large allocation */
ret = cuMemAlloc(&dptr, (256 * 1024 *1024));
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr, (256 * 1024 *1024));

ret = cuMemFree(dptr);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr, (256 * 1024 *1024));

/* multiple allocations, cudafree in reverse order */
ret = cuMemAlloc(&dptr, (1 * 1024 *1024));
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr, (1 * 1024 *1024));

ret = cuMemAlloc(&dptr1, (1 * 1024 *1024));
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr1, (1 * 1024 *1024));

ret = cuMemFree(dptr1);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr1, (1 * 1024 *1024));

ret = cuMemFree(dptr);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr, (1 * 1024 *1024));
}

UCS_TEST_F(cuda_hooks, test_cuMemAllocAsync) {
#if HAVE_DECL_CUMEMALLOCASYNC && HAVE_DECL_CUMEMFREEASYNC
CUresult ret;
CUdeviceptr dptr;

ret = cuMemAllocAsync(&dptr, 1024, CU_STREAM_PER_THREAD);
ASSERT_EQ(CUDA_SUCCESS, ret);
cuStreamSynchronize(CU_STREAM_PER_THREAD);
check_mem_alloc_events((void*)dptr, 1024);

ret = cuMemFreeAsync(dptr, CU_STREAM_PER_THREAD);
ASSERT_EQ(CUDA_SUCCESS, ret);
cuStreamSynchronize(CU_STREAM_PER_THREAD);
check_mem_free_events((void*)dptr, 1024);
#else
UCS_TEST_SKIP_R("cuMemAllocAsync is unsupported");
#endif
}

UCS_TEST_F(cuda_hooks, test_cuMemAllocManaged) {
CUresult ret;
CUdeviceptr dptr;

ret = cuMemAllocManaged(&dptr, 64, CU_MEM_ATTACH_GLOBAL);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr, 64, UCS_MEMORY_TYPE_CUDA_MANAGED);

ret = cuMemFree(dptr);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr, 0);
}

@@ -155,11 +174,11 @@ UCS_TEST_F(cuda_hooks, test_cuMemAllocPitch) {
size_t pitch;

ret = cuMemAllocPitch(&dptr, &pitch, 4, 8, 4);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_alloc_events((void *)dptr, (4 * 8));

ret = cuMemFree(dptr);
ASSERT_EQ(ret, CUDA_SUCCESS);
ASSERT_EQ(CUDA_SUCCESS, ret);
check_mem_free_events((void *)dptr, 0);
}

@@ -169,54 +188,54 @@ UCS_TEST_F(cuda_hooks, test_cuda_Malloc_Free) {

/* small allocation */
ret = cudaMalloc(&ptr, 64);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(ptr, 64);

ret = cudaFree(ptr);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(ptr, 64);

/* large allocation */
ret = cudaMalloc(&ptr, (256 * 1024 *1024));
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(ptr, (256 * 1024 *1024));

ret = cudaFree(ptr);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(ptr, (256 * 1024 *1024));

/* multiple allocations, cudafree in reverse order */
ret = cudaMalloc(&ptr, (1 * 1024 *1024));
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(ptr, (1 * 1024 *1024));

ret = cudaMalloc(&ptr1, (1 * 1024 *1024));
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(ptr1, (1 * 1024 *1024));

ret = cudaFree(ptr1);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(ptr1, (1 * 1024 *1024));

ret = cudaFree(ptr);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(ptr, (1 * 1024 *1024));

/* cudaFree with NULL */
ret = cudaFree(NULL);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
}

UCS_TEST_F(cuda_hooks, test_cudaMallocManaged) {
cudaError_t ret;
void *ptr;

ret = cudaMallocManaged(&ptr, 64, cudaMemAttachGlobal);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(ptr, 64, UCS_MEMORY_TYPE_CUDA_MANAGED);

ret = cudaFree(ptr);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(ptr, 0);
}

@@ -226,10 +245,29 @@ UCS_TEST_F(cuda_hooks, test_cudaMallocPitch) {
size_t pitch;

ret = cudaMallocPitch(&devPtr, &pitch, 4, 8);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_alloc_events(devPtr, (4 * 8));

ret = cudaFree(devPtr);
ASSERT_EQ(ret, cudaSuccess);
ASSERT_EQ(cudaSuccess, ret);
check_mem_free_events(devPtr, 0);
}

UCS_TEST_F(cuda_hooks, test_cudaMallocAsync) {
#if HAVE_DECL_CUDAMALLOCASYNC && HAVE_DECL_CUDAFREEASYNC
cudaError_t ret;
void *devPtr;

ret = cudaMallocAsync(&devPtr, 1024, cudaStreamPerThread);
ASSERT_EQ(ret, cudaSuccess);
cudaStreamSynchronize(cudaStreamPerThread);
check_mem_alloc_events(devPtr, 1024);

ret = cudaFreeAsync(devPtr, cudaStreamPerThread);
ASSERT_EQ(ret, cudaSuccess);
cudaStreamSynchronize(cudaStreamPerThread);
check_mem_free_events(devPtr, 1024);
#else
UCS_TEST_SKIP_R("cudaMallocAsync is unsupported");
#endif
}