-
Notifications
You must be signed in to change notification settings - Fork 423
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCM/CUDA/TEST: Install memory hooks for async Cuda allocations #7204
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,15 +46,15 @@ | |
} | ||
|
||
/* Create a body of CUDA memory release replacement function */ | ||
#define UCM_CUDA_FREE_FUNC(_name, _retval, _ptr_type, _mem_type) \ | ||
_retval ucm_##_name(_ptr_type ptr) \ | ||
#define UCM_CUDA_FREE_FUNC(_name, _retval, _mem_type, ...) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically, the memory should be freed when the stream moves past FreeAsync. When the API itself returns, this may not be true so in that sense it may not be exactly right to change the attributes of the memory range or remove the memory range from pointer cache. But as we don't have a callback per se when free actually occurs, this should be ok for now as users would be very unlikely to issue ucp transfer operations after freeasync knowing that it may not be actually freed yet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, i guess once this is submitted it's no longer legal to issue data transfer from CPU. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would have to be at the next cu*Alloc* call. Since we intercept all of those, I guess we don't have to worry about stream semantics on FreeAsync. |
||
_retval ucm_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ | ||
{ \ | ||
_retval ret; \ | ||
\ | ||
ucm_event_enter(); \ | ||
ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)ptr); \ | ||
ucm_cuda_dispatch_mem_free((CUdeviceptr)ptr, _mem_type, #_name); \ | ||
ret = ucm_orig_##_name(ptr); \ | ||
ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)arg0); \ | ||
ucm_cuda_dispatch_mem_free((CUdeviceptr)arg0, _mem_type, #_name); \ | ||
ret = ucm_orig_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ | ||
ucm_event_leave(); \ | ||
return ret; \ | ||
} | ||
|
@@ -75,6 +75,8 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr*, | |
size_t) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc_v2, CUresult, -1, CUdeviceptr*, | ||
size_t) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocAsync, CUresult, -1, CUdeviceptr*, | ||
size_t, CUstream) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @yosefe I think we should also intercept cuMemAllocFromPoolAsync There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will add |
||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr*, | ||
size_t, unsigned int) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr*, | ||
|
@@ -84,13 +86,19 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch_v2, CUresult, -1, | |
unsigned int) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree, CUresult, -1, CUdeviceptr) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeAsync, CUresult, -1, CUdeviceptr, | ||
CUstream) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*) | ||
|
||
/* Runtime API */ | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeAsync, cudaError_t, -1, void*, | ||
cudaStream_t) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocAsync, cudaError_t, -1, void**, | ||
size_t, cudaStream_t) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**, | ||
size_t, unsigned int) | ||
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**, | ||
|
@@ -156,6 +164,9 @@ UCM_CUDA_ALLOC_FUNC(cuMemAlloc, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS, | |
arg0, CUdeviceptr, "size=%zu", size_t) | ||
UCM_CUDA_ALLOC_FUNC(cuMemAlloc_v2, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS, | ||
arg0, CUdeviceptr, "size=%zu", size_t) | ||
UCM_CUDA_ALLOC_FUNC(cuMemAllocAsync, UCS_MEMORY_TYPE_CUDA, CUresult, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now this is fine because cuMemAllocAsync can only allocate pinned memory but setting default memory pool to user created pool can alter the behavior in the future when other memory types are supported. In the future, it would be better to get memory pool associated with current device and examine allocation properties to decide the memory type instead of hard coding to MEMORY_TYPE_CUDA as the same API may be used for other memory types as well. I don't see an API to get MemPool properties from MemPool yet so we'll need to intercept MemPoolCreate/Destroy API for this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or we can just set memory type to UKNOWN like we planned to anyway? |
||
CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu stream=%p", | ||
size_t, CUstream) | ||
UCM_CUDA_ALLOC_FUNC(cuMemAllocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult, | ||
CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu flags=0x%x", | ||
size_t, unsigned) | ||
|
@@ -167,19 +178,23 @@ UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch_v2, UCS_MEMORY_TYPE_CUDA, CUresult, | |
CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr, | ||
"pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t, | ||
size_t, unsigned) | ||
UCM_CUDA_FREE_FUNC(cuMemFree, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) | ||
UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) | ||
UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST) | ||
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST) | ||
UCM_CUDA_FREE_FUNC(cuMemFree, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr) | ||
UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr) | ||
UCM_CUDA_FREE_FUNC(cuMemFreeAsync, CUresult, UCS_MEMORY_TYPE_CUDA, CUdeviceptr, | ||
CUstream) | ||
UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, UCS_MEMORY_TYPE_HOST, void*) | ||
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, UCS_MEMORY_TYPE_HOST, void*) | ||
|
||
static ucm_cuda_func_t ucm_cuda_driver_funcs[] = { | ||
UCM_CUDA_FUNC_ENTRY(cuMemAlloc), | ||
UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), | ||
UCM_CUDA_FUNC_ENTRY(cuMemAllocAsync), | ||
UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), | ||
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), | ||
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), | ||
UCM_CUDA_FUNC_ENTRY(cuMemFree), | ||
UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), | ||
UCM_CUDA_FUNC_ENTRY(cuMemFreeAsync), | ||
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), | ||
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), | ||
{{NULL}, NULL} | ||
|
@@ -188,19 +203,26 @@ static ucm_cuda_func_t ucm_cuda_driver_funcs[] = { | |
/* Runtime API replacements */ | ||
UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess, | ||
arg0, void*, "size=%zu", size_t) | ||
UCM_CUDA_ALLOC_FUNC(cudaMallocAsync, UCS_MEMORY_TYPE_CUDA, cudaError_t, | ||
cudaSuccess, arg0, void*, "size=%zu stream=%p", size_t, | ||
cudaStream_t) | ||
UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, | ||
cudaError_t, cudaSuccess, arg0, void*, | ||
"size=%zu flags=0x%x", size_t, unsigned) | ||
UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t, | ||
cudaSuccess, (size_t)arg1 * arg2, void*, | ||
"pitch=%p width=%zu height=%zu", size_t*, size_t, size_t) | ||
UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA) | ||
UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST) | ||
UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, UCS_MEMORY_TYPE_CUDA, void*) | ||
UCM_CUDA_FREE_FUNC(cudaFreeAsync, cudaError_t, UCS_MEMORY_TYPE_CUDA, void*, | ||
cudaStream_t) | ||
UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, UCS_MEMORY_TYPE_HOST, void*) | ||
|
||
static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = { | ||
UCM_CUDA_FUNC_ENTRY(cudaFree), | ||
UCM_CUDA_FUNC_ENTRY(cudaFreeAsync), | ||
UCM_CUDA_FUNC_ENTRY(cudaFreeHost), | ||
UCM_CUDA_FUNC_ENTRY(cudaMalloc), | ||
UCM_CUDA_FUNC_ENTRY(cudaMallocAsync), | ||
UCM_CUDA_FUNC_ENTRY(cudaMallocManaged), | ||
UCM_CUDA_FUNC_ENTRY(cudaMallocPitch), | ||
{{NULL}, NULL} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean that HAVE_CUDA is not set if the *Async APIs aren't detected at configure time? That would disallow CUDA for slightly older versions of CUDA, wouldn't it?
I'm probably missing the commit that defines HAVE_DECL_CUMEMALLOCASYNC/HAVE_DECL_CUMEMFREEASYNC
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does not affect HAVE_CUDA. It sets a different set of macros, specific to the async APIs.