diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index 1b9fc54cec02..c9b2696b9fd2 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -1060,40 +1060,33 @@ test_malloc_hook() { cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic cuda_static_exe=./test/apps/test_cuda_hook_static - for mode in reloc bistro - do - export UCX_MEM_CUDA_HOOK_MODE=${mode} - - # Run cuda memory hooks with dynamic link - ${cuda_dynamic_exe} + # Run cuda memory hooks with dynamic link + ${cuda_dynamic_exe} - # Run cuda memory hooks with static link, if exists. If the static - # library 'libcudart_static.a' is not present, static test will not - # be built. - if [ -x ${cuda_static_exe} ] + # Run cuda memory hooks with static link, if exists. If the static + # library 'libcudart_static.a' is not present, static test will not + # be built. + if [ -x ${cuda_static_exe} ] + then + ${cuda_static_exe} && status="pass" || status="fail" + [ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail" + if [ ${status} == ${exp_status} ] then - ${cuda_static_exe} && status="pass" || status="fail" - [ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail" - if [ ${status} == ${exp_status} ] - then - echo "Static link with cuda ${status}, as expected" - else - echo "Static link with cuda is expected to ${exp_status}, actual: ${status}" - exit 1 - fi + echo "Static link with cuda ${status}, as expected" + else + echo "Static link with cuda is expected to ${exp_status}, actual: ${status}" + exit 1 fi + fi - # Test that driver API hooks work in both reloc and bistro modes, - # since we call them directly from the test - ${cuda_dynamic_exe} -d - [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d - - # Test hooks in gtest - UCX_MEM_LOG_LEVEL=diag \ - ./test/gtest/gtest --gtest_filter='cuda_hooks.*' + # Test that driver API hooks work in both reloc and bistro modes, + # since we call them directly from the test + ${cuda_dynamic_exe} -d + [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d - unset UCX_MEM_CUDA_HOOK_MODE - done + # Test hooks in gtest + UCX_MEM_LOG_LEVEL=diag \ + ./test/gtest/gtest --gtest_filter='cuda_hooks.*' fi } diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c index 23a9c5d18061..66309a115bcc 100644 --- a/src/ucm/cuda/cudamem.c +++ b/src/ucm/cuda/cudamem.c @@ -87,15 +87,6 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr) UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*) UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*) -/* Runtime API */ -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**, - size_t, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**, - size_t*, size_t, size_t) - static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length, ucs_memory_type_t mem_type) { @@ -160,100 +151,55 @@ UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST) UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST) -static ucm_cuda_func_t ucm_cuda_driver_funcs[] = { - UCM_CUDA_FUNC_ENTRY(cuMemAlloc), - UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), - UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), - UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), - UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), - UCM_CUDA_FUNC_ENTRY(cuMemFree), - UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), - UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), - UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), - {{NULL}, NULL} -}; - -/* Runtime API replacements */ -UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess, - arg0, void*, "size=%zu", size_t) -UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, - cudaError_t, cudaSuccess, arg0, void*, - "size=%zu flags=0x%x", size_t, unsigned) -UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t, - cudaSuccess, (size_t)arg1 * arg2, void*, - "pitch=%p width=%zu height=%zu", size_t*, size_t, size_t) -UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA) -UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST) - -static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = { - UCM_CUDA_FUNC_ENTRY(cudaFree), - UCM_CUDA_FUNC_ENTRY(cudaFreeHost), - UCM_CUDA_FUNC_ENTRY(cudaMalloc), - UCM_CUDA_FUNC_ENTRY(cudaMallocManaged), - UCM_CUDA_FUNC_ENTRY(cudaMallocPitch), - {{NULL}, NULL} -}; - static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode) { return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) && (ucm_get_hook_mode(mode) == mode); } -static ucs_status_t -ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc, - const char *name) +static ucs_status_t ucm_cuda_install_hooks() { + static ucm_cuda_func_t funcs[] = { + UCM_CUDA_FUNC_ENTRY(cuMemAlloc), + UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), + UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFree), + UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), + {{NULL}, NULL} + }; const char UCS_V_UNUSED *hook_mode; - unsigned num_bistro, num_reloc; ucm_cuda_func_t *func; ucs_status_t status; + unsigned num_hooks; void *func_ptr; - num_bistro = 0; - num_reloc = 0; + num_hooks = 0; for (func = funcs; func->patch.symbol != NULL; ++func) { func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value); if (func_ptr == NULL) { continue; } - status = UCS_ERR_UNSUPPORTED; - - if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) { - status = ucm_bistro_patch(func_ptr, func->patch.value, - func->patch.symbol, func->orig_func_ptr, - NULL); - if (status == UCS_OK) { - ucm_debug("installed bistro hook for '%s': %d", - func->patch.symbol, status); - ++num_bistro; - continue; - } - + status = ucm_bistro_patch(func_ptr, func->patch.value, + func->patch.symbol, func->orig_func_ptr, + NULL); + if (status != UCS_OK) { ucm_debug("failed to install bistro hook for '%s'", func->patch.symbol); + return status; } - if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) { - status = ucm_reloc_modify(&func->patch); - if (status == UCS_OK) { - ++num_reloc; - ucm_debug("installed reloc hook on '%s'", func->patch.symbol); - continue; - } - - ucm_debug("failed to install relocation table hook for '%s'", - func->patch.symbol); - } - - ucm_diag("failed to install hook for '%s'", func->patch.symbol); - return status; + ucm_debug("installed bistro hook for '%s': %d", func->patch.symbol, + status); + ++num_hooks; } - *used_reloc = num_reloc > 0; - ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc", - name, num_bistro, num_reloc); + ucm_info("installed %u/%zu cuda memory hooks", num_hooks, + ucs_static_array_size(funcs) - 1); return UCS_OK; } @@ -262,7 +208,6 @@ static ucs_status_t ucm_cudamem_install(int events) static int ucm_cudamem_installed = 0; static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER; ucs_status_t status = UCS_OK; - int used_reloc; if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) { goto out; @@ -274,30 +219,26 @@ static ucs_status_t ucm_cudamem_install(int events) goto out; } + if (!ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) { + ucm_diag("cuda hooks require bistro mode enabled"); + status = UCS_ERR_UNSUPPORTED; + goto out; + } + pthread_mutex_lock(&install_mutex); if (ucm_cudamem_installed) { goto out_unlock; } - status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc, - "driver"); + status = ucm_cuda_install_hooks(); if (status != UCS_OK) { - ucm_warn("failed to install cuda memory hooks on driver API"); - } else if (!used_reloc) { - ucm_cudamem_installed = 1; - } else if (status == UCS_OK) { - /* Failed to install bistro hooks on all driver APIs, so need to install - hooks on runtime APIs. */ - status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc, - "runtime"); - if (status == UCS_OK) { - ucm_cudamem_installed = 1; - } else { - ucm_warn("failed to install cuda memory hooks on runtime API") - } + ucm_diag("failed to install cuda memory hooks on driver API"); + goto out_unlock; } + ucm_cudamem_installed = 1; + out_unlock: pthread_mutex_unlock(&install_mutex); out: diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h index cd5a4087e27e..0939d201a8c7 100644 --- a/src/ucm/cuda/cudamem.h +++ b/src/ucm/cuda/cudamem.h @@ -7,7 +7,6 @@ #ifndef UCM_CUDAMEM_H_ #define UCM_CUDAMEM_H_ -#include #include @@ -25,11 +24,4 @@ CUresult ucm_cuMemFree_v2(CUdeviceptr dptr); CUresult ucm_cuMemFreeHost(void *p); CUresult ucm_cuMemFreeHost_v2(void *p); -cudaError_t ucm_cudaFree(void *devPtr); -cudaError_t ucm_cudaFreeHost(void *ptr); -cudaError_t ucm_cudaMalloc(void **devPtr, size_t size); -cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags); -cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch, - size_t width, size_t height); - #endif diff --git a/src/ucm/util/reloc.c b/src/ucm/util/reloc.c index bee825b95b76..ab4404f716c3 100644 --- a/src/ucm/util/reloc.c +++ b/src/ucm/util/reloc.c @@ -28,6 +28,7 @@ #include #include #include +#include /* Ensure this macro is defined (from ) - otherwise, cppcheck might fail with an "unknown macro" warning */ @@ -116,6 +117,13 @@ static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p) return UCS_OK; } + phsize = getauxval(AT_PHENT); + if (phsize > 0) { + ucm_debug("phsize=%d from api", phsize); + *phsize_p = phsize; + return UCS_OK; + } + fd = open(proc_auxv_filename, O_RDONLY); if (fd < 0) { ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename); diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c index d0c32af2fed7..49794804364b 100644 --- a/src/ucs/config/global_opts.c +++ b/src/ucs/config/global_opts.c @@ -37,7 +37,7 @@ ucs_global_opts_t ucs_global_opts = { .debug_signo = SIGHUP, .log_level_trigger = UCS_LOG_LEVEL_FATAL, .warn_unused_env_vars = 1, - .enable_memtype_cache = 1, + .enable_memtype_cache = UCS_TRY, .async_max_events = 64, .async_signo = SIGALRM, .stats_dest = "", @@ -145,9 +145,9 @@ static ucs_config_field_t ucs_global_opts_table[] = { "configuration parser.", ucs_offsetof(ucs_global_opts_t, warn_unused_env_vars), UCS_CONFIG_TYPE_BOOL}, - {"MEMTYPE_CACHE", "y", + {"MEMTYPE_CACHE", "try", "Enable memory type (cuda/rocm) cache", - ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_BOOL}, + ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_TERNARY}, {"ASYNC_MAX_EVENTS", "1024", /* TODO remove this; resize mpmc */ "Maximal number of events which can be handled from one context", diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h index 6f7a00658b63..1a55c0603038 100644 --- a/src/ucs/config/global_opts.h +++ b/src/ucs/config/global_opts.h @@ -80,7 +80,7 @@ typedef struct { unsigned async_max_events; /** Memtype cache */ - int enable_memtype_cache; + ucs_ternary_auto_value_t enable_memtype_cache; /* Destination for statistics: udp:host:port / file:path / stdout */ diff --git a/src/ucs/memory/memtype_cache.c b/src/ucs/memory/memtype_cache.c index bc3eacf7ec2c..ebec3ffd47e2 100644 --- a/src/ucs/memory/memtype_cache.c +++ b/src/ucs/memory/memtype_cache.c @@ -24,6 +24,7 @@ static ucs_spinlock_t ucs_memtype_cache_global_instance_lock; +static int ucs_memtype_cache_failed = 0; ucs_memtype_cache_t *ucs_memtype_cache_global_instance = NULL; @@ -42,16 +43,23 @@ static UCS_F_ALWAYS_INLINE ucs_memtype_cache_t *ucs_memtype_cache_get_global() ucs_memtype_cache_t *memtype_cache = NULL; ucs_status_t status; - if (!ucs_global_opts.enable_memtype_cache) { + if (ucs_global_opts.enable_memtype_cache == UCS_NO) { return NULL; } /* Double-check lock scheme */ - if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL)) { + if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL) && + !ucs_memtype_cache_failed) { /* Create the memtype cache outside the lock, to avoid a Coverity error of lock inversion with UCS_INIT_ONCE from ucm_set_event_handler() */ status = UCS_CLASS_NEW(ucs_memtype_cache_t, &memtype_cache); if (status != UCS_OK) { + /* If we failed to create the memtype cache once, do not try again */ + ucs_memtype_cache_failed = 1; + if (ucs_global_opts.enable_memtype_cache == UCS_YES) { + ucs_warn("failed to create memtype cache: %s", + ucs_status_string(status)); + } return NULL; } @@ -377,9 +385,9 @@ static UCS_CLASS_INIT_FUNC(ucs_memtype_cache_t) UCM_EVENT_FLAG_EXISTING_ALLOC, 1000, ucs_memtype_cache_event_callback, self); - if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { - ucs_error("failed to set UCM memtype event handler: %s", - ucs_status_string(status)); + if (status != UCS_OK) { + ucs_diag("failed to set UCM memtype event handler: %s", + ucs_status_string(status)); goto err_cleanup_pgtable; } diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c index 302b457a0a92..80d5d3820353 100644 --- a/src/ucs/memory/rcache.c +++ b/src/ucs/memory/rcache.c @@ -1285,6 +1285,8 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority, ucs_rcache_unmapped_callback, self); if (status != UCS_OK) { + ucs_diag("rcache failed to install UCM event handler: %s", + ucs_status_string(status)); goto err_remove_vfs; }