From 8e1b3be5d4211bce303a3b6a7e3719d6bf04f067 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sat, 22 Jan 2022 23:15:46 +0200 Subject: [PATCH] UCM/UCS: Fail to create memtype cache if cannot patch Cuda driver API - Drop reloc-based cuda hooks, use only bistro on driver API. - Propagate error from UCM to memtype cache initialization. - If failed to create memtype cache once, don't try again. - Use getauxv() API if possible instead of reading /proc/self/auxv directly - fixes permissions errors on some systems. - Enable Cuda bistro hooks also with valgrind, since it doesn't affect heap memory allocations. - Don't run Cuda reloc hooks test in CI. - Fix error message in tests. --- config/m4/ucm.m4 | 7 ++ contrib/test_jenkins.sh | 51 +++++----- src/ucm/cuda/cudamem.c | 134 +++++++-------------------- src/ucm/cuda/cudamem.h | 8 -- src/ucm/mmap/mmap.h | 18 +++- src/ucm/util/reloc.c | 13 +++ src/ucm/util/sys.h | 21 ----- src/ucs/config/global_opts.c | 6 +- src/ucs/config/global_opts.h | 2 +- src/ucs/memory/memtype_cache.c | 18 +++- src/ucs/memory/rcache.c | 2 + test/gtest/ucs/test_memtype_cache.cc | 4 +- test/gtest/uct/test_p2p_err.cc | 2 +- 13 files changed, 113 insertions(+), 173 deletions(-) diff --git a/config/m4/ucm.m4 b/config/m4/ucm.m4 index 1e229edc51f2..8d7a9e40ec06 100644 --- a/config/m4/ucm.m4 +++ b/config/m4/ucm.m4 @@ -46,6 +46,13 @@ AC_CHECK_DECLS([MADV_FREE, [#include ]) +# +# getauxval() +# +AC_CHECK_DECLS([getauxval], [], [], + [#include ]) + + # BISTRO hooks infrastructure # # SYS_xxx macro diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index 1b9fc54cec02..c9b2696b9fd2 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -1060,40 +1060,33 @@ test_malloc_hook() { cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic cuda_static_exe=./test/apps/test_cuda_hook_static - for mode in reloc bistro - do - export UCX_MEM_CUDA_HOOK_MODE=${mode} - - # Run cuda memory hooks with dynamic link - ${cuda_dynamic_exe} + # Run cuda memory hooks with dynamic link + ${cuda_dynamic_exe} - # Run cuda memory hooks with static link, if exists. If the static - # library 'libcudart_static.a' is not present, static test will not - # be built. - if [ -x ${cuda_static_exe} ] + # Run cuda memory hooks with static link, if exists. If the static + # library 'libcudart_static.a' is not present, static test will not + # be built. + if [ -x ${cuda_static_exe} ] + then + ${cuda_static_exe} && status="pass" || status="fail" + [ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail" + if [ ${status} == ${exp_status} ] then - ${cuda_static_exe} && status="pass" || status="fail" - [ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail" - if [ ${status} == ${exp_status} ] - then - echo "Static link with cuda ${status}, as expected" - else - echo "Static link with cuda is expected to ${exp_status}, actual: ${status}" - exit 1 - fi + echo "Static link with cuda ${status}, as expected" + else + echo "Static link with cuda is expected to ${exp_status}, actual: ${status}" + exit 1 fi + fi - # Test that driver API hooks work in both reloc and bistro modes, - # since we call them directly from the test - ${cuda_dynamic_exe} -d - [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d - - # Test hooks in gtest - UCX_MEM_LOG_LEVEL=diag \ - ./test/gtest/gtest --gtest_filter='cuda_hooks.*' + # Test that driver API hooks work in both reloc and bistro modes, + # since we call them directly from the test + ${cuda_dynamic_exe} -d + [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d - unset UCX_MEM_CUDA_HOOK_MODE - done + # Test hooks in gtest + UCX_MEM_LOG_LEVEL=diag \ + ./test/gtest/gtest --gtest_filter='cuda_hooks.*' fi } diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c index 23a9c5d18061..106a85d1cd0b 100644 --- a/src/ucm/cuda/cudamem.c +++ b/src/ucm/cuda/cudamem.c @@ -87,15 +87,6 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr) UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*) UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*) -/* Runtime API */ -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**, - size_t, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**, - size_t*, size_t, size_t) - static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length, ucs_memory_type_t mem_type) { @@ -160,100 +151,50 @@ UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST) UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST) -static ucm_cuda_func_t ucm_cuda_driver_funcs[] = { - UCM_CUDA_FUNC_ENTRY(cuMemAlloc), - UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), - UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), - UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), - UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), - UCM_CUDA_FUNC_ENTRY(cuMemFree), - UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), - UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), - UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), - {{NULL}, NULL} -}; - -/* Runtime API replacements */ -UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess, - arg0, void*, "size=%zu", size_t) -UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, - cudaError_t, cudaSuccess, arg0, void*, - "size=%zu flags=0x%x", size_t, unsigned) -UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t, - cudaSuccess, (size_t)arg1 * arg2, void*, - "pitch=%p width=%zu height=%zu", size_t*, size_t, size_t) -UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA) -UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST) - -static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = { - UCM_CUDA_FUNC_ENTRY(cudaFree), - UCM_CUDA_FUNC_ENTRY(cudaFreeHost), - UCM_CUDA_FUNC_ENTRY(cudaMalloc), - UCM_CUDA_FUNC_ENTRY(cudaMallocManaged), - UCM_CUDA_FUNC_ENTRY(cudaMallocPitch), - {{NULL}, NULL} -}; -static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode) -{ - return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) && - (ucm_get_hook_mode(mode) == mode); -} - -static ucs_status_t -ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc, - const char *name) +static ucs_status_t ucm_cuda_install_hooks() { + static ucm_cuda_func_t funcs[] = { + UCM_CUDA_FUNC_ENTRY(cuMemAlloc), + UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), + UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFree), + UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), + {{NULL}, NULL} + }; const char UCS_V_UNUSED *hook_mode; - unsigned num_bistro, num_reloc; ucm_cuda_func_t *func; ucs_status_t status; + unsigned num_hooks; void *func_ptr; - num_bistro = 0; - num_reloc = 0; + num_hooks = 0; for (func = funcs; func->patch.symbol != NULL; ++func) { func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value); if (func_ptr == NULL) { continue; } - status = UCS_ERR_UNSUPPORTED; - - if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) { - status = ucm_bistro_patch(func_ptr, func->patch.value, - func->patch.symbol, func->orig_func_ptr, - NULL); - if (status == UCS_OK) { - ucm_debug("installed bistro hook for '%s': %d", - func->patch.symbol, status); - ++num_bistro; - continue; - } - + status = ucm_bistro_patch(func_ptr, func->patch.value, + func->patch.symbol, func->orig_func_ptr, + NULL); + if (status != UCS_OK) { ucm_debug("failed to install bistro hook for '%s'", func->patch.symbol); + return status; } - if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) { - status = ucm_reloc_modify(&func->patch); - if (status == UCS_OK) { - ++num_reloc; - ucm_debug("installed reloc hook on '%s'", func->patch.symbol); - continue; - } - - ucm_debug("failed to install relocation table hook for '%s'", - func->patch.symbol); - } - - ucm_diag("failed to install hook for '%s'", func->patch.symbol); - return status; + ucm_debug("installed bistro hook for '%s': %d", func->patch.symbol, + status); + ++num_hooks; } - *used_reloc = num_reloc > 0; - ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc", - name, num_bistro, num_reloc); + ucm_info("installed %u/%zu cuda memory hooks", num_hooks, + ucs_static_array_size(funcs) - 1); return UCS_OK; } @@ -262,14 +203,13 @@ static ucs_status_t ucm_cudamem_install(int events) static int ucm_cudamem_installed = 0; static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER; ucs_status_t status = UCS_OK; - int used_reloc; if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) { goto out; } - if (ucm_global_opts.cuda_hook_modes == 0) { - ucm_info("cuda memory hooks are disabled by configuration"); + if (!(ucm_global_opts.cuda_hook_modes & UCS_BIT(UCM_MMAP_HOOK_BISTRO))) { + ucm_diag("cuda memory hooks require bistro mode"); status = UCS_ERR_UNSUPPORTED; goto out; } @@ -280,24 +220,14 @@ static ucs_status_t ucm_cudamem_install(int events) goto out_unlock; } - status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc, - "driver"); + status = ucm_cuda_install_hooks(); if (status != UCS_OK) { - ucm_warn("failed to install cuda memory hooks on driver API"); - } else if (!used_reloc) { - ucm_cudamem_installed = 1; - } else if (status == UCS_OK) { - /* Failed to install bistro hooks on all driver APIs, so need to install - hooks on runtime APIs. */ - status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc, - "runtime"); - if (status == UCS_OK) { - ucm_cudamem_installed = 1; - } else { - ucm_warn("failed to install cuda memory hooks on runtime API") - } + ucm_diag("failed to install cuda memory hooks on driver API"); + goto out_unlock; } + ucm_cudamem_installed = 1; + out_unlock: pthread_mutex_unlock(&install_mutex); out: diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h index cd5a4087e27e..0939d201a8c7 100644 --- a/src/ucm/cuda/cudamem.h +++ b/src/ucm/cuda/cudamem.h @@ -7,7 +7,6 @@ #ifndef UCM_CUDAMEM_H_ #define UCM_CUDAMEM_H_ -#include #include @@ -25,11 +24,4 @@ CUresult ucm_cuMemFree_v2(CUdeviceptr dptr); CUresult ucm_cuMemFreeHost(void *p); CUresult ucm_cuMemFreeHost_v2(void *p); -cudaError_t ucm_cudaFree(void *devPtr); -cudaError_t ucm_cudaFreeHost(void *ptr); -cudaError_t ucm_cudaMalloc(void **devPtr, size_t size); -cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags); -cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch, - size_t width, size_t height); - #endif diff --git a/src/ucm/mmap/mmap.h b/src/ucm/mmap/mmap.h index c7b4e4e37b21..f825172350d7 100644 --- a/src/ucm/mmap/mmap.h +++ b/src/ucm/mmap/mmap.h @@ -39,9 +39,25 @@ ucs_status_t ucm_mmap_test_installed_events(int events); ucs_status_t ucm_mmap_test_events(int events, const char *event_type); void ucm_mmap_init(); +/** + * Get memory hooks mode to use, based on the configured mode and runtime. + * + * @param config_mode Configured memory hook mode. + * + * @return Memory hook mode to use. + */ static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t ucm_mmap_hook_mode(void) { - return ucm_get_hook_mode(ucm_global_opts.mmap_hook_mode); +#ifdef __SANITIZE_ADDRESS__ + return UCM_MMAP_HOOK_NONE; +#else + if (RUNNING_ON_VALGRIND && + (ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO)) { + return UCM_MMAP_HOOK_RELOC; + } + + return ucm_global_opts.mmap_hook_mode; +#endif } #endif diff --git a/src/ucm/util/reloc.c b/src/ucm/util/reloc.c index bee825b95b76..de890bf19e5f 100644 --- a/src/ucm/util/reloc.c +++ b/src/ucm/util/reloc.c @@ -29,6 +29,11 @@ #include #include +#ifdef HAVE_DECL_GETAUXVAL +#include +#endif + + /* Ensure this macro is defined (from ) - otherwise, cppcheck might fail with an "unknown macro" warning */ #ifndef ElfW @@ -116,6 +121,14 @@ static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p) return UCS_OK; } +#ifdef HAVE_DECL_GETAUXVAL + phsize = getauxval(AT_PHENT); + if (phsize > 0) { + *phsize_p = phsize; + return UCS_OK; + } +#endif + fd = open(proc_auxv_filename, O_RDONLY); if (fd < 0) { ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename); diff --git a/src/ucm/util/sys.h b/src/ucm/util/sys.h index 838fb61a1c57..e00650ea0eed 100644 --- a/src/ucm/util/sys.h +++ b/src/ucm/util/sys.h @@ -110,25 +110,4 @@ void *ucm_brk_syscall(void *addr); pid_t ucm_get_tid(); -/** - * Get memory hooks mode to use, based on the configured mode and runtime. - * - * @param config_mode Configured memory hook mode. - * - * @return Memory hook mode to use. - */ -static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t -ucm_get_hook_mode(ucm_mmap_hook_mode_t config_mode) -{ -#ifdef __SANITIZE_ADDRESS__ - return UCM_MMAP_HOOK_NONE; -#else - if (RUNNING_ON_VALGRIND && (config_mode == UCM_MMAP_HOOK_BISTRO)) { - return UCM_MMAP_HOOK_RELOC; - } - - return config_mode; -#endif -} - #endif diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c index d0c32af2fed7..49794804364b 100644 --- a/src/ucs/config/global_opts.c +++ b/src/ucs/config/global_opts.c @@ -37,7 +37,7 @@ ucs_global_opts_t ucs_global_opts = { .debug_signo = SIGHUP, .log_level_trigger = UCS_LOG_LEVEL_FATAL, .warn_unused_env_vars = 1, - .enable_memtype_cache = 1, + .enable_memtype_cache = UCS_TRY, .async_max_events = 64, .async_signo = SIGALRM, .stats_dest = "", @@ -145,9 +145,9 @@ static ucs_config_field_t ucs_global_opts_table[] = { "configuration parser.", ucs_offsetof(ucs_global_opts_t, warn_unused_env_vars), UCS_CONFIG_TYPE_BOOL}, - {"MEMTYPE_CACHE", "y", + {"MEMTYPE_CACHE", "try", "Enable memory type (cuda/rocm) cache", - ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_BOOL}, + ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_TERNARY}, {"ASYNC_MAX_EVENTS", "1024", /* TODO remove this; resize mpmc */ "Maximal number of events which can be handled from one context", diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h index 6f7a00658b63..1a55c0603038 100644 --- a/src/ucs/config/global_opts.h +++ b/src/ucs/config/global_opts.h @@ -80,7 +80,7 @@ typedef struct { unsigned async_max_events; /** Memtype cache */ - int enable_memtype_cache; + ucs_ternary_auto_value_t enable_memtype_cache; /* Destination for statistics: udp:host:port / file:path / stdout */ diff --git a/src/ucs/memory/memtype_cache.c b/src/ucs/memory/memtype_cache.c index bc3eacf7ec2c..ebec3ffd47e2 100644 --- a/src/ucs/memory/memtype_cache.c +++ b/src/ucs/memory/memtype_cache.c @@ -24,6 +24,7 @@ static ucs_spinlock_t ucs_memtype_cache_global_instance_lock; +static int ucs_memtype_cache_failed = 0; ucs_memtype_cache_t *ucs_memtype_cache_global_instance = NULL; @@ -42,16 +43,23 @@ static UCS_F_ALWAYS_INLINE ucs_memtype_cache_t *ucs_memtype_cache_get_global() ucs_memtype_cache_t *memtype_cache = NULL; ucs_status_t status; - if (!ucs_global_opts.enable_memtype_cache) { + if (ucs_global_opts.enable_memtype_cache == UCS_NO) { return NULL; } /* Double-check lock scheme */ - if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL)) { + if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL) && + !ucs_memtype_cache_failed) { /* Create the memtype cache outside the lock, to avoid a Coverity error of lock inversion with UCS_INIT_ONCE from ucm_set_event_handler() */ status = UCS_CLASS_NEW(ucs_memtype_cache_t, &memtype_cache); if (status != UCS_OK) { + /* If we failed to create the memtype cache once, do not try again */ + ucs_memtype_cache_failed = 1; + if (ucs_global_opts.enable_memtype_cache == UCS_YES) { + ucs_warn("failed to create memtype cache: %s", + ucs_status_string(status)); + } return NULL; } @@ -377,9 +385,9 @@ static UCS_CLASS_INIT_FUNC(ucs_memtype_cache_t) UCM_EVENT_FLAG_EXISTING_ALLOC, 1000, ucs_memtype_cache_event_callback, self); - if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { - ucs_error("failed to set UCM memtype event handler: %s", - ucs_status_string(status)); + if (status != UCS_OK) { + ucs_diag("failed to set UCM memtype event handler: %s", + ucs_status_string(status)); goto err_cleanup_pgtable; } diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c index 302b457a0a92..80d5d3820353 100644 --- a/src/ucs/memory/rcache.c +++ b/src/ucs/memory/rcache.c @@ -1285,6 +1285,8 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority, ucs_rcache_unmapped_callback, self); if (status != UCS_OK) { + ucs_diag("rcache failed to install UCM event handler: %s", + ucs_status_string(status)); goto err_remove_vfs; } diff --git a/test/gtest/ucs/test_memtype_cache.cc b/test/gtest/ucs/test_memtype_cache.cc index 75d750a928f5..79fdfac44065 100644 --- a/test/gtest/ucs/test_memtype_cache.cc +++ b/test/gtest/ucs/test_memtype_cache.cc @@ -45,13 +45,13 @@ class test_memtype_cache : public ucs::test_with_param { if (!expect_found || (expected_type == UCS_MEMORY_TYPE_HOST)) { /* memory type should be not found or unknown */ if (status != UCS_ERR_NO_ELEM) { - ASSERT_UCS_OK(status, << "ptr=" << ptr << " size=" << size); + ASSERT_UCS_OK(status, << " ptr=" << ptr << " size=" << size); EXPECT_EQ(UCS_MEMORY_TYPE_UNKNOWN, mem_info.type) << "ptr=" << ptr << " size=" << size << mem_buffer::mem_type_name(mem_info.type); } } else { - ASSERT_UCS_OK(status, << "ptr=" << ptr << " size=" << size); + ASSERT_UCS_OK(status, << " ptr=" << ptr << " size=" << size); EXPECT_TRUE((UCS_MEMORY_TYPE_UNKNOWN == mem_info.type) || (expected_type == mem_info.type)) << "ptr=" << ptr << " size=" << size diff --git a/test/gtest/uct/test_p2p_err.cc b/test/gtest/uct/test_p2p_err.cc index 1a4471ac3bc4..5825ec0b84c2 100644 --- a/test/gtest/uct/test_p2p_err.cc +++ b/test/gtest/uct/test_p2p_err.cc @@ -126,7 +126,7 @@ class uct_p2p_err_test : public uct_p2p_test { { void *address = NULL; ucs_status_t status = ucs_mmap_alloc(&length, &address, 0, "test_dummy"); - ASSERT_UCS_OK(status, << "length = " << length); + ASSERT_UCS_OK(status, << " length = " << length); status = ucs_mmap_free(address, length); ASSERT_UCS_OK(status); /* coverity[use_after_free] */