Skip to content

Commit

Permalink
UCM/UCS: Fail to create memtype cache if cannot patch Cuda driver API
Browse files Browse the repository at this point in the history
- Drop reloc-based cuda hooks, use only bistro on driver API.
- Propagate error from UCM to memtype cache initialization.
- If failed to create memtype cache once, don't try again.
- Use getauxv() API if possible instead of reading /proc/self/auxv
  directly - fixes permissions errors on some systems.
- Enable Cuda bistro hooks also with valgrind, since it doesn't affect
  heap memory allocations.
- Don't run Cuda reloc hooks test in CI.
- Fix error message in tests.
  • Loading branch information
yosefe committed Jan 23, 2022
1 parent ea16f70 commit 8e1b3be
Show file tree
Hide file tree
Showing 13 changed files with 113 additions and 173 deletions.
7 changes: 7 additions & 0 deletions config/m4/ucm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ AC_CHECK_DECLS([MADV_FREE,
[#include <sys/mman.h>])


#
# getauxval()
#
AC_CHECK_DECLS([getauxval], [], [],
[#include <sys/auxv.h>])


# BISTRO hooks infrastructure
#
# SYS_xxx macro
Expand Down
51 changes: 22 additions & 29 deletions contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1060,40 +1060,33 @@ test_malloc_hook() {
cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic
cuda_static_exe=./test/apps/test_cuda_hook_static

for mode in reloc bistro
do
export UCX_MEM_CUDA_HOOK_MODE=${mode}

# Run cuda memory hooks with dynamic link
${cuda_dynamic_exe}
# Run cuda memory hooks with dynamic link
${cuda_dynamic_exe}

# Run cuda memory hooks with static link, if exists. If the static
# library 'libcudart_static.a' is not present, static test will not
# be built.
if [ -x ${cuda_static_exe} ]
# Run cuda memory hooks with static link, if exists. If the static
# library 'libcudart_static.a' is not present, static test will not
# be built.
if [ -x ${cuda_static_exe} ]
then
${cuda_static_exe} && status="pass" || status="fail"
[ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail"
if [ ${status} == ${exp_status} ]
then
${cuda_static_exe} && status="pass" || status="fail"
[ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail"
if [ ${status} == ${exp_status} ]
then
echo "Static link with cuda ${status}, as expected"
else
echo "Static link with cuda is expected to ${exp_status}, actual: ${status}"
exit 1
fi
echo "Static link with cuda ${status}, as expected"
else
echo "Static link with cuda is expected to ${exp_status}, actual: ${status}"
exit 1
fi
fi

# Test that driver API hooks work in both reloc and bistro modes,
# since we call them directly from the test
${cuda_dynamic_exe} -d
[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d

# Test hooks in gtest
UCX_MEM_LOG_LEVEL=diag \
./test/gtest/gtest --gtest_filter='cuda_hooks.*'
# Test that driver API hooks work in both reloc and bistro modes,
# since we call them directly from the test
${cuda_dynamic_exe} -d
[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d

unset UCX_MEM_CUDA_HOOK_MODE
done
# Test hooks in gtest
UCX_MEM_LOG_LEVEL=diag \
./test/gtest/gtest --gtest_filter='cuda_hooks.*'
fi
}

Expand Down
134 changes: 32 additions & 102 deletions src/ucm/cuda/cudamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,6 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*)

/* Runtime API */
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**,
size_t, unsigned int)
UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**,
size_t*, size_t, size_t)

static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length,
ucs_memory_type_t mem_type)
{
Expand Down Expand Up @@ -160,100 +151,50 @@ UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA)
UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST)
UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST)

static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cuMemAlloc),
UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2),
UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFree),
UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
{{NULL}, NULL}
};

/* Runtime API replacements */
UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess,
arg0, void*, "size=%zu", size_t)
UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED,
cudaError_t, cudaSuccess, arg0, void*,
"size=%zu flags=0x%x", size_t, unsigned)
UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t,
cudaSuccess, (size_t)arg1 * arg2, void*,
"pitch=%p width=%zu height=%zu", size_t*, size_t, size_t)
UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA)
UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST)

static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
UCM_CUDA_FUNC_ENTRY(cudaFree),
UCM_CUDA_FUNC_ENTRY(cudaFreeHost),
UCM_CUDA_FUNC_ENTRY(cudaMalloc),
UCM_CUDA_FUNC_ENTRY(cudaMallocManaged),
UCM_CUDA_FUNC_ENTRY(cudaMallocPitch),
{{NULL}, NULL}
};

static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
{
return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
(ucm_get_hook_mode(mode) == mode);
}

static ucs_status_t
ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
const char *name)
static ucs_status_t ucm_cuda_install_hooks()
{
static ucm_cuda_func_t funcs[] = {
UCM_CUDA_FUNC_ENTRY(cuMemAlloc),
UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2),
UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFree),
UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
{{NULL}, NULL}
};
const char UCS_V_UNUSED *hook_mode;
unsigned num_bistro, num_reloc;
ucm_cuda_func_t *func;
ucs_status_t status;
unsigned num_hooks;
void *func_ptr;

num_bistro = 0;
num_reloc = 0;
num_hooks = 0;
for (func = funcs; func->patch.symbol != NULL; ++func) {
func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value);
if (func_ptr == NULL) {
continue;
}

status = UCS_ERR_UNSUPPORTED;

if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
status = ucm_bistro_patch(func_ptr, func->patch.value,
func->patch.symbol, func->orig_func_ptr,
NULL);
if (status == UCS_OK) {
ucm_debug("installed bistro hook for '%s': %d",
func->patch.symbol, status);
++num_bistro;
continue;
}

status = ucm_bistro_patch(func_ptr, func->patch.value,
func->patch.symbol, func->orig_func_ptr,
NULL);
if (status != UCS_OK) {
ucm_debug("failed to install bistro hook for '%s'",
func->patch.symbol);
return status;
}

if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
status = ucm_reloc_modify(&func->patch);
if (status == UCS_OK) {
++num_reloc;
ucm_debug("installed reloc hook on '%s'", func->patch.symbol);
continue;
}

ucm_debug("failed to install relocation table hook for '%s'",
func->patch.symbol);
}

ucm_diag("failed to install hook for '%s'", func->patch.symbol);
return status;
ucm_debug("installed bistro hook for '%s': %d", func->patch.symbol,
status);
++num_hooks;
}

*used_reloc = num_reloc > 0;
ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc",
name, num_bistro, num_reloc);
ucm_info("installed %u/%zu cuda memory hooks", num_hooks,
ucs_static_array_size(funcs) - 1);
return UCS_OK;
}

Expand All @@ -262,14 +203,13 @@ static ucs_status_t ucm_cudamem_install(int events)
static int ucm_cudamem_installed = 0;
static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
ucs_status_t status = UCS_OK;
int used_reloc;

if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
goto out;
}

if (ucm_global_opts.cuda_hook_modes == 0) {
ucm_info("cuda memory hooks are disabled by configuration");
if (!(ucm_global_opts.cuda_hook_modes & UCS_BIT(UCM_MMAP_HOOK_BISTRO))) {
ucm_diag("cuda memory hooks require bistro mode");
status = UCS_ERR_UNSUPPORTED;
goto out;
}
Expand All @@ -280,24 +220,14 @@ static ucs_status_t ucm_cudamem_install(int events)
goto out_unlock;
}

status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc,
"driver");
status = ucm_cuda_install_hooks();
if (status != UCS_OK) {
ucm_warn("failed to install cuda memory hooks on driver API");
} else if (!used_reloc) {
ucm_cudamem_installed = 1;
} else if (status == UCS_OK) {
/* Failed to install bistro hooks on all driver APIs, so need to install
hooks on runtime APIs. */
status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc,
"runtime");
if (status == UCS_OK) {
ucm_cudamem_installed = 1;
} else {
ucm_warn("failed to install cuda memory hooks on runtime API")
}
ucm_diag("failed to install cuda memory hooks on driver API");
goto out_unlock;
}

ucm_cudamem_installed = 1;

out_unlock:
pthread_mutex_unlock(&install_mutex);
out:
Expand Down
8 changes: 0 additions & 8 deletions src/ucm/cuda/cudamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#ifndef UCM_CUDAMEM_H_
#define UCM_CUDAMEM_H_

#include <cuda_runtime.h>
#include <cuda.h>


Expand All @@ -25,11 +24,4 @@ CUresult ucm_cuMemFree_v2(CUdeviceptr dptr);
CUresult ucm_cuMemFreeHost(void *p);
CUresult ucm_cuMemFreeHost_v2(void *p);

cudaError_t ucm_cudaFree(void *devPtr);
cudaError_t ucm_cudaFreeHost(void *ptr);
cudaError_t ucm_cudaMalloc(void **devPtr, size_t size);
cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch,
size_t width, size_t height);

#endif
18 changes: 17 additions & 1 deletion src/ucm/mmap/mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,25 @@ ucs_status_t ucm_mmap_test_installed_events(int events);
ucs_status_t ucm_mmap_test_events(int events, const char *event_type);
void ucm_mmap_init();

/**
* Get memory hooks mode to use, based on the configured mode and runtime.
*
* @param config_mode Configured memory hook mode.
*
* @return Memory hook mode to use.
*/
static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t ucm_mmap_hook_mode(void)
{
return ucm_get_hook_mode(ucm_global_opts.mmap_hook_mode);
#ifdef __SANITIZE_ADDRESS__
return UCM_MMAP_HOOK_NONE;
#else
if (RUNNING_ON_VALGRIND &&
(ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO)) {
return UCM_MMAP_HOOK_RELOC;
}

return ucm_global_opts.mmap_hook_mode;
#endif
}

#endif
13 changes: 13 additions & 0 deletions src/ucm/util/reloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
#include <link.h>
#include <limits.h>

#ifdef HAVE_DECL_GETAUXVAL
#include <sys/auxv.h>
#endif


/* Ensure this macro is defined (from <link.h>) - otherwise, cppcheck might
fail with an "unknown macro" warning */
#ifndef ElfW
Expand Down Expand Up @@ -116,6 +121,14 @@ static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p)
return UCS_OK;
}

#ifdef HAVE_DECL_GETAUXVAL
phsize = getauxval(AT_PHENT);
if (phsize > 0) {
*phsize_p = phsize;
return UCS_OK;
}
#endif

fd = open(proc_auxv_filename, O_RDONLY);
if (fd < 0) {
ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename);
Expand Down
21 changes: 0 additions & 21 deletions src/ucm/util/sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,25 +110,4 @@ void *ucm_brk_syscall(void *addr);
pid_t ucm_get_tid();


/**
* Get memory hooks mode to use, based on the configured mode and runtime.
*
* @param config_mode Configured memory hook mode.
*
* @return Memory hook mode to use.
*/
static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t
ucm_get_hook_mode(ucm_mmap_hook_mode_t config_mode)
{
#ifdef __SANITIZE_ADDRESS__
return UCM_MMAP_HOOK_NONE;
#else
if (RUNNING_ON_VALGRIND && (config_mode == UCM_MMAP_HOOK_BISTRO)) {
return UCM_MMAP_HOOK_RELOC;
}

return config_mode;
#endif
}

#endif
6 changes: 3 additions & 3 deletions src/ucs/config/global_opts.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ ucs_global_opts_t ucs_global_opts = {
.debug_signo = SIGHUP,
.log_level_trigger = UCS_LOG_LEVEL_FATAL,
.warn_unused_env_vars = 1,
.enable_memtype_cache = 1,
.enable_memtype_cache = UCS_TRY,
.async_max_events = 64,
.async_signo = SIGALRM,
.stats_dest = "",
Expand Down Expand Up @@ -145,9 +145,9 @@ static ucs_config_field_t ucs_global_opts_table[] = {
"configuration parser.",
ucs_offsetof(ucs_global_opts_t, warn_unused_env_vars), UCS_CONFIG_TYPE_BOOL},

{"MEMTYPE_CACHE", "y",
{"MEMTYPE_CACHE", "try",
"Enable memory type (cuda/rocm) cache",
ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_BOOL},
ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_TERNARY},

{"ASYNC_MAX_EVENTS", "1024", /* TODO remove this; resize mpmc */
"Maximal number of events which can be handled from one context",
Expand Down
2 changes: 1 addition & 1 deletion src/ucs/config/global_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ typedef struct {
unsigned async_max_events;

/** Memtype cache */
int enable_memtype_cache;
ucs_ternary_auto_value_t enable_memtype_cache;

/* Destination for statistics: udp:host:port / file:path / stdout
*/
Expand Down
Loading

0 comments on commit 8e1b3be

Please sign in to comment.