Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCM/UCS: Fail to create memtype cache if cannot patch Cuda driver API #7865

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions config/m4/ucm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ AC_CHECK_DECLS([MADV_FREE,
[#include <sys/mman.h>])


#
# getauxval()
#
AC_CHECK_DECLS([getauxval], [], [],
[#include <sys/auxv.h>])


# BISTRO hooks infrastructure
#
# SYS_xxx macro
Expand Down
101 changes: 44 additions & 57 deletions src/ucm/cuda/cudamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,75 +194,66 @@ static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
{{NULL}, NULL}
};

static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
{
return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
(ucm_get_hook_mode(mode) == mode);
}

static ucs_status_t
ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
const char *name)
ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, const char *name,
ucm_mmap_hook_mode_t mode, int *installed_hooks_p)
{
const char UCS_V_UNUSED *hook_mode;
unsigned num_bistro, num_reloc;
ucm_cuda_func_t *func;
ucs_status_t status;
void *func_ptr;
int count;

if (*installed_hooks_p & UCS_BIT(mode)) {
return UCS_OK;
}

num_bistro = 0;
num_reloc = 0;
if (!(ucm_global_opts.cuda_hook_modes & UCS_BIT(mode))) {
/* Disabled by configuration */
ucm_debug("cuda memory hooks mode %s is disabled for %s API",
ucm_mmap_hook_modes[mode], name);
return UCS_OK;
}

count = 0;
for (func = funcs; func->patch.symbol != NULL; ++func) {
func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value);
if (func_ptr == NULL) {
continue;
}

status = UCS_ERR_UNSUPPORTED;

if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
if (mode == UCM_MMAP_HOOK_BISTRO) {
status = ucm_bistro_patch(func_ptr, func->patch.value,
func->patch.symbol, func->orig_func_ptr,
NULL);
if (status == UCS_OK) {
ucm_debug("installed bistro hook for '%s': %d",
func->patch.symbol, status);
++num_bistro;
continue;
}

ucm_debug("failed to install bistro hook for '%s'",
func->patch.symbol);
} else if (mode == UCM_MMAP_HOOK_RELOC) {
status = ucm_reloc_modify(&func->patch);
} else {
break;
}

if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
status = ucm_reloc_modify(&func->patch);
if (status == UCS_OK) {
++num_reloc;
ucm_debug("installed reloc hook on '%s'", func->patch.symbol);
continue;
}

ucm_debug("failed to install relocation table hook for '%s'",
func->patch.symbol);
if (status != UCS_OK) {
ucm_diag("failed to install %s hook for '%s'",
ucm_mmap_hook_modes[mode], func->patch.symbol);
return status;
}

ucm_diag("failed to install hook for '%s'", func->patch.symbol);
return status;
ucm_debug("installed %s hook for '%s'", ucm_mmap_hook_modes[mode],
func->patch.symbol);
++count;
}

*used_reloc = num_reloc > 0;
ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc",
name, num_bistro, num_reloc);
*installed_hooks_p |= UCS_BIT(mode);
ucm_info("cuda memory hooks mode %s: installed %d on %s API",
ucm_mmap_hook_modes[mode], count, name);
return UCS_OK;
}

static ucs_status_t ucm_cudamem_install(int events)
{
static int ucm_cudamem_installed = 0;
static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
static int driver_api_hooks = 0;
static int runtime_api_hooks = 0;
ucs_status_t status = UCS_OK;
int used_reloc;

if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
goto out;
Expand All @@ -276,26 +267,22 @@ static ucs_status_t ucm_cudamem_install(int events)

pthread_mutex_lock(&install_mutex);

if (ucm_cudamem_installed) {
status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, "driver",
UCM_MMAP_HOOK_BISTRO, &driver_api_hooks);
if (status != UCS_OK) {
goto out_unlock;
}

status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc,
"driver");
status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, "driver",
UCM_MMAP_HOOK_RELOC, &driver_api_hooks);
if (status != UCS_OK) {
ucm_warn("failed to install cuda memory hooks on driver API");
} else if (!used_reloc) {
ucm_cudamem_installed = 1;
} else if (status == UCS_OK) {
/* Failed to install bistro hooks on all driver APIs, so need to install
hooks on runtime APIs. */
status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc,
"runtime");
if (status == UCS_OK) {
ucm_cudamem_installed = 1;
} else {
ucm_warn("failed to install cuda memory hooks on runtime API")
}
goto out_unlock;
}

status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, "runtime",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to attempt installation of runtime hooks here? If we're not going to create memtype cache if driver hooks failed, can we not skip this step?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if driver bistro hooks failed, we do skip the other steps:

    status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, "driver",
                                    UCM_MMAP_HOOK_BISTRO, &driver_api_hooks);
    if (status != UCS_OK) {
        **goto out_unlock;**
    }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean can we not remove lines 282-285?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these lines are needed in case the user would like to set UCX_MEM_CUDA_HOOK_MODE=reloc manually

UCM_MMAP_HOOK_RELOC, &runtime_api_hooks);
if (status != UCS_OK) {
goto out_unlock;
}

out_unlock:
Expand Down
10 changes: 9 additions & 1 deletion src/ucm/mmap/install.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
((UCM_MMAP_MAX_EVENT_NAME_LEN + 2) * \
ucs_static_array_size(ucm_mmap_event_name))

extern const char *ucm_mmap_hook_modes[];

typedef struct ucm_mmap_func {
ucm_reloc_patch_t patch;
Expand Down Expand Up @@ -89,6 +88,15 @@ static ucm_mmap_func_t ucm_mmap_funcs[] = {
static pthread_mutex_t ucm_mmap_install_mutex = PTHREAD_MUTEX_INITIALIZER;
static int ucm_mmap_installed_events = 0; /* events that were reported as installed */

const char *ucm_mmap_hook_modes[] = {
[UCM_MMAP_HOOK_NONE] = "none",
[UCM_MMAP_HOOK_RELOC] = UCM_MMAP_HOOK_RELOC_STR,
#if UCM_BISTRO_HOOKS
[UCM_MMAP_HOOK_BISTRO] = UCM_MMAP_HOOK_BISTRO_STR,
#endif
[UCM_MMAP_HOOK_LAST] = NULL
};

static const char *ucm_mmap_event_name[] = {
/* Native events */
UCM_MMAP_EVENT_NAME_ENTRY(MMAP),
Expand Down
7 changes: 7 additions & 0 deletions src/ucm/mmap/mmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ ucs_status_t ucm_mmap_test_installed_events(int events);
ucs_status_t ucm_mmap_test_events(int events, const char *event_type);
void ucm_mmap_init();


/**
* Memory hooks mode names.
*/
extern const char *ucm_mmap_hook_modes[];


static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t ucm_mmap_hook_mode(void)
{
return ucm_get_hook_mode(ucm_global_opts.mmap_hook_mode);
Expand Down
13 changes: 13 additions & 0 deletions src/ucm/util/reloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
#include <link.h>
#include <limits.h>

#ifdef HAVE_DECL_GETAUXVAL
#include <sys/auxv.h>
#endif


/* Ensure this macro is defined (from <link.h>) - otherwise, cppcheck might
fail with an "unknown macro" warning */
#ifndef ElfW
Expand Down Expand Up @@ -116,6 +121,14 @@ static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p)
return UCS_OK;
}

#ifdef HAVE_DECL_GETAUXVAL
phsize = getauxval(AT_PHENT);
if (phsize > 0) {
*phsize_p = phsize;
return UCS_OK;
}
#endif

fd = open(proc_auxv_filename, O_RDONLY);
if (fd < 0) {
ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename);
Expand Down
6 changes: 3 additions & 3 deletions src/ucs/config/global_opts.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ ucs_global_opts_t ucs_global_opts = {
.debug_signo = SIGHUP,
.log_level_trigger = UCS_LOG_LEVEL_FATAL,
.warn_unused_env_vars = 1,
.enable_memtype_cache = 1,
.enable_memtype_cache = UCS_TRY,
.async_max_events = 64,
.async_signo = SIGALRM,
.stats_dest = "",
Expand Down Expand Up @@ -147,9 +147,9 @@ static ucs_config_field_t ucs_global_opts_table[] = {
"configuration parser.",
ucs_offsetof(ucs_global_opts_t, warn_unused_env_vars), UCS_CONFIG_TYPE_BOOL},

{"MEMTYPE_CACHE", "y",
{"MEMTYPE_CACHE", "try",
"Enable memory type (cuda/rocm) cache",
ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_BOOL},
ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_TERNARY},

{"ASYNC_MAX_EVENTS", "1024", /* TODO remove this; resize mpmc */
"Maximal number of events which can be handled from one context",
Expand Down
2 changes: 1 addition & 1 deletion src/ucs/config/global_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ typedef struct {
unsigned async_max_events;

/** Memtype cache */
int enable_memtype_cache;
ucs_ternary_auto_value_t enable_memtype_cache;

/* Destination for statistics: udp:host:port / file:path / stdout
*/
Expand Down
15 changes: 4 additions & 11 deletions src/ucs/config/ucm_opts.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,13 @@

#include <ucm/api/ucm.h>
#include <ucm/util/log.h>
#include <ucm/util/sys.h>
#include <ucm/mmap/mmap.h>
#include <ucs/sys/compiler.h>


#define UCM_CONFIG_PREFIX "MEM_"

static const char *ucm_mmap_hook_modes[] = {
[UCM_MMAP_HOOK_NONE] = "none",
[UCM_MMAP_HOOK_RELOC] = UCM_MMAP_HOOK_RELOC_STR,
#if UCM_BISTRO_HOOKS
[UCM_MMAP_HOOK_BISTRO] = UCM_MMAP_HOOK_BISTRO_STR,
#endif
[UCM_MMAP_HOOK_LAST] = NULL
};

static const char *ucm_module_unload_prevent_modes[] = {
[UCM_UNLOAD_PREVENT_MODE_LAZY] = "lazy",
[UCM_UNLOAD_PREVENT_MODE_NOW] = "now",
Expand Down Expand Up @@ -71,9 +63,10 @@ static ucs_config_field_t ucm_global_config_table[] = {

{"CUDA_HOOK_MODE",
#if UCM_BISTRO_HOOKS
UCM_MMAP_HOOK_BISTRO_STR ","
#endif
UCM_MMAP_HOOK_BISTRO_STR,
#else
UCM_MMAP_HOOK_RELOC_STR,
#endif
"Cuda memory hook modes. A combination of:\n"
" none - Don't set Cuda hooks.\n"
" reloc - Use ELF relocation table to set hooks. In this mode, if any\n"
Expand Down
18 changes: 13 additions & 5 deletions src/ucs/memory/memtype_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@


static ucs_spinlock_t ucs_memtype_cache_global_instance_lock;
static int ucs_memtype_cache_failed = 0;
ucs_memtype_cache_t *ucs_memtype_cache_global_instance = NULL;


Expand Down Expand Up @@ -56,16 +57,23 @@ static UCS_F_ALWAYS_INLINE ucs_memtype_cache_t *ucs_memtype_cache_get_global()
ucs_memtype_cache_t *memtype_cache = NULL;
ucs_status_t status;

if (!ucs_global_opts.enable_memtype_cache) {
if (ucs_global_opts.enable_memtype_cache == UCS_NO) {
return NULL;
}

/* Double-check lock scheme */
if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL)) {
if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL) &&
!ucs_memtype_cache_failed) {
/* Create the memtype cache outside the lock, to avoid a Coverity error
of lock inversion with UCS_INIT_ONCE from ucm_set_event_handler() */
status = UCS_CLASS_NEW(ucs_memtype_cache_t, &memtype_cache);
if (status != UCS_OK) {
/* If we failed to create the memtype cache once, do not try again */
ucs_memtype_cache_failed = 1;
if (ucs_global_opts.enable_memtype_cache == UCS_YES) {
ucs_warn("failed to create memtype cache: %s",
ucs_status_string(status));
Copy link
Contributor

@bureddy bureddy Feb 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should it be a hard error if the user is forced to enable memtype cache?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't think it should be a fatal error since we can still continue the program without memtype cache

}
return NULL;
}

Expand Down Expand Up @@ -386,9 +394,9 @@ static UCS_CLASS_INIT_FUNC(ucs_memtype_cache_t)
UCM_EVENT_FLAG_EXISTING_ALLOC,
1000, ucs_memtype_cache_event_callback,
self);
if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) {
ucs_error("failed to set UCM memtype event handler: %s",
ucs_status_string(status));
if (status != UCS_OK) {
ucs_diag("failed to set UCM memtype event handler: %s",
ucs_status_string(status));
goto err_cleanup_pgtable;
}

Expand Down
2 changes: 2 additions & 0 deletions src/ucs/memory/rcache.c
Original file line number Diff line number Diff line change
Expand Up @@ -1294,6 +1294,8 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority,
ucs_rcache_unmapped_callback, self);
if (status != UCS_OK) {
ucs_diag("rcache failed to install UCM event handler: %s",
ucs_status_string(status));
goto err_remove_vfs;
}

Expand Down
4 changes: 2 additions & 2 deletions test/gtest/ucs/test_memtype_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ class test_memtype_cache : public ucs::test_with_param<ucs_memory_type_t> {
if (!expect_found || (expected_type == UCS_MEMORY_TYPE_HOST)) {
/* memory type should be not found or unknown */
if (status != UCS_ERR_NO_ELEM) {
ASSERT_UCS_OK(status, << "ptr=" << ptr << " size=" << size);
ASSERT_UCS_OK(status, << " ptr=" << ptr << " size=" << size);
EXPECT_EQ(UCS_MEMORY_TYPE_UNKNOWN, mem_info.type)
<< "ptr=" << ptr << " size=" << size
<< mem_buffer::mem_type_name(mem_info.type);
}
} else {
ASSERT_UCS_OK(status, << "ptr=" << ptr << " size=" << size);
ASSERT_UCS_OK(status, << " ptr=" << ptr << " size=" << size);
EXPECT_TRUE((UCS_MEMORY_TYPE_UNKNOWN == mem_info.type) ||
(expected_type == mem_info.type))
<< "ptr=" << ptr << " size=" << size
Expand Down
2 changes: 1 addition & 1 deletion test/gtest/uct/test_p2p_err.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class uct_p2p_err_test : public uct_p2p_test {
{
void *address = NULL;
ucs_status_t status = ucs_mmap_alloc(&length, &address, 0, "test_dummy");
ASSERT_UCS_OK(status, << "length = " << length);
ASSERT_UCS_OK(status, << " length = " << length);
status = ucs_mmap_free(address, length);
ASSERT_UCS_OK(status);
/* coverity[use_after_free] */
Expand Down