From 0e815cd36e5d2567c36d6df584c8205ffb904a36 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Mon, 18 Sep 2017 00:10:50 -0700 Subject: [PATCH] cudaFree memory hook --- src/ucm/Makefile.am | 7 ++ src/ucm/api/ucm.h | 21 +++++- src/ucm/cuda/cudamem.h | 22 ++++++ src/ucm/cuda/install.c | 154 ++++++++++++++++++++++++++++++++++++++ src/ucm/cuda/replace.c | 102 +++++++++++++++++++++++++ src/ucm/event/event.c | 42 ++++++++++- src/ucm/util/ucm_config.c | 14 +++- src/ucm/util/ucm_config.h | 3 + 8 files changed, 362 insertions(+), 3 deletions(-) create mode 100644 src/ucm/cuda/cudamem.h create mode 100644 src/ucm/cuda/install.c create mode 100644 src/ucm/cuda/replace.c diff --git a/src/ucm/Makefile.am b/src/ucm/Makefile.am index 2215ddd2111..7db3ea7ccef 100644 --- a/src/ucm/Makefile.am +++ b/src/ucm/Makefile.am @@ -47,6 +47,13 @@ libucm_la_SOURCES = \ util/reloc.c \ util/sys.c +if HAVE_CUDA +libucm_la_SOURCES += \ + cuda/install.c \ + cuda/replace.c + +endif + if HAVE_UCM_PTMALLOC283 libucm_la_CPPFLAGS += \ -I$(srcdir)/ptmalloc283/sysdeps/pthread \ diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h index 0b024e3d29b..95b963eed73 100644 --- a/src/ucm/api/ucm.h +++ b/src/ucm/api/ucm.h @@ -9,7 +9,10 @@ #define UCM_H_ #include - +#if HAVE_CUDA +#include +#include +#endif BEGIN_C_DECLS #include @@ -32,6 +35,7 @@ typedef enum ucm_event_type { UCM_EVENT_SHMAT = UCS_BIT(3), UCM_EVENT_SHMDT = UCS_BIT(4), UCM_EVENT_SBRK = UCS_BIT(5), + UCM_EVENT_CUDAFREE = UCS_BIT(6), /* Aggregate events */ UCM_EVENT_VM_MAPPED = UCS_BIT(16), @@ -113,6 +117,13 @@ typedef union ucm_event { intptr_t increment; } sbrk; +#if HAVE_CUDA + struct { + int result; + void *address; + } cudaFree; +#endif + /* * UCM_EVENT_VM_MAPPED, UCM_EVENT_VM_UNMAPPED * @@ -296,6 +307,14 @@ int ucm_orig_shmdt(const void *shmaddr); */ void *ucm_orig_sbrk(intptr_t increment); +#if HAVE_CUDA + +cudaError_t ucm_orig_cudaFree(void *address); + +cudaError_t ucm_cudaFree(void *address); + +#endif + /** * @brief Call the original implementation of @ref mmap and all handlers diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h new file mode 100644 index 00000000000..e6665bc7704 --- /dev/null +++ b/src/ucm/cuda/cudamem.h @@ -0,0 +1,22 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCM_CUDAMEM_H_ +#define UCM_CUDAMEM_H_ + +#include +#include +#include + +ucs_status_t ucm_cudamem_install(int events); + +void ucm_cudamem_event_test_callback(ucm_event_type_t event_type, + ucm_event_t *event, void *arg); + + +cudaError_t ucm_override_cudaFree(void *addr); + +#endif diff --git a/src/ucm/cuda/install.c b/src/ucm/cuda/install.c new file mode 100644 index 00000000000..950803142eb --- /dev/null +++ b/src/ucm/cuda/install.c @@ -0,0 +1,154 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cudamem.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +typedef struct ucm_cudamem_func { + ucm_reloc_patch_t patch; + ucm_event_type_t event_type; +} ucm_cudamem_func_t; + +static ucm_cudamem_func_t ucm_cudamem_funcs[] = { + { {"cudaFree", ucm_override_cudaFree}, UCM_EVENT_CUDAFREE}, + { {NULL, NULL}, 0} +}; + +void ucm_cudamem_event_test_callback(ucm_event_type_t event_type, + ucm_event_t *event, void *arg) +{ + int *out_events = arg; + *out_events |= event_type; +} + +/* Called with lock held */ +static ucs_status_t ucm_cudamem_test(int events) +{ + static int installed_events = 0; + ucm_event_handler_t handler; + int out_events = 0; + void *p; + + if (ucs_test_all_flags(installed_events, events)) { + /* All requested events are already installed */ + return UCS_OK; + } + + /* Install a temporary event handler which will add the supported event + * type to out_events bitmap. + */ + handler.events = events; + handler.priority = -1; + handler.cb = ucm_cudamem_event_test_callback; + handler.arg = &out_events; + out_events = 0; + + ucm_event_handler_add(&handler); + + if (events & (UCM_EVENT_CUDAFREE)) { + if (cudaSuccess != cudaMalloc(&p, 64)) { + ucm_error("cudaMalloc failed"); + return UCS_ERR_UNSUPPORTED; + } + cudaFree(p); + } + + + ucm_event_handler_remove(&handler); + + /* TODO check address / stop all threads */ + installed_events |= out_events; + ucm_debug("cudamem test: got 0x%x out of 0x%x, total: 0x%x", out_events, events, + installed_events); + + /* Return success iff we caught all wanted events */ + if (!ucs_test_all_flags(out_events, events)) { + return UCS_ERR_UNSUPPORTED; + } + + return UCS_OK; +} + +/* Called with lock held */ +static ucs_status_t ucs_cudamem_install_reloc(int events) +{ + static int installed_events = 0; + ucm_cudamem_func_t *entry; + ucs_status_t status; + + if (!ucm_global_config.enable_cuda_hooks) { + ucm_debug("installing cudamem relocations is disabled by configuration"); + return UCS_ERR_UNSUPPORTED; + } + + for (entry = ucm_cudamem_funcs; entry->patch.symbol != NULL; ++entry) { + if (!(entry->event_type & events)) { + /* Not required */ + continue; + } + + if (entry->event_type & installed_events) { + /* Already installed */ + continue; + } + + ucm_debug("cudamem: installing relocation table entry for %s = %p for event 0x%x", + entry->patch.symbol, entry->patch.value, entry->event_type); + + status = ucm_reloc_modify(&entry->patch); + if (status != UCS_OK) { + ucm_warn("failed to install relocation table entry for '%s'", + entry->patch.symbol); + return status; + } + + installed_events |= entry->event_type; + } + + return UCS_OK; +} + +ucs_status_t ucm_cudamem_install(int events) +{ + static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER; + ucs_status_t status; + + pthread_mutex_lock(&install_mutex); + + status = ucm_cudamem_test(events); + if (status == UCS_OK) { + goto out_unlock; + } + + status = ucs_cudamem_install_reloc(events); + if (status != UCS_OK) { + ucm_debug("failed to install relocations for cudamem"); + goto out_unlock; + } + + status = ucm_cudamem_test(events); + +out_unlock: + pthread_mutex_unlock(&install_mutex); + return status; +} diff --git a/src/ucm/cuda/replace.c b/src/ucm/cuda/replace.c new file mode 100644 index 00000000000..a28d3291dfe --- /dev/null +++ b/src/ucm/cuda/replace.c @@ -0,0 +1,102 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cudamem.h" + +#include +#include +#include +#include +#include +#include +#include + + +#define MAP_FAILED ((void*)-1) + +static pthread_mutex_t ucm_cudamem_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +static pthread_t volatile ucm_cudamem_get_orig_thread = -1; + + +/** + * Define a replacement function to a memory-mapping function call, which calls + * the event handler, and if event handler returns error code - calls the original + * function. + */ +#define UCM_DEFINE_CUDA_FUNC(_name, _rettype, _fail_val, ...) \ + \ + _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)); \ + \ + /* Call the original function using dlsym(RTLD_NEXT) */ \ + _rettype ucm_orig_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ + { \ + typedef _rettype (*func_ptr_t) (__VA_ARGS__); \ + static func_ptr_t orig_func_ptr = NULL; \ + \ + ucm_trace("%s()", __FUNCTION__); \ + \ + if (ucs_unlikely(orig_func_ptr == NULL)) { \ + pthread_mutex_lock(&ucm_cudamem_get_orig_lock); \ + ucm_cudamem_get_orig_thread = pthread_self(); \ + orig_func_ptr = ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \ + ucm_override_##_name); \ + ucm_cudamem_get_orig_thread = -1; \ + pthread_mutex_unlock(&ucm_cudamem_get_orig_lock); \ + } \ + return orig_func_ptr(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ + } \ + \ + /* Define a symbol which goes to the replacement - in case we are loaded first */ \ + _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ + { \ + ucm_trace("%s()", __FUNCTION__); \ + \ + if (ucs_unlikely(ucm_cudamem_get_orig_thread == pthread_self())) { \ + return _fail_val; \ + } \ + return ucm_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ + } + +#define UCM_OVERRIDE_CUDA_FUNC(_name) \ + cudaError_t _name() __attribute__ ((alias ("ucm_override_" UCS_PP_QUOTE(_name)))); \ + + +/* + * Define argument list with given types. + */ +#define UCM_FUNC_DEFINE_ARGS(...) \ + UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_DEFINE, _, \ + UCS_PP_ZIP((UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))), \ + (__VA_ARGS__))) + +/* + * Pass auto-generated arguments to a function call. + */ +#define UCM_FUNC_PASS_ARGS(...) \ + UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_PASS, _, UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))) + + +/* + * Helpers + */ +#define _UCM_FUNC_ARG_DEFINE(_, _bundle) \ + __UCM_FUNC_ARG_DEFINE(_, UCS_PP_TUPLE_0 _bundle, UCS_PP_TUPLE_1 _bundle) +#define __UCM_FUNC_ARG_DEFINE(_, _index, _type) \ + _type UCS_PP_TOKENPASTE(arg, _index) +#define _UCM_FUNC_ARG_PASS(_, _index) \ + UCS_PP_TOKENPASTE(arg, _index) + + +UCM_DEFINE_CUDA_FUNC(cudaFree, cudaError_t, -1, void*) + +#if ENABLE_SYMBOL_OVERRIDE +UCM_OVERRIDE_CUDA_FUNC(cudaFree) +#endif + diff --git a/src/ucm/event/event.c b/src/ucm/event/event.c index f572dae6bbe..daeed27c5d4 100644 --- a/src/ucm/event/event.c +++ b/src/ucm/event/event.c @@ -13,6 +13,9 @@ #include #include #include +#if HAVE_CUDA +#include +#endif #include #include #include @@ -89,6 +92,13 @@ static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event, event->sbrk.result = ucm_orig_sbrk(event->sbrk.increment); } break; +#if HAVE_CUDA + case UCM_EVENT_CUDAFREE: + if (event->cudaFree.result == -1) { + event->cudaFree.result = ucm_orig_cudaFree(event->cudaFree.address); + } + break; +#endif default: ucm_warn("Got unknown event %d", event_type); break; @@ -102,7 +112,7 @@ static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event, static ucm_event_handler_t ucm_event_orig_handler = { .list = UCS_LIST_INITIALIZER(&ucm_event_handlers, &ucm_event_handlers), .events = UCM_EVENT_MMAP | UCM_EVENT_MUNMAP | UCM_EVENT_MREMAP | - UCM_EVENT_SHMAT | UCM_EVENT_SHMDT | UCM_EVENT_SBRK, /* All events */ + UCM_EVENT_SHMAT | UCM_EVENT_SHMDT | UCM_EVENT_SBRK | UCM_EVENT_CUDAFREE, /* All events */ .priority = 0, /* Between negative and positive handlers */ .cb = ucm_event_call_orig }; @@ -333,7 +343,26 @@ void *ucm_sbrk(intptr_t increment) return event.sbrk.result; } +#if HAVE_CUDA +cudaError_t ucm_cudaFree(void *addr) +{ + ucm_event_t event; + + ucm_event_enter(); + + ucm_trace("ucm_cudaFree(addr=%p )", addr); + + ucm_dispatch_vm_munmap(addr, 0); + event.cudaFree.result = -1; + event.cudaFree.address = addr; + ucm_event_dispatch(UCM_EVENT_CUDAFREE, &event); + + ucm_event_leave(); + + return event.cudaFree.result; +} +#endif void ucm_event_handler_add(ucm_event_handler_t *handler) { ucm_event_handler_t *elem; @@ -390,6 +419,17 @@ static ucs_status_t ucm_event_install(int events) } ucm_debug("malloc hooks are ready"); +#if HAVE_CUDA + + native_events = UCM_EVENT_CUDAFREE; + status = ucm_cudamem_install(native_events); + if (status != UCS_OK) { + ucm_debug("failed to install cudamem events"); + goto out_unlock; + } + ucm_debug("cudaFree hooks are ready"); +#endif + status = UCS_OK; out_unlock: diff --git a/src/ucm/util/ucm_config.c b/src/ucm/util/ucm_config.c index 2d9d2b43301..4035d86382a 100644 --- a/src/ucm/util/ucm_config.c +++ b/src/ucm/util/ucm_config.c @@ -20,6 +20,7 @@ #define UCM_EN_MMAP_RELOC_VAR "MMAP_RELOC" #define UCM_EN_MALLOC_HOOKS_VAR "MALLOC_HOOKS" #define UCM_EN_MALLOC_RELOC_VAR "MALLOC_RELOC" +#define UCM_EN_CUDA_HOOKS_VAR "CUDA_HOOKS" ucm_config_t ucm_global_config = { @@ -28,7 +29,10 @@ ucm_config_t ucm_global_config = { .enable_events = 1, .enable_mmap_reloc = 1, .enable_malloc_hooks = 1, - .enable_malloc_reloc = 0 + .enable_malloc_reloc = 0, +#if HAVE_CUDA + .enable_cuda_hooks = 1 +#endif }; static const char *ucm_config_bool_to_string(int value) @@ -107,6 +111,10 @@ void ucm_config_print(FILE *stream, ucs_config_print_flags_t print_flags) print_flags); fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_MALLOC_RELOC_VAR, ucm_config_bool_to_string(ucm_global_config.enable_malloc_reloc)); +#if HAVE_CUDA + fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_CUDA_HOOKS_VAR, + ucm_config_bool_to_string(ucm_global_config.enable_cuda_hooks)); +#endif } static void ucm_config_set_value_table(const char *str_value, const char **table, @@ -157,6 +165,10 @@ ucs_status_t ucm_config_modify(const char *name, const char *value) ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_hooks); } else if (!strcmp(name, UCM_EN_MALLOC_RELOC_VAR)) { ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_reloc); +#if HAVE_CUDA + } else if (!strcmp(name, UCM_EN_CUDA_HOOKS_VAR)) { + ucm_config_set_value_bool(value, &ucm_global_config.enable_cuda_hooks); +#endif } else { return UCS_ERR_INVALID_PARAM; } diff --git a/src/ucm/util/ucm_config.h b/src/ucm/util/ucm_config.h index 317608a57c6..bc42a600b12 100644 --- a/src/ucm/util/ucm_config.h +++ b/src/ucm/util/ucm_config.h @@ -18,6 +18,9 @@ typedef struct ucm_config { int enable_mmap_reloc; int enable_malloc_hooks; int enable_malloc_reloc; +#if HAVE_CUDA + int enable_cuda_hooks; +#endif size_t alloc_alignment; } ucm_config_t;