From fb26609742a670e905b758ab090373661ee8494e Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sat, 7 Aug 2021 03:36:55 +0300 Subject: [PATCH] UCM/BISTRO/TEST: Fix support for cuda memory hooks When a hooked function starts with an instruction that refers to global data (for example, "cmp $imm32, $disp32(%rip)") - we need to modify the instruction so it could be executed from its new location and still access the same global variable. Change Cuda hooks configuration so it can also support "bistro hooks without fallback to reloc" and test this mode in CI. --- contrib/test_jenkins.sh | 4 + src/ucm/api/ucm.h | 2 +- src/ucm/bistro/bistro_x86_64.c | 250 +++++++++++++++++++++++---------- src/ucm/cuda/cudamem.c | 30 ++-- src/ucm/util/sys.c | 6 +- src/ucs/config/ucm_opts.c | 12 +- 6 files changed, 209 insertions(+), 95 deletions(-) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index 9965630eeaf..e03e6da7ec9 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -1024,6 +1024,10 @@ test_malloc_hook() { ${cuda_dynamic_exe} -d [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d + # Test hooks in gtest + UCX_MEM_LOG_LEVEL=diag \ + ./test/gtest/gtest --gtest_filter='cuda_hooks.*' + unset UCX_MEM_CUDA_HOOK_MODE done fi diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h index 65ff4b29a3b..beccfc91492 100644 --- a/src/ucm/api/ucm.h +++ b/src/ucm/api/ucm.h @@ -213,7 +213,7 @@ typedef struct ucm_global_config { ucm_mmap_hook_mode_t mmap_hook_mode; /* MMAP hook mode */ int enable_malloc_hooks; /* Enable installing malloc hooks */ int enable_malloc_reloc; /* Enable installing malloc relocations */ - ucm_mmap_hook_mode_t cuda_hook_mode; /* Cuda hooks mode */ + int cuda_hook_modes; /* Bitmap of allowed cuda hooks modes */ int enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */ size_t alloc_alignment; /* Alignment for memory allocations */ int dlopen_process_rpath; /* Process RPATH section in dlopen hook */ diff --git a/src/ucm/bistro/bistro_x86_64.c 
b/src/ucm/bistro/bistro_x86_64.c index 6dd6fc36d60..408660da936 100644 --- a/src/ucm/bistro/bistro_x86_64.c +++ b/src/ucm/bistro/bistro_x86_64.c @@ -22,6 +22,7 @@ #include #include #include +#include typedef struct { @@ -35,6 +36,15 @@ typedef struct { int32_t displ; } UCS_S_PACKED ucm_bistro_jmp_indirect_t; +typedef struct { + uint8_t push_rax; + uint8_t movabs_rax[2]; + uint64_t rax_value; + uint8_t cmp_dptr_rax[2]; + uint32_t cmp_value; + uint8_t pop_rax; +} UCS_S_PACKED ucm_bistro_compare_xlt_t; + /* REX prefix */ #define UCM_BISTRO_X86_REX_MASK 0xF0 /* Mask */ @@ -87,109 +97,195 @@ typedef struct { #define UCM_BISTRO_X86_MODRM_CMP_RIP 0x3D /* 11 111 101 */ -/* - * Find the minimal length of initial instructions in the function which can be - * safely executed from any memory location. - * Uses a very simplified disassembler which supports only the typical - * instructions found in function prologue. - */ -static size_t ucm_bistro_detect_pic_prefix(const void *func, size_t min_length) +static ucs_status_t +ucm_bistro_relocate_one(void *dst, const void *src, size_t max_dst_length, + size_t *dst_length, size_t *src_length) { + const void *src_p = src; + ucm_bistro_compare_xlt_t cmp_xlt = { + .push_rax = 0x50, + .movabs_rax = {0x48, 0xb8}, + .cmp_dptr_rax = {0x81, 0x38}, + .pop_rax = 0x58 + }; uint8_t rex, opcode, modrm, mod; - size_t offset, prev_offset; - - offset = 0; - while (offset < min_length) { - prev_offset = offset; - opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); - - /* check for REX prefix */ - if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) { - rex = opcode; - opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); - } else { - rex = 0; + const void *copy_src; + int32_t disp32; + uint32_t imm32; + + /* Check opcode and REX prefix */ + opcode = *ucs_serialize_next(&src_p, const uint8_t); + if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) { + rex = opcode; + opcode = *ucs_serialize_next(&src_p, const uint8_t); + } 
else { + rex = 0; + } + + if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) && + ((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) { + /* push reg */ + goto out_copy_src; + } else if ((rex == UCM_BISTRO_X86_REX_W) && + (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) { + modrm = *ucs_serialize_next(&src_p, const uint8_t); + if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) { + /* sub $imm32, %rsp */ + ucs_serialize_next(&src_p, const uint32_t); + goto out_copy_src; + } + } else if ((rex == UCM_BISTRO_X86_REX_W) && + (opcode == UCM_BISTRO_X86_MOV_EV_GV)) { + modrm = *ucs_serialize_next(&src_p, const uint8_t); + mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT; + if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) { + /* mov %rsp, %rbp */ + goto out_copy_src; } - /* check the opcode */ - if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) && - ((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) { - continue; - } else if ((rex == UCM_BISTRO_X86_REX_W) && - (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) { - modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); - if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) { - /* sub $imm32, %rsp */ - offset += sizeof(uint32_t); - continue; - } - } else if ((rex == UCM_BISTRO_X86_REX_W) && - (opcode == UCM_BISTRO_X86_MOV_EV_GV)) { - modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); - if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) { - /* mov %rsp, %rbp */ - continue; + if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) && + ((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) == + UCM_BISTRO_X86_MODRM_RM_SIB)) { + /* r/m = 0b100, mod = 0b00/0b01/0b10 */ + ucs_serialize_next(&src_p, const uint8_t); /* skip SIB */ + if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) { + ucs_serialize_next(&src_p, const uint8_t); /* skip disp8 */ + goto out_copy_src; + } else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) { + ucs_serialize_next(&src_p, const uint32_t); /* skip disp32 */ + goto out_copy_src; } - mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT; - if ((mod != 
UCM_BISTRO_X86_MODRM_MOD_REG) && - ((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) == - UCM_BISTRO_X86_MODRM_RM_SIB)) { - /* r/m = 0b100, mod = 0b00/0b01/0b10 */ - ++offset; /* skip SIB */ - if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) { - offset += sizeof(uint8_t); /* skip disp8 */ - } else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) { - offset += sizeof(uint32_t); /* skip disp32 */ - } - continue; - } - } else if ((rex == 0) && - ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == UCM_BISTRO_X86_MOV_IR)) { - offset += sizeof(uint32_t); - continue; - } else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) { - modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); - if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) { - offset += sizeof(uint32_t) * 2; /* skip disp32 and imm32 */ - } - continue; } + } else if ((rex == 0) && ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == + UCM_BISTRO_X86_MOV_IR)) { + /* mov $imm32, %reg */ + ucs_serialize_next(&src_p, const uint32_t); + goto out_copy_src; + } else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) { + modrm = *ucs_serialize_next(&src_p, const uint8_t); + if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) { + /* + * Since we can't assume the new code will be within 32-bit + * range of the global variable argument, we need to translate + * the code from: + * cmpl $imm32, $disp32(%rip) + * to: + * push %rax + * movq $addr64, %rax ; $addr64 is $disp32+%rip + * cmpl $imm32, (%rax) + * pop %rax + */ + disp32 = *ucs_serialize_next(&src_p, const uint32_t); + imm32 = *ucs_serialize_next(&src_p, const uint32_t); + cmp_xlt.rax_value = (uintptr_t)UCS_PTR_BYTE_OFFSET(src_p, disp32); + cmp_xlt.cmp_value = imm32; + copy_src = &cmp_xlt; + *dst_length = sizeof(cmp_xlt); + goto out_copy; + } + } + + /* Could not recognize the instruction */ + return UCS_ERR_UNSUPPORTED; + +out_copy_src: + copy_src = src; + *dst_length = UCS_PTR_BYTE_DIFF(src, src_p); +out_copy: + if (*dst_length > max_dst_length) { + return UCS_ERR_BUFFER_TOO_SMALL; + } + + 
*src_length = UCS_PTR_BYTE_DIFF(src, src_p); + memcpy(dst, copy_src, *dst_length); + return UCS_OK; +} + +/* + * Relocate at least 'min_src_length' code instructions from 'src' to 'dst', + * possibly changing some of them to new instructions. + * Uses a simplified disassembler which supports only typical instructions + * found in function prologue. + */ +static ucs_status_t +ucm_bistro_relocate_code(void *dst, const void *src, size_t min_src_length, + size_t max_dst_length, size_t *dst_length, + size_t *src_length) +{ + size_t src_length_one, dst_length_one; + ucs_status_t status; + + *src_length = 0; + *dst_length = 0; + while (*src_length < min_src_length) { + status = ucm_bistro_relocate_one(UCS_PTR_BYTE_OFFSET(dst, *dst_length), + UCS_PTR_BYTE_OFFSET(src, *src_length), + max_dst_length - *dst_length, + &dst_length_one, &src_length_one); + if (status != UCS_OK) { + return status; + } + + *dst_length += dst_length_one; + *src_length += src_length_one; + } - /* unsupported instruction - bail */ - return prev_offset; + ucm_assert(*dst_length <= max_dst_length); + return UCS_OK; +} + +static const char * +ucm_bistro_dump_code(const void *code, size_t length, char *str, size_t max) +{ + const void *code_p = code; + char *p = str; + char *endp = str + max; + + while (code_p < UCS_PTR_BYTE_OFFSET(code, length)) { + snprintf(p, endp - p, " %02X", + *ucs_serialize_next(&code_p, const uint8_t)); + p += strlen(p); } - return offset; + return str; } static ucs_status_t ucm_bistro_construct_orig_func(const void *func_ptr, size_t patch_len, const char *symbol, void **orig_func_p) { + size_t code_len, prefix_len, max_code_len; ucm_bistro_jmp_indirect_t *jmp_back; ucm_bistro_orig_func_t *orig_func; - size_t prefix_len, code_size; - - prefix_len = ucm_bistro_detect_pic_prefix(func_ptr, patch_len); - ucm_debug("'%s' at %p prefix length %zu/%zu", symbol, func_ptr, prefix_len, - patch_len); - if (prefix_len < patch_len) { - return UCS_ERR_UNSUPPORTED; - } + ucs_status_t status; + 
char code_buf[64]; /* Allocate executable page */ - code_size = sizeof(*orig_func) + patch_len + sizeof(*jmp_back); - orig_func = ucm_bistro_allocate_code(code_size); + max_code_len = patch_len + sizeof(ucm_bistro_compare_xlt_t); + orig_func = ucm_bistro_allocate_code(sizeof(*orig_func) + max_code_len + + sizeof(*jmp_back)); if (orig_func == NULL) { return UCS_ERR_NO_MEMORY; } - /* Copy code fragment from original function */ - memcpy(orig_func->code, func_ptr, prefix_len); + /* Copy and translate code from 'func_ptr' to 'orig_func->code'. + 'code_len' is the code size at destination buffer, and 'prefix_len' is + how many bytes were translated from 'func_ptr'. */ + status = ucm_bistro_relocate_code(orig_func->code, func_ptr, patch_len, + max_code_len, &code_len, &prefix_len); + if (status != UCS_OK) { + ucm_diag("'%s' could not patch by bistro, code:%s", symbol, + ucm_bistro_dump_code(func_ptr, 16, code_buf, + sizeof(code_buf))); + return UCS_ERR_UNSUPPORTED; + } + + ucm_debug("'%s' at %p code length %zu/%zu prefix length %zu", symbol, + func_ptr, code_len, patch_len, prefix_len); /* Indirect jump to *orig_func->jmp_address */ orig_func->jmp_addr = UCS_PTR_BYTE_OFFSET(func_ptr, prefix_len); - jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, prefix_len); + jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, code_len); jmp_back->opcode = 0xff; jmp_back->modrm = 0x25; jmp_back->displ = UCS_PTR_BYTE_DIFF(jmp_back + 1, &orig_func->jmp_addr); diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c index dfab90b886f..f3e33a62b8b 100644 --- a/src/ucm/cuda/cudamem.c +++ b/src/ucm/cuda/cudamem.c @@ -206,9 +206,10 @@ static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = { {{NULL}, NULL} }; -static ucm_mmap_hook_mode_t ucm_cuda_hook_mode() +static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode) { - return ucm_get_hook_mode(ucm_global_opts.cuda_hook_mode); + return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) && + (ucm_get_hook_mode(mode) == mode); } static 
ucs_status_t @@ -231,7 +232,7 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc, status = UCS_ERR_UNSUPPORTED; - if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_BISTRO) { + if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) { status = ucm_bistro_patch(func_ptr, func->patch.value, func->patch.symbol, func->orig_func_ptr, NULL); @@ -242,19 +243,24 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc, continue; } - ucm_debug("failed to install bistro hook for '%s', trying reloc", + ucm_debug("failed to install bistro hook for '%s'", func->patch.symbol); } - status = ucm_reloc_modify(&func->patch); - if (status != UCS_OK) { - ucm_diag("failed to install relocation table entry for '%s'", - func->patch.symbol); - return status; + if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) { + status = ucm_reloc_modify(&func->patch); + if (status == UCS_OK) { + ++num_reloc; + ucm_trace("installed reloc hook on '%s'", func->patch.symbol); + continue; + } + + ucm_debug("failed to install relocation table hook for '%s'", + func->patch.symbol); } - ++num_reloc; - ucm_trace("installed reloc hook on '%s'", func->patch.symbol); + ucm_diag("failed to install hook for '%s'", func->patch.symbol); + return status; } *used_reloc = num_reloc > 0; @@ -274,7 +280,7 @@ static ucs_status_t ucm_cudamem_install(int events) goto out; } - if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_NONE) { + if (ucm_global_opts.cuda_hook_modes == 0) { ucm_info("cuda memory hooks are disabled by configuration"); status = UCS_ERR_UNSUPPORTED; goto out; diff --git a/src/ucm/util/sys.c b/src/ucm/util/sys.c index 9f0bcacdffb..32e07371f2f 100644 --- a/src/ucm/util/sys.c +++ b/src/ucm/util/sys.c @@ -38,7 +38,11 @@ ucm_global_config_t ucm_global_opts = { .mmap_hook_mode = UCM_DEFAULT_HOOK_MODE, .enable_malloc_hooks = 1, .enable_malloc_reloc = 0, - .cuda_hook_mode = UCM_DEFAULT_HOOK_MODE, + .cuda_hook_modes = +#if UCM_BISTRO_HOOKS + UCS_BIT(UCM_MMAP_HOOK_BISTRO) | +#endif + UCS_BIT(UCM_MMAP_HOOK_RELOC), 
.enable_dynamic_mmap_thresh = 1, .alloc_alignment = 16, .dlopen_process_rpath = 1 diff --git a/src/ucs/config/ucm_opts.c b/src/ucs/config/ucm_opts.c index bee035a4a9c..89cccc48723 100644 --- a/src/ucs/config/ucm_opts.c +++ b/src/ucs/config/ucm_opts.c @@ -69,8 +69,12 @@ static ucs_config_field_t ucm_global_config_table[] = { "which would use the original implementation and not ours.", ucs_offsetof(ucm_global_config_t, enable_malloc_reloc), UCS_CONFIG_TYPE_BOOL}, - {"CUDA_HOOK_MODE", UCM_DEFAULT_HOOK_MODE_STR, - "Cuda memory hook mode\n" + {"CUDA_HOOK_MODE", +#if UCM_BISTRO_HOOKS + UCM_MMAP_HOOK_BISTRO_STR "," +#endif + UCM_MMAP_HOOK_RELOC_STR, + "Cuda memory hook modes. A combination of:\n" " none - Don't set Cuda hooks.\n" " reloc - Use ELF relocation table to set hooks. In this mode, if any\n" " part of the application is linked with Cuda runtime statically,\n" @@ -81,8 +85,8 @@ static ucs_config_field_t ucm_global_config_table[] = { " Cuda driver APIs, so memory events are reported properly even\n" " for statically-linked applications." #endif - ,ucs_offsetof(ucm_global_config_t, cuda_hook_mode), - UCS_CONFIG_TYPE_ENUM(ucm_mmap_hook_modes)}, + ,ucs_offsetof(ucm_global_config_t, cuda_hook_modes), + UCS_CONFIG_TYPE_BITMAP(ucm_mmap_hook_modes)}, {"CUDA_RELOC", "yes", "The configuration parameter replaced by UCX_MEM_CUDA_HOOK_MODE",