diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh
index 9965630eeaf..e03e6da7ec9 100755
--- a/contrib/test_jenkins.sh
+++ b/contrib/test_jenkins.sh
@@ -1024,6 +1024,10 @@ test_malloc_hook() {
 			${cuda_dynamic_exe} -d
 			[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d
 
+			# Test hooks in gtest
+			UCX_MEM_LOG_LEVEL=diag \
+				./test/gtest/gtest --gtest_filter='cuda_hooks.*'
+
 			unset UCX_MEM_CUDA_HOOK_MODE
 		done
 	fi
diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h
index 65ff4b29a3b..beccfc91492 100644
--- a/src/ucm/api/ucm.h
+++ b/src/ucm/api/ucm.h
@@ -213,7 +213,7 @@ typedef struct ucm_global_config {
     ucm_mmap_hook_mode_t mmap_hook_mode;             /* MMAP hook mode */
     int                  enable_malloc_hooks;        /* Enable installing malloc hooks */
     int                  enable_malloc_reloc;        /* Enable installing malloc relocations */
-    ucm_mmap_hook_mode_t cuda_hook_mode;             /* Cuda hooks mode */
+    int                  cuda_hook_modes;            /* Bitmap of allowed cuda hooks modes */
     int                  enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */
     size_t               alloc_alignment;            /* Alignment for memory allocations */
     int                  dlopen_process_rpath;       /* Process RPATH section in dlopen hook */
diff --git a/src/ucm/bistro/bistro_x86_64.c b/src/ucm/bistro/bistro_x86_64.c
index 6dd6fc36d60..408660da936 100644
--- a/src/ucm/bistro/bistro_x86_64.c
+++ b/src/ucm/bistro/bistro_x86_64.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 
 
 typedef struct {
@@ -35,6 +36,15 @@ typedef struct {
     int32_t displ;
 } UCS_S_PACKED ucm_bistro_jmp_indirect_t;
 
+typedef struct {
+    uint8_t  push_rax;
+    uint8_t  movabs_rax[2];
+    uint64_t rax_value;
+    uint8_t  cmp_dptr_rax[2];
+    uint32_t cmp_value;
+    uint8_t  pop_rax;
+} UCS_S_PACKED ucm_bistro_compare_xlt_t;
+
 
 /* REX prefix */
 #define UCM_BISTRO_X86_REX_MASK 0xF0 /* Mask */
@@ -87,109 +97,214 @@ typedef struct {
 #define UCM_BISTRO_X86_MODRM_CMP_RIP 0x3D /* 11 111 101 */
 
 
-/*
- * Find the minimal length of initial instructions in the function which can be
- * safely executed from any memory location.
- * Uses a very simplified disassembler which supports only the typical
- * instructions found in function prologue.
- */
-static size_t ucm_bistro_detect_pic_prefix(const void *func, size_t min_length)
+/*
+ * Relocate a single instruction from 'src' to 'dst', translating it to an
+ * equivalent sequence if it cannot be executed as-is from another address.
+ * On success, '*src_length' is set to the number of bytes consumed from 'src'
+ * and '*dst_length' to the number of bytes written to 'dst'.
+ * Returns UCS_ERR_UNSUPPORTED if the instruction is not recognized, and
+ * UCS_ERR_BUFFER_TOO_SMALL if the result is longer than 'max_dst_length'.
+ */
+static ucs_status_t
+ucm_bistro_relocate_one(void *dst, const void *src, size_t max_dst_length,
+                        size_t *dst_length, size_t *src_length)
 {
+    const void *src_p = src;
+    ucm_bistro_compare_xlt_t cmp_xlt = {
+        .push_rax     = 0x50,
+        .movabs_rax   = {0x48, 0xb8},
+        .cmp_dptr_rax = {0x81, 0x38},
+        .pop_rax      = 0x58
+    };
     uint8_t rex, opcode, modrm, mod;
-    size_t offset, prev_offset;
-
-    offset = 0;
-    while (offset < min_length) {
-        prev_offset = offset;
-        opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
-
-        /* check for REX prefix */
-        if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
-            rex = opcode;
-            opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
-        } else {
-            rex = 0;
+    const void *copy_src;
+    int32_t disp32;
+    uint32_t imm32;
+
+    /* Check opcode and REX prefix */
+    opcode = *ucs_serialize_next(&src_p, const uint8_t);
+    if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
+        rex = opcode;
+        opcode = *ucs_serialize_next(&src_p, const uint8_t);
+    } else {
+        rex = 0;
+    }
+
+    if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
+        ((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
+        /* push reg */
+        goto out_copy_src;
+    } else if ((rex == UCM_BISTRO_X86_REX_W) &&
+               (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
+        modrm = *ucs_serialize_next(&src_p, const uint8_t);
+        if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
+            /* sub $imm32, %rsp */
+            ucs_serialize_next(&src_p, const uint32_t);
+            goto out_copy_src;
+        }
+    } else if ((rex == UCM_BISTRO_X86_REX_W) &&
+               (opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
+        modrm = *ucs_serialize_next(&src_p, const uint8_t);
+        mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
+        if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
+            /* mov %rsp, %rbp */
+            goto out_copy_src;
         }
 
-        /* check the opcode */
-        if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
-            ((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
-            continue;
-        } else if ((rex == UCM_BISTRO_X86_REX_W) &&
-                   (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
-            modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
-            if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
-                /* sub $imm32, %rsp */
-                offset += sizeof(uint32_t);
-                continue;
-            }
-        } else if ((rex == UCM_BISTRO_X86_REX_W) &&
-                   (opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
-            modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
-            if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
-                /* mov %rsp, %rbp */
-                continue;
+        if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
+            ((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
+             UCM_BISTRO_X86_MODRM_RM_SIB)) {
+            /* r/m = 0b100, mod = 0b00/0b01/0b10 */
+            ucs_serialize_next(&src_p, const uint8_t); /* skip SIB */
+            if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
+                ucs_serialize_next(&src_p, const uint8_t); /* skip disp8 */
+                goto out_copy_src;
+            } else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
+                ucs_serialize_next(&src_p, const uint32_t); /* skip disp32 */
+                goto out_copy_src;
             }
-            mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
-            if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
-                ((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
-                 UCM_BISTRO_X86_MODRM_RM_SIB)) {
-                /* r/m = 0b100, mod = 0b00/0b01/0b10 */
-                ++offset; /* skip SIB */
-                if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
-                    offset += sizeof(uint8_t); /* skip disp8 */
-                } else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
-                    offset += sizeof(uint32_t); /* skip disp32 */
-                }
-                continue;
-            }
-        } else if ((rex == 0) &&
-                   ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == UCM_BISTRO_X86_MOV_IR)) {
-            offset += sizeof(uint32_t);
-            continue;
-        } else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
-            modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
-            if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
-                offset += sizeof(uint32_t) * 2; /* skip disp32 and imm32 */
-            }
-            continue;
         }
+    } else if ((rex == 0) && ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) ==
+                              UCM_BISTRO_X86_MOV_IR)) {
+        /* mov $imm32, %reg */
+        ucs_serialize_next(&src_p, const uint32_t);
+        goto out_copy_src;
+    } else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
+        modrm = *ucs_serialize_next(&src_p, const uint8_t);
+        if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
+            /*
+             * Since we can't assume the new code will be within 32-bit
+             * range of the global variable argument, we need to translate
+             * the code from:
+             *   cmpl $imm32, $disp32(%rip)
+             * to:
+             *   push %rax
+             *   movq $addr64, %rax ; $addr64 is $disp32+%rip
+             *   cmpl $imm32, (%rax)
+             *   pop %rax
+             */
+            disp32 = *ucs_serialize_next(&src_p, const int32_t); /* signed */
+            imm32 = *ucs_serialize_next(&src_p, const uint32_t);
+            cmp_xlt.rax_value = (uintptr_t)UCS_PTR_BYTE_OFFSET(src_p, disp32);
+            cmp_xlt.cmp_value = imm32;
+            copy_src = &cmp_xlt;
+            *dst_length = sizeof(cmp_xlt);
+            goto out_copy;
+        }
+    }
+
+    /* Could not recognize the instruction */
+    return UCS_ERR_UNSUPPORTED;
+
+out_copy_src:
+    copy_src = src;
+    *dst_length = UCS_PTR_BYTE_DIFF(src, src_p);
+out_copy:
+    if (*dst_length > max_dst_length) {
+        return UCS_ERR_BUFFER_TOO_SMALL;
+    }
+
+    *src_length = UCS_PTR_BYTE_DIFF(src, src_p);
+    memcpy(dst, copy_src, *dst_length);
+    return UCS_OK;
+}
+
+/*
+ * Relocate at least 'min_src_length' code instructions from 'src' to 'dst',
+ * possibly changing some of them to new instructions.
+ * Uses a simplified disassembler which supports only typical instructions
+ * found in function prologue.
+ */
+static ucs_status_t
+ucm_bistro_relocate_code(void *dst, const void *src, size_t min_src_length,
+                         size_t max_dst_length, size_t *dst_length,
+                         size_t *src_length)
+{
+    size_t src_length_one, dst_length_one;
+    ucs_status_t status;
+
+    *src_length = 0;
+    *dst_length = 0;
+    while (*src_length < min_src_length) {
+        status = ucm_bistro_relocate_one(UCS_PTR_BYTE_OFFSET(dst, *dst_length),
+                                         UCS_PTR_BYTE_OFFSET(src, *src_length),
+                                         max_dst_length - *dst_length,
+                                         &dst_length_one, &src_length_one);
+        if (status != UCS_OK) {
+            return status;
+        }
+
+        *dst_length += dst_length_one;
+        *src_length += src_length_one;
+    }
 
-        /* unsupported instruction - bail */
-        return prev_offset;
+    ucm_assert(*dst_length <= max_dst_length);
+    return UCS_OK;
+}
+
+/*
+ * Print 'length' bytes starting at 'code' as hexadecimal values into 'str',
+ * whose size is 'max'. The output is truncated if it does not fit.
+ * Returns 'str'.
+ */
+static const char *
+ucm_bistro_dump_code(const void *code, size_t length, char *str, size_t max)
+{
+    const void *code_p = code;
+    char *p = str;
+    char *endp = str + max;
+
+    /* Produce a valid (empty) string even if nothing gets printed */
+    if (max > 0) {
+        *str = '\0';
+    }
+
+    /* Stop printing once only the null terminator fits in the buffer */
+    while ((code_p < UCS_PTR_BYTE_OFFSET(code, length)) && ((endp - p) > 1)) {
+        snprintf(p, endp - p, " %02X",
+                 *ucs_serialize_next(&code_p, const uint8_t));
+        p += strlen(p);
     }
 
-    return offset;
+    return str;
 }
 
 static ucs_status_t
 ucm_bistro_construct_orig_func(const void *func_ptr, size_t patch_len,
                                const char *symbol, void **orig_func_p)
 {
+    size_t code_len, prefix_len, max_code_len;
     ucm_bistro_jmp_indirect_t *jmp_back;
     ucm_bistro_orig_func_t *orig_func;
-    size_t prefix_len, code_size;
-
-    prefix_len = ucm_bistro_detect_pic_prefix(func_ptr, patch_len);
-    ucm_debug("'%s' at %p prefix length %zu/%zu", symbol, func_ptr, prefix_len,
-              patch_len);
-    if (prefix_len < patch_len) {
-        return UCS_ERR_UNSUPPORTED;
-    }
+    ucs_status_t status;
+    char code_buf[64];
 
     /* Allocate executable page */
-    code_size = sizeof(*orig_func) + patch_len + sizeof(*jmp_back);
-    orig_func = ucm_bistro_allocate_code(code_size);
+    max_code_len = patch_len + sizeof(ucm_bistro_compare_xlt_t);
+    orig_func = ucm_bistro_allocate_code(sizeof(*orig_func) + max_code_len +
+                                         sizeof(*jmp_back));
     if (orig_func == NULL) {
         return UCS_ERR_NO_MEMORY;
     }
 
-    /* Copy code fragment from original function */
-    memcpy(orig_func->code, func_ptr, prefix_len);
+    /* Copy and translate code from 'func_ptr' to 'orig_func->code'.
+       'code_len' is the code size at destination buffer, and 'prefix_len' is
+       how many bytes were translated from 'func_ptr'. */
+    status = ucm_bistro_relocate_code(orig_func->code, func_ptr, patch_len,
+                                      max_code_len, &code_len, &prefix_len);
+    if (status != UCS_OK) {
+        ucm_diag("'%s' could not patch by bistro, code:%s", symbol,
+                 ucm_bistro_dump_code(func_ptr, 16, code_buf,
+                                      sizeof(code_buf)));
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    ucm_debug("'%s' at %p code length %zu/%zu prefix length %zu", symbol,
+              func_ptr, code_len, patch_len, prefix_len);
 
     /* Indirect jump to *orig_func->jmp_address */
     orig_func->jmp_addr = UCS_PTR_BYTE_OFFSET(func_ptr, prefix_len);
-    jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, prefix_len);
+    jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, code_len);
     jmp_back->opcode = 0xff;
     jmp_back->modrm = 0x25;
     jmp_back->displ = UCS_PTR_BYTE_DIFF(jmp_back + 1, &orig_func->jmp_addr);
diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c
index dfab90b886f..f3e33a62b8b 100644
--- a/src/ucm/cuda/cudamem.c
+++ b/src/ucm/cuda/cudamem.c
@@ -206,9 +206,14 @@ static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
     {{NULL}, NULL}
 };
 
-static ucm_mmap_hook_mode_t ucm_cuda_hook_mode()
+/*
+ * Check if hook mode 'mode' can be used for Cuda hooks: its bit must be set in
+ * 'cuda_hook_modes', and ucm_get_hook_mode() must return it unchanged.
+ */
+static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
 {
-    return ucm_get_hook_mode(ucm_global_opts.cuda_hook_mode);
+    return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
+           (ucm_get_hook_mode(mode) == mode);
 }
 
 static ucs_status_t
@@ -231,7 +236,7 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
 
         status = UCS_ERR_UNSUPPORTED;
 
-        if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_BISTRO) {
+        if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
             status = ucm_bistro_patch(func_ptr, func->patch.value,
                                       func->patch.symbol, func->orig_func_ptr,
                                       NULL);
@@ -242,19 +247,24 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
                 continue;
             }
 
-            ucm_debug("failed to install bistro hook for '%s', trying reloc",
+            ucm_debug("failed to install bistro hook for '%s'",
                       func->patch.symbol);
         }
 
-        status = ucm_reloc_modify(&func->patch);
-        if (status != UCS_OK) {
-            ucm_diag("failed to install relocation table entry for '%s'",
-                     func->patch.symbol);
-            return status;
+        if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
+            status = ucm_reloc_modify(&func->patch);
+            if (status == UCS_OK) {
+                ++num_reloc;
+                ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
+                continue;
+            }
+
+            ucm_debug("failed to install relocation table hook for '%s'",
+                      func->patch.symbol);
         }
 
-        ++num_reloc;
-        ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
+        ucm_diag("failed to install hook for '%s'", func->patch.symbol);
+        return status;
     }
 
     *used_reloc = num_reloc > 0;
@@ -274,7 +284,8 @@ static ucs_status_t ucm_cudamem_install(int events)
         goto out;
     }
 
-    if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_NONE) {
+    /* "none" sets only the UCM_MMAP_HOOK_NONE bit, which enables no hooks */
+    if ((ucm_global_opts.cuda_hook_modes & ~UCS_BIT(UCM_MMAP_HOOK_NONE)) == 0) {
         ucm_info("cuda memory hooks are disabled by configuration");
         status = UCS_ERR_UNSUPPORTED;
         goto out;
diff --git a/src/ucm/util/sys.c b/src/ucm/util/sys.c
index 9f0bcacdffb..32e07371f2f 100644
--- a/src/ucm/util/sys.c
+++ b/src/ucm/util/sys.c
@@ -38,7 +38,11 @@ ucm_global_config_t ucm_global_opts = {
     .mmap_hook_mode             = UCM_DEFAULT_HOOK_MODE,
     .enable_malloc_hooks        = 1,
     .enable_malloc_reloc        = 0,
-    .cuda_hook_mode             = UCM_DEFAULT_HOOK_MODE,
+    .cuda_hook_modes            =
+#if UCM_BISTRO_HOOKS
+                                  UCS_BIT(UCM_MMAP_HOOK_BISTRO) |
+#endif
+                                  UCS_BIT(UCM_MMAP_HOOK_RELOC),
     .enable_dynamic_mmap_thresh = 1,
     .alloc_alignment            = 16,
     .dlopen_process_rpath       = 1
diff --git a/src/ucs/config/ucm_opts.c b/src/ucs/config/ucm_opts.c
index bee035a4a9c..89cccc48723 100644
--- a/src/ucs/config/ucm_opts.c
+++ b/src/ucs/config/ucm_opts.c
@@ -69,8 +69,12 @@ static ucs_config_field_t ucm_global_config_table[] = {
    "which would use the original implementation and not ours.",
    ucs_offsetof(ucm_global_config_t, enable_malloc_reloc), UCS_CONFIG_TYPE_BOOL},
 
-  {"CUDA_HOOK_MODE", UCM_DEFAULT_HOOK_MODE_STR,
-   "Cuda memory hook mode\n"
+  {"CUDA_HOOK_MODE",
+#if UCM_BISTRO_HOOKS
+   UCM_MMAP_HOOK_BISTRO_STR ","
+#endif
+   UCM_MMAP_HOOK_RELOC_STR,
+   "Cuda memory hook modes. A combination of:\n"
    " none - Don't set Cuda hooks.\n"
    " reloc - Use ELF relocation table to set hooks. In this mode, if any\n"
    " part of the application is linked with Cuda runtime statically,\n"
@@ -81,8 +85,8 @@ static ucs_config_field_t ucm_global_config_table[] = {
    " Cuda driver APIs, so memory events are reported properly even\n"
    " for statically-linked applications."
 #endif
-   ,ucs_offsetof(ucm_global_config_t, cuda_hook_mode),
-   UCS_CONFIG_TYPE_ENUM(ucm_mmap_hook_modes)},
+   ,ucs_offsetof(ucm_global_config_t, cuda_hook_modes),
+   UCS_CONFIG_TYPE_BITMAP(ucm_mmap_hook_modes)},
 
   {"CUDA_RELOC", "yes",
    "The configuration parameter replaced by UCX_MEM_CUDA_HOOK_MODE",