Skip to content

Commit

Permalink
UCM/BISTRO/TEST: Fix support for cuda memory hooks
Browse files Browse the repository at this point in the history
When a hooked function starts with an instruction that refers to global data
(for example, "cmp $imm32, $disp32(%rip)") - we need to modify the
instruction so it can be executed from its new location and still
access the same global variable.

Change Cuda hooks configuration so it can also support "bistro hooks
without fallback to reloc" and test this mode in CI.
  • Loading branch information
yosefe committed Aug 9, 2021
1 parent 7057cb3 commit fb26609
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 95 deletions.
4 changes: 4 additions & 0 deletions contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,10 @@ test_malloc_hook() {
${cuda_dynamic_exe} -d
[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d

# Test hooks in gtest
UCX_MEM_LOG_LEVEL=diag \
./test/gtest/gtest --gtest_filter='cuda_hooks.*'

unset UCX_MEM_CUDA_HOOK_MODE
done
fi
Expand Down
2 changes: 1 addition & 1 deletion src/ucm/api/ucm.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ typedef struct ucm_global_config {
ucm_mmap_hook_mode_t mmap_hook_mode; /* MMAP hook mode */
int enable_malloc_hooks; /* Enable installing malloc hooks */
int enable_malloc_reloc; /* Enable installing malloc relocations */
ucm_mmap_hook_mode_t cuda_hook_mode; /* Cuda hooks mode */
int cuda_hook_modes; /* Bitmap of allowed cuda hooks modes */
int enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */
size_t alloc_alignment; /* Alignment for memory allocations */
int dlopen_process_rpath; /* Process RPATH section in dlopen hook */
Expand Down
250 changes: 173 additions & 77 deletions src/ucm/bistro/bistro_x86_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <ucm/bistro/bistro_int.h>
#include <ucm/util/sys.h>
#include <ucs/sys/math.h>
#include <ucs/type/serialize.h>


typedef struct {
Expand All @@ -35,6 +36,15 @@ typedef struct {
int32_t displ;
} UCS_S_PACKED ucm_bistro_jmp_indirect_t;

/*
 * Byte layout of the code sequence which replaces a RIP-relative
 * "cmpl $imm32, $disp32(%rip)" instruction when it is relocated:
 *     push %rax
 *     movq $addr64, %rax
 *     cmpl $imm32, (%rax)
 *     pop  %rax
 * An instance of this structure is filled in and then copied as raw bytes
 * into the destination code buffer, so the field order and sizes must match
 * the instruction encoding exactly (hence UCS_S_PACKED).
 */
typedef struct {
uint8_t push_rax; /* opcode of "push %rax" */
uint8_t movabs_rax[2]; /* opcode bytes of "movq $addr64, %rax" */
uint64_t rax_value; /* $addr64 - absolute address of the compared variable */
uint8_t cmp_dptr_rax[2]; /* opcode bytes of "cmpl $imm32, (%rax)" */
uint32_t cmp_value; /* $imm32 - taken from the original instruction */
uint8_t pop_rax; /* opcode of "pop %rax" */
} UCS_S_PACKED ucm_bistro_compare_xlt_t;


/* REX prefix */
#define UCM_BISTRO_X86_REX_MASK 0xF0 /* Mask */
Expand Down Expand Up @@ -87,109 +97,195 @@ typedef struct {
#define UCM_BISTRO_X86_MODRM_CMP_RIP 0x3D /* 11 111 101 */


/*
* Find the minimal length of initial instructions in the function which can be
* safely executed from any memory location.
* Uses a very simplified disassembler which supports only the typical
* instructions found in function prologue.
*/
static size_t ucm_bistro_detect_pic_prefix(const void *func, size_t min_length)
static ucs_status_t
ucm_bistro_relocate_one(void *dst, const void *src, size_t max_dst_length,
size_t *dst_length, size_t *src_length)
{
const void *src_p = src;
ucm_bistro_compare_xlt_t cmp_xlt = {
.push_rax = 0x50,
.movabs_rax = {0x48, 0xb8},
.cmp_dptr_rax = {0x81, 0x38},
.pop_rax = 0x58
};
uint8_t rex, opcode, modrm, mod;
size_t offset, prev_offset;

offset = 0;
while (offset < min_length) {
prev_offset = offset;
opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);

/* check for REX prefix */
if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
rex = opcode;
opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
} else {
rex = 0;
const void *copy_src;
int32_t disp32;
uint32_t imm32;

/* Check opcode and REX prefix */
opcode = *ucs_serialize_next(&src_p, const uint8_t);
if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
rex = opcode;
opcode = *ucs_serialize_next(&src_p, const uint8_t);
} else {
rex = 0;
}

if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
/* push reg */
goto out_copy_src;
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
/* sub $imm32, %rsp */
ucs_serialize_next(&src_p, const uint32_t);
goto out_copy_src;
}
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
/* mov %rsp, %rbp */
goto out_copy_src;
}

/* check the opcode */
if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
continue;
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
/* sub $imm32, %rsp */
offset += sizeof(uint32_t);
continue;
}
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
/* mov %rsp, %rbp */
continue;
if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
UCM_BISTRO_X86_MODRM_RM_SIB)) {
/* r/m = 0b100, mod = 0b00/0b01/0b10 */
ucs_serialize_next(&src_p, const uint8_t); /* skip SIB */
if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
ucs_serialize_next(&src_p, const uint8_t); /* skip disp8 */
goto out_copy_src;
} else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
ucs_serialize_next(&src_p, const uint32_t); /* skip disp32 */
goto out_copy_src;
}
mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
UCM_BISTRO_X86_MODRM_RM_SIB)) {
/* r/m = 0b100, mod = 0b00/0b01/0b10 */
++offset; /* skip SIB */
if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
offset += sizeof(uint8_t); /* skip disp8 */
} else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
offset += sizeof(uint32_t); /* skip disp32 */
}
continue;
}
} else if ((rex == 0) &&
((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == UCM_BISTRO_X86_MOV_IR)) {
offset += sizeof(uint32_t);
continue;
} else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
offset += sizeof(uint32_t) * 2; /* skip disp32 and imm32 */
}
continue;
}
} else if ((rex == 0) && ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) ==
UCM_BISTRO_X86_MOV_IR)) {
/* mov $imm32, %reg */
ucs_serialize_next(&src_p, const uint32_t);
goto out_copy_src;
} else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
/*
* Since we can't assume the new code will be within 32-bit
* range of the global variable argument, we need to translate
* the code from:
* cmpl $imm32, $disp32(%rip)
* to:
* push %rax
* movq $addr64, %rax ; $addr64 is $disp32+%rip
* cmpl $imm32, (%rax)
* pop %rax
*/
disp32 = *ucs_serialize_next(&src_p, const uint32_t);
imm32 = *ucs_serialize_next(&src_p, const uint32_t);
cmp_xlt.rax_value = (uintptr_t)UCS_PTR_BYTE_OFFSET(src_p, disp32);
cmp_xlt.cmp_value = imm32;
copy_src = &cmp_xlt;
*dst_length = sizeof(cmp_xlt);
goto out_copy;
}
}

/* Could not recognize the instruction */
return UCS_ERR_UNSUPPORTED;

out_copy_src:
copy_src = src;
*dst_length = UCS_PTR_BYTE_DIFF(src, src_p);
out_copy:
if (*dst_length > max_dst_length) {
return UCS_ERR_BUFFER_TOO_SMALL;
}

*src_length = UCS_PTR_BYTE_DIFF(src, src_p);
memcpy(dst, copy_src, *dst_length);
return UCS_OK;
}

/*
 * Relocate at least 'min_src_length' code instructions from 'src' to 'dst',
 * possibly changing some of them to new instructions.
 * Uses a simplified disassembler which supports only typical instructions
 * found in function prologue.
 *
 * Instructions are processed one at a time by ucm_bistro_relocate_one(), until
 * the number of consumed source bytes reaches 'min_src_length'. Since whole
 * instructions are consumed, the final '*src_length' may be larger than
 * 'min_src_length'.
 *
 * dst            - Destination buffer for the relocated code.
 * src            - Start of the original code.
 * min_src_length - Minimal number of source bytes which must be relocated.
 * max_dst_length - Capacity of 'dst', in bytes.
 * dst_length     - Filled with the number of bytes written to 'dst'.
 * src_length     - Filled with the number of bytes consumed from 'src'.
 *
 * Returns UCS_OK on success. If relocating any single instruction fails, its
 * error status is returned as-is, and '*dst_length' / '*src_length' hold the
 * totals of the instructions which were relocated before the failure.
 */
static ucs_status_t
ucm_bistro_relocate_code(void *dst, const void *src, size_t min_src_length,
size_t max_dst_length, size_t *dst_length,
size_t *src_length)
{
size_t src_length_one, dst_length_one;
ucs_status_t status;

*src_length = 0;
*dst_length = 0;
while (*src_length < min_src_length) {
/* Relocate the next instruction, limited to the remaining space in 'dst' */
status = ucm_bistro_relocate_one(UCS_PTR_BYTE_OFFSET(dst, *dst_length),
UCS_PTR_BYTE_OFFSET(src, *src_length),
max_dst_length - *dst_length,
&dst_length_one, &src_length_one);
if (status != UCS_OK) {
return status;
}

/* Source and destination advance independently, because a translated
   instruction can have a different size than the original one */
*dst_length += dst_length_one;
*src_length += src_length_one;
}

/* unsupported instruction - bail */
return prev_offset;
ucm_assert(*dst_length <= max_dst_length);
return UCS_OK;
}

/*
 * Format 'length' bytes starting at 'code' as text into 'str', for
 * diagnostics. Every byte is printed as a space followed by two uppercase
 * hexadecimal digits (" %02X").
 *
 * code   - Start of the bytes to dump.
 * length - Number of bytes to dump.
 * str    - Output string buffer.
 * max    - Size of 'str', in bytes. Every write is bounded by the space left
 *          before 'str + max', so the output is truncated if it does not fit.
 *
 * Returns 'str'.
 */
static const char *
ucm_bistro_dump_code(const void *code, size_t length, char *str, size_t max)
{
const void *code_p = code;
char *p = str;
char *endp = str + max;

while (code_p < UCS_PTR_BYTE_OFFSET(code, length)) {
snprintf(p, endp - p, " %02X",
*ucs_serialize_next(&code_p, const uint8_t));
/* Advance by what was actually written, which can be less than 3
   characters when the buffer is full */
p += strlen(p);
}

return offset;
return str;
}

static ucs_status_t
ucm_bistro_construct_orig_func(const void *func_ptr, size_t patch_len,
const char *symbol, void **orig_func_p)
{
size_t code_len, prefix_len, max_code_len;
ucm_bistro_jmp_indirect_t *jmp_back;
ucm_bistro_orig_func_t *orig_func;
size_t prefix_len, code_size;

prefix_len = ucm_bistro_detect_pic_prefix(func_ptr, patch_len);
ucm_debug("'%s' at %p prefix length %zu/%zu", symbol, func_ptr, prefix_len,
patch_len);
if (prefix_len < patch_len) {
return UCS_ERR_UNSUPPORTED;
}
ucs_status_t status;
char code_buf[64];

/* Allocate executable page */
code_size = sizeof(*orig_func) + patch_len + sizeof(*jmp_back);
orig_func = ucm_bistro_allocate_code(code_size);
max_code_len = patch_len + sizeof(ucm_bistro_compare_xlt_t);
orig_func = ucm_bistro_allocate_code(sizeof(*orig_func) + max_code_len +
sizeof(*jmp_back));
if (orig_func == NULL) {
return UCS_ERR_NO_MEMORY;
}

/* Copy code fragment from original function */
memcpy(orig_func->code, func_ptr, prefix_len);
/* Copy and translate code from 'func_ptr' to 'orig_func->code'.
'code_len' is the code size at destination buffer, and 'prefix_len' is
how many bytes were translated from 'func_ptr'. */
status = ucm_bistro_relocate_code(orig_func->code, func_ptr, patch_len,
max_code_len, &code_len, &prefix_len);
if (status != UCS_OK) {
ucm_diag("'%s' could not patch by bistro, code:%s", symbol,
ucm_bistro_dump_code(func_ptr, 16, code_buf,
sizeof(code_buf)));
return UCS_ERR_UNSUPPORTED;
}

ucm_debug("'%s' at %p code length %zu/%zu prefix length %zu", symbol,
func_ptr, code_len, patch_len, prefix_len);

/* Indirect jump to *orig_func->jmp_address */
orig_func->jmp_addr = UCS_PTR_BYTE_OFFSET(func_ptr, prefix_len);
jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, prefix_len);
jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, code_len);
jmp_back->opcode = 0xff;
jmp_back->modrm = 0x25;
jmp_back->displ = UCS_PTR_BYTE_DIFF(jmp_back + 1, &orig_func->jmp_addr);
Expand Down
30 changes: 18 additions & 12 deletions src/ucm/cuda/cudamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,10 @@ static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
{{NULL}, NULL}
};

static ucm_mmap_hook_mode_t ucm_cuda_hook_mode()
static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
{
return ucm_get_hook_mode(ucm_global_opts.cuda_hook_mode);
return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
(ucm_get_hook_mode(mode) == mode);
}

static ucs_status_t
Expand All @@ -231,7 +232,7 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,

status = UCS_ERR_UNSUPPORTED;

if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_BISTRO) {
if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
status = ucm_bistro_patch(func_ptr, func->patch.value,
func->patch.symbol, func->orig_func_ptr,
NULL);
Expand All @@ -242,19 +243,24 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
continue;
}

ucm_debug("failed to install bistro hook for '%s', trying reloc",
ucm_debug("failed to install bistro hook for '%s'",
func->patch.symbol);
}

status = ucm_reloc_modify(&func->patch);
if (status != UCS_OK) {
ucm_diag("failed to install relocation table entry for '%s'",
func->patch.symbol);
return status;
if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
status = ucm_reloc_modify(&func->patch);
if (status == UCS_OK) {
++num_reloc;
ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
continue;
}

ucm_debug("failed to install relocation table hook for '%s'",
func->patch.symbol);
}

++num_reloc;
ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
ucm_diag("failed to install hook for '%s'", func->patch.symbol);
return status;
}

*used_reloc = num_reloc > 0;
Expand All @@ -274,7 +280,7 @@ static ucs_status_t ucm_cudamem_install(int events)
goto out;
}

if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_NONE) {
if (ucm_global_opts.cuda_hook_modes == 0) {
ucm_info("cuda memory hooks are disabled by configuration");
status = UCS_ERR_UNSUPPORTED;
goto out;
Expand Down
6 changes: 5 additions & 1 deletion src/ucm/util/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ ucm_global_config_t ucm_global_opts = {
.mmap_hook_mode = UCM_DEFAULT_HOOK_MODE,
.enable_malloc_hooks = 1,
.enable_malloc_reloc = 0,
.cuda_hook_mode = UCM_DEFAULT_HOOK_MODE,
.cuda_hook_modes =
#if UCM_BISTRO_HOOKS
UCS_BIT(UCM_MMAP_HOOK_BISTRO) |
#endif
UCS_BIT(UCM_MMAP_HOOK_RELOC),
.enable_dynamic_mmap_thresh = 1,
.alloc_alignment = 16,
.dlopen_process_rpath = 1
Expand Down
Loading

0 comments on commit fb26609

Please sign in to comment.