Skip to content

Commit

Permalink
Merge pull request #7209 from yosefe/topic/ucm-bistro-test-fix-support-for-cuda-v1.11.x
Browse files Browse the repository at this point in the history

UCM/BISTRO/TEST: Fix support for cuda memory hooks - v1.11.x
  • Loading branch information
yosefe authored Aug 9, 2021
2 parents 7057cb3 + fb26609 commit 064e6da
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 95 deletions.
4 changes: 4 additions & 0 deletions contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,10 @@ test_malloc_hook() {
${cuda_dynamic_exe} -d
[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d

# Test hooks in gtest
UCX_MEM_LOG_LEVEL=diag \
./test/gtest/gtest --gtest_filter='cuda_hooks.*'

unset UCX_MEM_CUDA_HOOK_MODE
done
fi
Expand Down
2 changes: 1 addition & 1 deletion src/ucm/api/ucm.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ typedef struct ucm_global_config {
ucm_mmap_hook_mode_t mmap_hook_mode; /* MMAP hook mode */
int enable_malloc_hooks; /* Enable installing malloc hooks */
int enable_malloc_reloc; /* Enable installing malloc relocations */
ucm_mmap_hook_mode_t cuda_hook_mode; /* Cuda hooks mode */
int cuda_hook_modes; /* Bitmap of allowed cuda hooks modes */
int enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */
size_t alloc_alignment; /* Alignment for memory allocations */
int dlopen_process_rpath; /* Process RPATH section in dlopen hook */
Expand Down
250 changes: 173 additions & 77 deletions src/ucm/bistro/bistro_x86_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <ucm/bistro/bistro_int.h>
#include <ucm/util/sys.h>
#include <ucs/sys/math.h>
#include <ucs/type/serialize.h>


typedef struct {
Expand All @@ -35,6 +36,15 @@ typedef struct {
int32_t displ;
} UCS_S_PACKED ucm_bistro_jmp_indirect_t;

/*
 * Byte layout of the instruction sequence which ucm_bistro_relocate_one()
 * emits in place of a RIP-relative 'cmpl $imm32, $disp32(%rip)':
 *     push   %rax
 *     movabs $addr64, %rax
 *     cmpl   $imm32, (%rax)
 *     pop    %rax
 * The structure is packed, so its size is the exact length of that sequence.
 */
typedef struct {
uint8_t push_rax; /* opcode byte of 'push %rax' */
uint8_t movabs_rax[2]; /* prefix and opcode bytes of 'movabs $imm64, %rax' */
uint64_t rax_value; /* 64-bit immediate: absolute address to compare at */
uint8_t cmp_dptr_rax[2]; /* opcode and ModRM bytes of 'cmpl $imm32, (%rax)' */
uint32_t cmp_value; /* 32-bit immediate operand of the compare */
uint8_t pop_rax; /* opcode byte of 'pop %rax' */
} UCS_S_PACKED ucm_bistro_compare_xlt_t;


/* REX prefix */
#define UCM_BISTRO_X86_REX_MASK 0xF0 /* Mask */
Expand Down Expand Up @@ -87,109 +97,195 @@ typedef struct {
#define UCM_BISTRO_X86_MODRM_CMP_RIP 0x3D /* 11 111 101 */


/*
* Find the minimal length of initial instructions in the function which can be
* safely executed from any memory location.
* Uses a very simplified disassembler which supports only the typical
* instructions found in function prologue.
*/
static size_t ucm_bistro_detect_pic_prefix(const void *func, size_t min_length)
static ucs_status_t
ucm_bistro_relocate_one(void *dst, const void *src, size_t max_dst_length,
size_t *dst_length, size_t *src_length)
{
const void *src_p = src;
ucm_bistro_compare_xlt_t cmp_xlt = {
.push_rax = 0x50,
.movabs_rax = {0x48, 0xb8},
.cmp_dptr_rax = {0x81, 0x38},
.pop_rax = 0x58
};
uint8_t rex, opcode, modrm, mod;
size_t offset, prev_offset;

offset = 0;
while (offset < min_length) {
prev_offset = offset;
opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);

/* check for REX prefix */
if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
rex = opcode;
opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
} else {
rex = 0;
const void *copy_src;
int32_t disp32;
uint32_t imm32;

/* Check opcode and REX prefix */
opcode = *ucs_serialize_next(&src_p, const uint8_t);
if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) {
rex = opcode;
opcode = *ucs_serialize_next(&src_p, const uint8_t);
} else {
rex = 0;
}

if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
/* push reg */
goto out_copy_src;
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
/* sub $imm32, %rsp */
ucs_serialize_next(&src_p, const uint32_t);
goto out_copy_src;
}
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
/* mov %rsp, %rbp */
goto out_copy_src;
}

/* check the opcode */
if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) &&
((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) {
continue;
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) {
/* sub $imm32, %rsp */
offset += sizeof(uint32_t);
continue;
}
} else if ((rex == UCM_BISTRO_X86_REX_W) &&
(opcode == UCM_BISTRO_X86_MOV_EV_GV)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) {
/* mov %rsp, %rbp */
continue;
if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
UCM_BISTRO_X86_MODRM_RM_SIB)) {
/* r/m = 0b100, mod = 0b00/0b01/0b10 */
ucs_serialize_next(&src_p, const uint8_t); /* skip SIB */
if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
ucs_serialize_next(&src_p, const uint8_t); /* skip disp8 */
goto out_copy_src;
} else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
ucs_serialize_next(&src_p, const uint32_t); /* skip disp32 */
goto out_copy_src;
}
mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT;
if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) &&
((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) ==
UCM_BISTRO_X86_MODRM_RM_SIB)) {
/* r/m = 0b100, mod = 0b00/0b01/0b10 */
++offset; /* skip SIB */
if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) {
offset += sizeof(uint8_t); /* skip disp8 */
} else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) {
offset += sizeof(uint32_t); /* skip disp32 */
}
continue;
}
} else if ((rex == 0) &&
((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == UCM_BISTRO_X86_MOV_IR)) {
offset += sizeof(uint32_t);
continue;
} else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++);
if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
offset += sizeof(uint32_t) * 2; /* skip disp32 and imm32 */
}
continue;
}
} else if ((rex == 0) && ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) ==
UCM_BISTRO_X86_MOV_IR)) {
/* mov $imm32, %reg */
ucs_serialize_next(&src_p, const uint32_t);
goto out_copy_src;
} else if ((rex == 0) && (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) {
modrm = *ucs_serialize_next(&src_p, const uint8_t);
if (modrm == UCM_BISTRO_X86_MODRM_CMP_RIP) {
/*
* Since we can't assume the new code will be within 32-bit
* range of the global variable argument, we need to translate
* the code from:
* cmpl $imm32, $disp32(%rip)
* to:
* push %rax
* movq $addr64, %rax ; $addr64 is $disp32+%rip
* cmpl $imm32, (%rax)
* pop %rax
*/
disp32 = *ucs_serialize_next(&src_p, const uint32_t);
imm32 = *ucs_serialize_next(&src_p, const uint32_t);
cmp_xlt.rax_value = (uintptr_t)UCS_PTR_BYTE_OFFSET(src_p, disp32);
cmp_xlt.cmp_value = imm32;
copy_src = &cmp_xlt;
*dst_length = sizeof(cmp_xlt);
goto out_copy;
}
}

/* Could not recognize the instruction */
return UCS_ERR_UNSUPPORTED;

out_copy_src:
copy_src = src;
*dst_length = UCS_PTR_BYTE_DIFF(src, src_p);
out_copy:
if (*dst_length > max_dst_length) {
return UCS_ERR_BUFFER_TOO_SMALL;
}

*src_length = UCS_PTR_BYTE_DIFF(src, src_p);
memcpy(dst, copy_src, *dst_length);
return UCS_OK;
}

/*
 * Relocate at least 'min_src_length' bytes of code instructions from 'src' to
 * 'dst', possibly changing some of them to new instructions.
 * Uses a simplified disassembler which supports only typical instructions
 * found in function prologue.
 *
 * dst            - Destination buffer for the relocated code.
 * src            - Start of the code to relocate.
 * min_src_length - Minimal number of source bytes to relocate. Instructions
 *                  are relocated as a whole, so more bytes may be consumed.
 * max_dst_length - Size of the 'dst' buffer, in bytes.
 * dst_length     - Filled with the total number of bytes written to 'dst'.
 * src_length     - Filled with the total number of bytes consumed from 'src'.
 *
 * Returns UCS_OK on success, or the error status of the first instruction
 * which ucm_bistro_relocate_one() failed to relocate. In the error case,
 * 'dst_length' and 'src_length' cover only the instructions relocated so far.
 */
static ucs_status_t
ucm_bistro_relocate_code(void *dst, const void *src, size_t min_src_length,
                         size_t max_dst_length, size_t *dst_length,
                         size_t *src_length)
{
    size_t src_length_one, dst_length_one;
    ucs_status_t status;

    *src_length = 0;
    *dst_length = 0;
    while (*src_length < min_src_length) {
        /* Relocate the next instruction into the remaining space of 'dst' */
        status = ucm_bistro_relocate_one(UCS_PTR_BYTE_OFFSET(dst, *dst_length),
                                         UCS_PTR_BYTE_OFFSET(src, *src_length),
                                         max_dst_length - *dst_length,
                                         &dst_length_one, &src_length_one);
        if (status != UCS_OK) {
            return status;
        }

        *dst_length += dst_length_one;
        *src_length += src_length_one;
    }

    ucm_assert(*dst_length <= max_dst_length);
    return UCS_OK;
}

/*
 * Format 'length' bytes starting at 'code' as a sequence of two-digit
 * uppercase hexadecimal values, each preceded by a space (e.g " 48 B8").
 *
 * code   - Bytes to dump.
 * length - Number of bytes to dump.
 * str    - Output buffer.
 * max    - Size of the output buffer, in bytes.
 *
 * The output is truncated to what fits in 'str', and is always
 * NUL-terminated when 'max' is nonzero; when 'max' is zero the buffer is not
 * touched. Returns 'str'.
 */
static const char *
ucm_bistro_dump_code(const void *code, size_t length, char *str, size_t max)
{
    const uint8_t *byte_p = code;
    const uint8_t *end_p  = byte_p + length;
    char *p               = str;
    char *endp            = str + max;

    if (max == 0) {
        /* No room even for the terminator */
        return str;
    }

    /* Make the result a valid empty string in case there is nothing to dump */
    *p = '\0';

    /* Stop as soon as only the terminating NUL fits in the buffer */
    while ((byte_p < end_p) && ((p + 1) < endp)) {
        snprintf(p, endp - p, " %02X", *byte_p++);
        p += strlen(p);
    }

    return str;
}

static ucs_status_t
ucm_bistro_construct_orig_func(const void *func_ptr, size_t patch_len,
const char *symbol, void **orig_func_p)
{
size_t code_len, prefix_len, max_code_len;
ucm_bistro_jmp_indirect_t *jmp_back;
ucm_bistro_orig_func_t *orig_func;
size_t prefix_len, code_size;

prefix_len = ucm_bistro_detect_pic_prefix(func_ptr, patch_len);
ucm_debug("'%s' at %p prefix length %zu/%zu", symbol, func_ptr, prefix_len,
patch_len);
if (prefix_len < patch_len) {
return UCS_ERR_UNSUPPORTED;
}
ucs_status_t status;
char code_buf[64];

/* Allocate executable page */
code_size = sizeof(*orig_func) + patch_len + sizeof(*jmp_back);
orig_func = ucm_bistro_allocate_code(code_size);
max_code_len = patch_len + sizeof(ucm_bistro_compare_xlt_t);
orig_func = ucm_bistro_allocate_code(sizeof(*orig_func) + max_code_len +
sizeof(*jmp_back));
if (orig_func == NULL) {
return UCS_ERR_NO_MEMORY;
}

/* Copy code fragment from original function */
memcpy(orig_func->code, func_ptr, prefix_len);
/* Copy and translate code from 'func_ptr' to 'orig_func->code'.
'code_len' is the code size at destination buffer, and 'prefix_len' is
how many bytes were translated from 'func_ptr'. */
status = ucm_bistro_relocate_code(orig_func->code, func_ptr, patch_len,
max_code_len, &code_len, &prefix_len);
if (status != UCS_OK) {
ucm_diag("'%s' could not patch by bistro, code:%s", symbol,
ucm_bistro_dump_code(func_ptr, 16, code_buf,
sizeof(code_buf)));
return UCS_ERR_UNSUPPORTED;
}

ucm_debug("'%s' at %p code length %zu/%zu prefix length %zu", symbol,
func_ptr, code_len, patch_len, prefix_len);

/* Indirect jump to *orig_func->jmp_address */
orig_func->jmp_addr = UCS_PTR_BYTE_OFFSET(func_ptr, prefix_len);
jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, prefix_len);
jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, code_len);
jmp_back->opcode = 0xff;
jmp_back->modrm = 0x25;
jmp_back->displ = UCS_PTR_BYTE_DIFF(jmp_back + 1, &orig_func->jmp_addr);
Expand Down
30 changes: 18 additions & 12 deletions src/ucm/cuda/cudamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,10 @@ static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
{{NULL}, NULL}
};

/*
 * Check whether the hook mode 'mode' may be used for Cuda memory hooks.
 * Returns nonzero if the bit of 'mode' is set in the configured bitmap
 * ucm_global_opts.cuda_hook_modes, and ucm_get_hook_mode() maps 'mode' to
 * itself; returns zero otherwise.
 */
static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
{
    return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
           (ucm_get_hook_mode(mode) == mode);
}

static ucs_status_t
Expand All @@ -231,7 +232,7 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,

status = UCS_ERR_UNSUPPORTED;

if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_BISTRO) {
if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
status = ucm_bistro_patch(func_ptr, func->patch.value,
func->patch.symbol, func->orig_func_ptr,
NULL);
Expand All @@ -242,19 +243,24 @@ ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
continue;
}

ucm_debug("failed to install bistro hook for '%s', trying reloc",
ucm_debug("failed to install bistro hook for '%s'",
func->patch.symbol);
}

status = ucm_reloc_modify(&func->patch);
if (status != UCS_OK) {
ucm_diag("failed to install relocation table entry for '%s'",
func->patch.symbol);
return status;
if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
status = ucm_reloc_modify(&func->patch);
if (status == UCS_OK) {
++num_reloc;
ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
continue;
}

ucm_debug("failed to install relocation table hook for '%s'",
func->patch.symbol);
}

++num_reloc;
ucm_trace("installed reloc hook on '%s'", func->patch.symbol);
ucm_diag("failed to install hook for '%s'", func->patch.symbol);
return status;
}

*used_reloc = num_reloc > 0;
Expand All @@ -274,7 +280,7 @@ static ucs_status_t ucm_cudamem_install(int events)
goto out;
}

if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_NONE) {
if (ucm_global_opts.cuda_hook_modes == 0) {
ucm_info("cuda memory hooks are disabled by configuration");
status = UCS_ERR_UNSUPPORTED;
goto out;
Expand Down
6 changes: 5 additions & 1 deletion src/ucm/util/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ ucm_global_config_t ucm_global_opts = {
.mmap_hook_mode = UCM_DEFAULT_HOOK_MODE,
.enable_malloc_hooks = 1,
.enable_malloc_reloc = 0,
.cuda_hook_mode = UCM_DEFAULT_HOOK_MODE,
.cuda_hook_modes =
#if UCM_BISTRO_HOOKS
UCS_BIT(UCM_MMAP_HOOK_BISTRO) |
#endif
UCS_BIT(UCM_MMAP_HOOK_RELOC),
.enable_dynamic_mmap_thresh = 1,
.alloc_alignment = 16,
.dlopen_process_rpath = 1
Expand Down
Loading

0 comments on commit 064e6da

Please sign in to comment.