diff --git a/config/m4/rocm.m4 b/config/m4/rocm.m4 index db4afcf2a7c..ab58b6dce4d 100644 --- a/config/m4/rocm.m4 +++ b/config/m4/rocm.m4 @@ -57,6 +57,7 @@ AC_ARG_WITH([rocm], [with_rocm=guess]) rocm_happy=no +hip_happy=no AS_IF([test "x$with_rocm" != "xno"], [AS_CASE(["x$with_rocm"], [x|xguess|xyes], diff --git a/src/tools/perf/Makefile.am b/src/tools/perf/Makefile.am index 9365b2cd517..bc68f81c440 100644 --- a/src/tools/perf/Makefile.am +++ b/src/tools/perf/Makefile.am @@ -8,7 +8,7 @@ # See file LICENSE for terms. # -SUBDIRS = cuda lib +SUBDIRS = cuda rocm lib CC = $(UCX_PERFTEST_CC) noinst_HEADERS = api/libperf.h diff --git a/src/tools/perf/configure.m4 b/src/tools/perf/configure.m4 index 181006bd330..509c38331f3 100644 --- a/src/tools/perf/configure.m4 +++ b/src/tools/perf/configure.m4 @@ -7,6 +7,7 @@ ucx_perftest_modules="" m4_include([src/tools/perf/lib/configure.m4]) m4_include([src/tools/perf/cuda/configure.m4]) +m4_include([src/tools/perf/rocm/configure.m4]) AC_DEFINE_UNQUOTED([ucx_perftest_MODULES], ["${ucx_perftest_modules}"], [Perftest loadable modules]) diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index 8854960d5a0..7b52e022410 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -373,6 +373,12 @@ static void usage(const struct perftest_context *ctx, const char *program) if (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] != NULL) { printf(" cuda-managed - NVIDIA GPU managed/unified memory\n"); } + if (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] != NULL) { + printf(" rocm - AMD/ROCm GPU memory\n"); + } + if (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] != NULL) { + printf(" rocm-managed - AMD/ROCm GPU managed memory\n"); + } printf(" -n number of iterations to run (%ld)\n", ctx->params.max_iter); printf(" -w number of warm-up iterations (%zu)\n", ctx->params.warmup_iter); @@ -673,6 +679,14 @@ static ucs_status_t parse_test_params(ucx_perf_params_t *params, char opt, const (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] != NULL)) { params->mem_type = UCS_MEMORY_TYPE_CUDA_MANAGED; return UCS_OK; + } else if (!strcmp(optarg, "rocm") && + (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] != NULL)) { + params->mem_type = UCS_MEMORY_TYPE_ROCM; + return UCS_OK; + } else if (!strcmp(optarg, "rocm-managed") && + (ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] != NULL)) { + params->mem_type = UCS_MEMORY_TYPE_ROCM_MANAGED; + return UCS_OK; } ucs_error("Unsupported memory type: \"%s\"", optarg); diff --git a/src/tools/perf/rocm/Makefile.am b/src/tools/perf/rocm/Makefile.am new file mode 100644 index 00000000000..81757ed2b16 --- /dev/null +++ b/src/tools/perf/rocm/Makefile.am @@ -0,0 +1,17 @@ +# +# Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +if HAVE_HIP + +module_LTLIBRARIES = libucx_perftest_rocm.la +libucx_perftest_rocm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(HIP_CPPFLAGS) +libucx_perftest_rocm_la_CFLAGS = $(BASE_CFLAGS) $(HIP_CFLAGS) +libucx_perftest_rocm_la_LDFLAGS = $(HIP_LDFLAGS) $(HIP_LIBS) -version-info $(SOVERSION) +libucx_perftest_rocm_la_SOURCES = rocm_alloc.c + +include $(top_srcdir)/config/module.am + +endif diff --git a/src/tools/perf/rocm/configure.m4 b/src/tools/perf/rocm/configure.m4 new file mode 100644 index 00000000000..cb662a4d540 --- /dev/null +++ b/src/tools/perf/rocm/configure.m4 @@ -0,0 +1,11 @@ +# +# Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +UCX_CHECK_ROCM + +AS_IF([test "x$rocm_happy" = "xyes"], [ucx_perftest_modules="${ucx_perftest_modules}:rocm"]) + +AC_CONFIG_FILES([src/tools/perf/rocm/Makefile]) diff --git a/src/tools/perf/rocm/rocm_alloc.c b/src/tools/perf/rocm/rocm_alloc.c new file mode 100644 index 00000000000..f8c0f2d2858 --- /dev/null +++ b/src/tools/perf/rocm/rocm_alloc.c @@ -0,0 +1,189 @@ +/** + * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include + +#include +#include + + +static ucs_status_t ucx_perf_rocm_init(ucx_perf_context_t *perf) +{ + hipError_t ret; + unsigned group_index; + int num_gpus; + int gpu_index; + + group_index = rte_call(perf, group_index); + + ret = hipGetDeviceCount(&num_gpus); + if (ret != hipSuccess) { + return UCS_ERR_NO_DEVICE; + } + + gpu_index = group_index % num_gpus; + + ret = hipSetDevice(gpu_index); + if (ret != hipSuccess) { + return UCS_ERR_NO_DEVICE; + } + + return UCS_OK; +} + +static inline ucs_status_t ucx_perf_rocm_alloc(size_t length, + ucs_memory_type_t mem_type, + void **address_p) +{ + hipError_t ret; + + ucs_assert((mem_type == UCS_MEMORY_TYPE_ROCM) || + (mem_type == UCS_MEMORY_TYPE_ROCM_MANAGED)); + + ret = ((mem_type == UCS_MEMORY_TYPE_ROCM) ? + hipMalloc(address_p, length) : + hipMallocManaged(address_p, length, hipMemAttachGlobal)); + if (ret != hipSuccess) { + ucs_error("failed to allocate memory"); + return UCS_ERR_NO_MEMORY; + } + + return UCS_OK; +} + +static ucs_status_t ucp_perf_rocm_alloc(const ucx_perf_context_t *perf, size_t length, + void **address_p, ucp_mem_h *memh_p, + int non_blk_flag) +{ + return ucx_perf_rocm_alloc(length, UCS_MEMORY_TYPE_ROCM, address_p); +} + +static ucs_status_t ucp_perf_rocm_alloc_managed(const ucx_perf_context_t *perf, + size_t length, void **address_p, + ucp_mem_h *memh_p, int non_blk_flag) +{ + return ucx_perf_rocm_alloc(length, UCS_MEMORY_TYPE_ROCM_MANAGED, address_p); +} + +static void ucp_perf_rocm_free(const ucx_perf_context_t *perf, + void *address, ucp_mem_h memh) +{ + hipFree(address); +} + +static inline ucs_status_t +uct_perf_rocm_alloc_reg_mem(const ucx_perf_context_t *perf, + size_t length, + ucs_memory_type_t mem_type, + unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + status = ucx_perf_rocm_alloc(length, mem_type, &alloc_mem->address); + if (status != UCS_OK) { + return status; + } + + status = uct_md_mem_reg(perf->uct.md, alloc_mem->address, + length, flags, &alloc_mem->memh); + if (status != UCS_OK) { + hipFree(alloc_mem->address); + ucs_error("failed to register memory"); + return status; + } + + alloc_mem->mem_type = mem_type; + alloc_mem->md = perf->uct.md; + + return UCS_OK; +} + +static ucs_status_t uct_perf_rocm_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + return uct_perf_rocm_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_ROCM, + flags, alloc_mem); +} + +static ucs_status_t uct_perf_rocm_managed_alloc(const ucx_perf_context_t *perf, + size_t length, unsigned flags, + uct_allocated_memory_t *alloc_mem) +{ + return uct_perf_rocm_alloc_reg_mem(perf, length, UCS_MEMORY_TYPE_ROCM_MANAGED, + flags, alloc_mem); +} + +static void uct_perf_rocm_free(const ucx_perf_context_t *perf, + uct_allocated_memory_t *alloc_mem) +{ + ucs_status_t status; + + ucs_assert(alloc_mem->md == perf->uct.md); + + status = uct_md_mem_dereg(perf->uct.md, alloc_mem->memh); + if (status != UCS_OK) { + ucs_error("failed to deregister memory"); + } + + hipFree(alloc_mem->address); +} + +static void ucx_perf_rocm_memcpy(void *dst, ucs_memory_type_t dst_mem_type, + const void *src, ucs_memory_type_t src_mem_type, + size_t count) +{ + hipError_t ret; + + ret = hipMemcpy(dst, src, count, hipMemcpyDefault); + if (ret != hipSuccess) { + ucs_error("failed to copy memory: %s", hipGetErrorString(ret)); + } +} + +static void* ucx_perf_rocm_memset(void *dst, int value, size_t count) +{ + hipError_t ret; + + ret = hipMemset(dst, value, count); + if (ret != hipSuccess) { + ucs_error("failed to set memory: %s", hipGetErrorString(ret)); + } + + return dst; +} + +UCS_STATIC_INIT { + static ucx_perf_allocator_t rocm_allocator = { + .mem_type = UCS_MEMORY_TYPE_ROCM, + .init = ucx_perf_rocm_init, + .ucp_alloc = ucp_perf_rocm_alloc, + .ucp_free = ucp_perf_rocm_free, + .uct_alloc = uct_perf_rocm_alloc, + .uct_free = uct_perf_rocm_free, + .memcpy = ucx_perf_rocm_memcpy, + .memset = ucx_perf_rocm_memset + }; + static ucx_perf_allocator_t rocm_managed_allocator = { + .mem_type = UCS_MEMORY_TYPE_ROCM_MANAGED, + .init = ucx_perf_rocm_init, + .ucp_alloc = ucp_perf_rocm_alloc_managed, + .ucp_free = ucp_perf_rocm_free, + .uct_alloc = uct_perf_rocm_managed_alloc, + .uct_free = uct_perf_rocm_free, + .memcpy = ucx_perf_rocm_memcpy, + .memset = ucx_perf_rocm_memset + }; + + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] = &rocm_allocator; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] = &rocm_managed_allocator; +} +UCS_STATIC_CLEANUP { + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM] = NULL; + ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_ROCM_MANAGED] = NULL; + +}