From ce7608e1bebde9a6231759d4db59d929332b3ef3 Mon Sep 17 00:00:00 2001 From: Sourav Chakraborty Date: Tue, 3 Dec 2019 08:54:58 -0800 Subject: [PATCH] UCT/ROCM/COPY: Use faster memcpy for device to host copies --- src/ucs/arch/aarch64/cpu.h | 6 ++- src/ucs/arch/ppc64/cpu.h | 6 ++- src/ucs/arch/x86_64/cpu.c | 68 ++++++++++++++++++++++++++++++++ src/ucs/arch/x86_64/cpu.h | 6 +++ src/uct/rocm/copy/rocm_copy_ep.c | 13 +++--- 5 files changed, 92 insertions(+), 7 deletions(-) diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h index 360f4843eaa3..52a8ae6e7d5b 100644 --- a/src/ucs/arch/aarch64/cpu.h +++ b/src/ucs/arch/aarch64/cpu.h @@ -65,7 +65,6 @@ BEGIN_C_DECLS */ #define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst) - /* * ARM processor ID (ARM ISA - Main ID Register, EL1) */ @@ -203,6 +202,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) return memcpy(dst, src, len); } +static inline void *ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) +{ + return memcpy(dst, src, len); /* aarch64: forwards to plain memcpy; same signature and return value (dst) as the x86_64 variant added by this patch */ +} + static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) { return UCS_ERR_UNSUPPORTED; diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h index 964c29d16756..e61a383e1b51 100644 --- a/src/ucs/arch/ppc64/cpu.h +++ b/src/ucs/arch/ppc64/cpu.h @@ -38,7 +38,6 @@ BEGIN_C_DECLS ::: "memory") #define ucs_memory_cpu_wc_fence() ucs_memory_bus_fence() - static inline uint64_t ucs_arch_read_hres_clock() { #ifndef HAVE_SYS_PLATFORM_PPC_H @@ -85,6 +84,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) return memcpy(dst, src, len); } +static inline void *ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) +{ + return memcpy(dst, src, len); /* ppc64: forwards to plain memcpy; same signature and return value (dst) as the x86_64 variant added by this patch */ +} + static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) { return UCS_ERR_UNSUPPORTED; diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 8362b1314453..11f836bc24cc 100644 --- 
a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -586,4 +586,72 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED; } +#ifdef __SSE4_1__ +#define _mm_load(a) _mm_stream_load_si128((__m128i *) (a)) /* 16-byte streaming load; every call site below passes a 16-byte aligned source address */ +#define _mm_store(a,v) _mm_storeu_si128((__m128i *) (a), (v)) /* 16-byte unaligned store; this function never adjusts the alignment of the destination */ +#endif + +void *ucs_x86_memcpy_sse_movntdqa(void *dest, const void *source, size_t len) /* Copy len bytes from source to dest and return dest; built without __SSE4_1__ this is a plain memcpy */ +{ +#ifdef __SSE4_1__ + const char *src = source; + char *dst = dest; + void *result = dst; + + /* Copy unaligned portion of src: load the whole 16-byte aligned block that contains src and keep only the bytes from src onward. Skipped when len is 0 so that an empty copy never reads from the source. */ + if (len && ((uintptr_t)src & 15)) { + uintptr_t aligned = (uintptr_t)src & ~15; + uintptr_t misalign = (uintptr_t)src & 15; + uintptr_t copy = ucs_min(len, 16 - misalign); + + __m128i tmp = _mm_load(aligned); + memcpy(dst, (char*)(&tmp) + misalign, copy); + + src += copy; + dst += copy; + len -= copy; + } + + /* Copy 64 bytes at a time: four 16-byte loads followed by four 16-byte stores */ + while (len >= 64) { + __m128i *S = (__m128i *)src; + __m128i *D = (__m128i *)dst; + __m128i tmp[4]; + + tmp[0] = _mm_load(S + 0); + tmp[1] = _mm_load(S + 1); + tmp[2] = _mm_load(S + 2); + tmp[3] = _mm_load(S + 3); + + _mm_store(D + 0, tmp[0]); + _mm_store(D + 1, tmp[1]); + _mm_store(D + 2, tmp[2]); + _mm_store(D + 3, tmp[3]); + + src += 64; + dst += 64; + len -= 64; + } + + /* Copy 16 bytes at a time */ + while (len >= 16) { + _mm_store(dst, _mm_load(src)); + + src += 16; + dst += 16; + len -= 16; + } + + /* Copy any remaining bytes (fewer than 16): a full 16-byte block is loaded from src, but only len bytes of it are written to dst */ + if (len) { + __m128i tmp = _mm_load(src); + memcpy(dst, &tmp, len); + } + + return result; +#else + return memcpy(dest, source, len); +#endif +} + #endif diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h index 264fb7f3fee5..ac67108bf3dc 100644 --- a/src/ucs/arch/x86_64/cpu.h +++ b/src/ucs/arch/x86_64/cpu.h @@ -52,6 +52,7 @@ ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE; ucs_cpu_vendor_t ucs_arch_get_cpu_vendor(); void ucs_cpu_init(); ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes); +void 
*ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len); /* defined in cpu.c by this patch; copies len bytes from src to dst and returns dst */ static inline int ucs_arch_x86_rdtsc_enabled() { @@ -109,6 +110,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) return memcpy(dst, src, len); } +static inline void *ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) +{ + return ucs_x86_memcpy_sse_movntdqa(dst, src, len); /* x86_64: forwards to the out-of-line copy routine added to cpu.c by this patch */ +} + END_C_DECLS #endif diff --git a/src/uct/rocm/copy/rocm_copy_ep.c b/src/uct/rocm/copy/rocm_copy_ep.c index 35bca63142e4..e72de0496b2c 100644 --- a/src/uct/rocm/copy/rocm_copy_ep.c +++ b/src/uct/rocm/copy/rocm_copy_ep.c @@ -9,6 +9,10 @@ #include #include #include +#include + +#define memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l)) /* used on the put paths, where remote_addr is the destination: plain memcpy */ +#define memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l)) /* used on the get paths, where remote_addr is the source (the "device to host" direction) */ static UCS_CLASS_INIT_FUNC(uct_rocm_copy_ep_t, const uct_ep_params_t *params) { @@ -44,9 +48,9 @@ uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, } if (is_put) - memcpy((void *)remote_addr, iov->buffer, size); + memcpy_h2d((void *)remote_addr, iov->buffer, size); else - memcpy(iov->buffer, (void *)remote_addr, size); + memcpy_d2h(iov->buffer, (void *)remote_addr, size); return UCS_OK; } @@ -87,7 +91,7 @@ ucs_status_t uct_rocm_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { - memcpy((void *)remote_addr, buffer, length); + memcpy_h2d((void *)remote_addr, buffer, length); UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length); ucs_trace_data("PUT_SHORT size %d from %p to %p", @@ -99,8 +103,7 @@ ucs_status_t uct_rocm_copy_ep_get_short(uct_ep_h tl_ep, void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { - /* device to host */ - memcpy(buffer, (void *)remote_addr, length); + memcpy_d2h(buffer, (void *)remote_addr, length); /* device to host */ UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length); ucs_trace_data("GET_SHORT size %d from %p to %p",