diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h
index 360f4843eaa..1fee75a9608 100644
--- a/src/ucs/arch/aarch64/cpu.h
+++ b/src/ucs/arch/aarch64/cpu.h
@@ -203,6 +203,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
     return memcpy(dst, src, len);
 }
 
+static UCS_F_ALWAYS_INLINE void /* copy len bytes from src to dst; returns nothing */
+ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
+{
+    memcpy(dst, src, len); /* aarch64: implemented as a plain memcpy() */
+}
+
 static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
 {
     return UCS_ERR_UNSUPPORTED;
diff --git a/src/ucs/arch/ppc64/cpu.h b/src/ucs/arch/ppc64/cpu.h
index 964c29d1675..d1aeb9fe737 100644
--- a/src/ucs/arch/ppc64/cpu.h
+++ b/src/ucs/arch/ppc64/cpu.h
@@ -85,6 +85,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
     return memcpy(dst, src, len);
 }
 
+static UCS_F_ALWAYS_INLINE void /* copy len bytes from src to dst; returns nothing */
+ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
+{
+    memcpy(dst, src, len); /* ppc64: implemented as a plain memcpy() */
+}
+
 static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
 {
     return UCS_ERR_UNSUPPORTED;
diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c
index 8362b131445..4540f091555 100644
--- a/src/ucs/arch/x86_64/cpu.c
+++ b/src/ucs/arch/x86_64/cpu.c
@@ -27,6 +27,11 @@
 #define X86_CPU_CACHE_TAG_L1_ONLY 0x40
 #define X86_CPU_CACHE_TAG_LEAF4 0xff
 
+#if defined (__SSE4_1__) /* 16-byte load/store helpers for ucs_x86_memcpy_sse_movntdqa() */
+#define _mm_load(a) _mm_stream_load_si128((__m128i *) (a)) /* streaming (non-temporal) 16-byte load; 'a' must be 16-byte aligned */
+#define _mm_store(a,v) _mm_storeu_si128((__m128i *) (a), (v)) /* regular 16-byte store; no alignment requirement on 'a' */
+#endif
+
 
 typedef enum ucs_x86_cpu_cache_type {
     X86_CPU_CACHE_TYPE_DATA = 1,
@@ -586,4 +591,61 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
     return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED;
 }
 
+void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len) /* copy len bytes from src to dst, reading src with 16-byte streaming loads when built with SSE4.1 */
+{
+#if defined (__SSE4_1__)
+    /* Copy unaligned portion of src: load the 16-byte aligned block that contains src and keep only the bytes from src up to the end of that block (or len bytes, if fewer), so that every later load starts on a 16-byte boundary */
+    if ((uintptr_t)src & 15) {
+        uintptr_t aligned = (uintptr_t)src & ~15;
+        uintptr_t misalign = (uintptr_t)src & 15;
+        uintptr_t copy = ucs_min(len, 16 - misalign);
+
+        __m128i tmp = _mm_load(aligned); /* NOTE(review): reads the whole aligned block, including bytes located before src, and is issued even when len == 0 */
+        memcpy(dst, UCS_PTR_BYTE_OFFSET(&tmp, misalign), copy);
+
+        src = UCS_PTR_BYTE_OFFSET(src, copy);
+        dst = UCS_PTR_BYTE_OFFSET(dst, copy);
+        len -= copy;
+    }
+
+    /* Copy 64 bytes at a time: all four loads are issued before the four stores */
+    while (len >= 64) {
+        __m128i *S = (__m128i *)src;
+        __m128i *D = (__m128i *)dst;
+        __m128i tmp[4];
+
+        tmp[0] = _mm_load(S + 0);
+        tmp[1] = _mm_load(S + 1);
+        tmp[2] = _mm_load(S + 2);
+        tmp[3] = _mm_load(S + 3);
+
+        _mm_store(D + 0, tmp[0]);
+        _mm_store(D + 1, tmp[1]);
+        _mm_store(D + 2, tmp[2]);
+        _mm_store(D + 3, tmp[3]);
+
+        src = UCS_PTR_BYTE_OFFSET(src, 64);
+        dst = UCS_PTR_BYTE_OFFSET(dst, 64);
+        len -= 64;
+    }
+
+    /* Copy 16 bytes at a time */
+    while (len >= 16) {
+        _mm_store(dst, _mm_load(src));
+
+        src = UCS_PTR_BYTE_OFFSET(src, 16);
+        dst = UCS_PTR_BYTE_OFFSET(dst, 16);
+        len -= 16;
+    }
+
+    /* Copy any remaining bytes (len < 16 here): a full 16-byte block is loaded into a temporary, but only len bytes are written to dst */
+    if (len) {
+        __m128i tmp = _mm_load(src); /* NOTE(review): reads up to 15 bytes past src + len, all inside the same aligned 16-byte block */
+        memcpy(dst, &tmp, len);
+    }
+#else /* built without SSE4.1: fall back to a plain memcpy() */
+    memcpy(dst, src, len);
+#endif
+}
+
 #endif
diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h
index 264fb7f3fee..7fc00627e6e 100644
--- a/src/ucs/arch/x86_64/cpu.h
+++ b/src/ucs/arch/x86_64/cpu.h
@@ -52,6 +52,7 @@ ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE;
 ucs_cpu_vendor_t ucs_arch_get_cpu_vendor();
 void ucs_cpu_init();
 ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
+void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len); /* out-of-line copy routine, implemented in x86_64/cpu.c */
 
 static inline int ucs_arch_x86_rdtsc_enabled()
 {
@@ -109,6 +110,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
     return memcpy(dst, src, len);
 }
 
+static UCS_F_ALWAYS_INLINE void /* copy len bytes from src to dst; returns nothing */
+ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
+{
+    ucs_x86_memcpy_sse_movntdqa(dst, src, len); /* x86_64: delegate to the streaming-load routine (itself a plain memcpy() when built without SSE4.1) */
+}
+
 END_C_DECLS
 
 #endif
diff --git a/src/uct/rocm/copy/rocm_copy_ep.c b/src/uct/rocm/copy/rocm_copy_ep.c
index 35bca63142e..b89567669cc 100644
--- a/src/uct/rocm/copy/rocm_copy_ep.c
+++ b/src/uct/rocm/copy/rocm_copy_ep.c
@@ -9,6 +9,10 @@
 #include
 #include
 #include
+#include /* NOTE(review): the header name is missing from this and the neighbouring #include lines as shown here - restore it before applying; presumably the header that declares ucs_memcpy_nontemporal() */
+
+#define uct_rocm_memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l)) /* used by the put paths: plain memcpy() */
+#define uct_rocm_memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l)) /* used by the get paths ("device to host" per the comment removed below): goes through ucs_memcpy_nontemporal() */
 
 static UCS_CLASS_INIT_FUNC(uct_rocm_copy_ep_t, const uct_ep_params_t *params)
 {
@@ -44,9 +48,9 @@ uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep,
     }
 
     if (is_put)
-        memcpy((void *)remote_addr, iov->buffer, size);
+        uct_rocm_memcpy_h2d((void *)remote_addr, iov->buffer, size); /* is_put: iov buffer -> remote_addr */
     else
-        memcpy(iov->buffer, (void *)remote_addr, size);
+        uct_rocm_memcpy_d2h(iov->buffer, (void *)remote_addr, size); /* !is_put: remote_addr -> iov buffer */
 
     return UCS_OK;
 }
@@ -87,7 +91,7 @@ ucs_status_t uct_rocm_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer,
                                         unsigned length, uint64_t remote_addr,
                                         uct_rkey_t rkey)
 {
-    memcpy((void *)remote_addr, buffer, length);
+    uct_rocm_memcpy_h2d((void *)remote_addr, buffer, length); /* caller buffer -> remote_addr */
 
     UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length);
     ucs_trace_data("PUT_SHORT size %d from %p to %p",
@@ -99,8 +103,7 @@ ucs_status_t uct_rocm_copy_ep_get_short(uct_ep_h tl_ep, void *buffer,
                                         unsigned length, uint64_t remote_addr,
                                         uct_rkey_t rkey)
 {
-    /* device to host */
-    memcpy(buffer, (void *)remote_addr, length);
+    uct_rocm_memcpy_d2h(buffer, (void *)remote_addr, length); /* device to host: remote_addr -> caller buffer */
 
     UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length);
     ucs_trace_data("GET_SHORT size %d from %p to %p",