Skip to content

Commit

Permalink
UCT/ROCM/COPY: Use faster memcpy for device to host copies
Browse files Browse the repository at this point in the history
  • Loading branch information
Sourav Chakraborty committed Dec 3, 2019
1 parent a035751 commit ce7608e
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 7 deletions.
6 changes: 5 additions & 1 deletion src/ucs/arch/aarch64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ BEGIN_C_DECLS
*/
#define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst)


/*
* ARM processor ID (ARM ISA - Main ID Register, EL1)
*/
Expand Down Expand Up @@ -203,6 +202,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Non-temporal copy entry point for aarch64.
 *
 * This architecture has no dedicated implementation here, so the call is
 * forwarded unchanged to the C library memcpy().
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 *
 * @return The value returned by memcpy(), i.e. @a dst.
 */
static inline void *ucs_memcpy_nontemporal(void *dst, const void *src,
                                           size_t len)
{
    return memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
return UCS_ERR_UNSUPPORTED;
Expand Down
6 changes: 5 additions & 1 deletion src/ucs/arch/ppc64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ BEGIN_C_DECLS
::: "memory")
#define ucs_memory_cpu_wc_fence() ucs_memory_bus_fence()


static inline uint64_t ucs_arch_read_hres_clock()
{
#ifndef HAVE_SYS_PLATFORM_PPC_H
Expand Down Expand Up @@ -85,6 +84,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Non-temporal copy entry point for ppc64.
 *
 * This architecture has no dedicated implementation here, so the call is
 * forwarded unchanged to the C library memcpy().
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 *
 * @return The value returned by memcpy(), i.e. @a dst.
 */
static inline void *ucs_memcpy_nontemporal(void *dst, const void *src,
                                           size_t len)
{
    return memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
return UCS_ERR_UNSUPPORTED;
Expand Down
68 changes: 68 additions & 0 deletions src/ucs/arch/x86_64/cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -586,4 +586,72 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED;
}

#ifdef __SSE4_1__
/*
 * File-local shorthands for the two SSE intrinsics used by the copy routine
 * below. The names deliberately stay out of the "_mm_" namespace, which
 * belongs to the compiler's intrinsics headers.
 */

/* Streaming (non-temporal) 16-byte load; the address must be 16-byte aligned */
#define ucs_x86_nt_load128(_a)      _mm_stream_load_si128((__m128i *) (_a))

/* 16-byte store; the address does not have to be aligned */
#define ucs_x86_storeu128(_a, _v)   _mm_storeu_si128((__m128i *) (_a), (_v))
#endif

/**
 * Copy @a len bytes from @a source to @a dest, reading the source with
 * 16-byte streaming loads when the file is compiled with SSE4.1 support.
 * Without SSE4.1 the call is forwarded to memcpy().
 *
 * Streaming loads need a 16-byte aligned address, so in the SSE4.1 path the
 * source is always read in whole aligned 16-byte blocks: the first and the
 * last block may be loaded in full even though only part of them is copied
 * to @a dest. The destination is written with unaligned stores and may have
 * any alignment.
 *
 * @param dest    Destination buffer.
 * @param source  Source buffer.
 * @param len     Number of bytes to copy. A zero length returns immediately
 *                without touching either buffer.
 *
 * @return @a dest.
 */
void *ucs_x86_memcpy_sse_movntdqa(void *dest, const void *source, size_t len)
{
#ifdef __SSE4_1__
    const char *src    = source;
    char *dst          = dest;
    uintptr_t misalign = (uintptr_t)src & 15;
    __m128i tmp[4];
    size_t head;

    /* Nothing to copy: return before any load is issued on the source */
    if (len == 0) {
        return dest;
    }

    /*
     * Leading bytes up to the first 16-byte boundary of the source: load the
     * aligned block that contains them and copy out only the requested part.
     */
    if (misalign != 0) {
        head   = ucs_min(len, 16 - misalign);
        tmp[0] = ucs_x86_nt_load128((uintptr_t)src - misalign);
        memcpy(dst, (const char *)&tmp[0] + misalign, head);

        src += head;
        dst += head;
        len -= head;
    }

    /*
     * From here on src is 16-byte aligned whenever len is still non-zero.
     * Main loop: 64 bytes per iteration, all four loads issued before the
     * stores.
     */
    while (len >= 64) {
        tmp[0] = ucs_x86_nt_load128(src +  0);
        tmp[1] = ucs_x86_nt_load128(src + 16);
        tmp[2] = ucs_x86_nt_load128(src + 32);
        tmp[3] = ucs_x86_nt_load128(src + 48);

        ucs_x86_storeu128(dst +  0, tmp[0]);
        ucs_x86_storeu128(dst + 16, tmp[1]);
        ucs_x86_storeu128(dst + 32, tmp[2]);
        ucs_x86_storeu128(dst + 48, tmp[3]);

        src += 64;
        dst += 64;
        len -= 64;
    }

    /* Remaining whole 16-byte blocks */
    while (len >= 16) {
        ucs_x86_storeu128(dst, ucs_x86_nt_load128(src));

        src += 16;
        dst += 16;
        len -= 16;
    }

    /*
     * Trailing bytes: load the whole aligned block, copy only what was
     * requested.
     */
    if (len != 0) {
        tmp[0] = ucs_x86_nt_load128(src);
        memcpy(dst, &tmp[0], len);
    }

    return dest;
#else
    return memcpy(dest, source, len);
#endif
}

#endif
6 changes: 6 additions & 0 deletions src/ucs/arch/x86_64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE;
ucs_cpu_vendor_t ucs_arch_get_cpu_vendor();
void ucs_cpu_init();
ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
/*
 * Copy len bytes from src to dst and return dst. When built with SSE4.1
 * (__SSE4_1__ defined) the source is read with 16-byte streaming loads;
 * otherwise the implementation falls back to memcpy().
 */
void *ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len);

static inline int ucs_arch_x86_rdtsc_enabled()
{
Expand Down Expand Up @@ -109,6 +110,11 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Non-temporal copy entry point for x86_64; delegates to
 * ucs_x86_memcpy_sse_movntdqa().
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 *
 * @return The value returned by ucs_x86_memcpy_sse_movntdqa().
 */
static inline void *ucs_memcpy_nontemporal(void *dst, const void *src,
                                           size_t len)
{
    return ucs_x86_memcpy_sse_movntdqa(dst, src, len);
}

END_C_DECLS

#endif
Expand Down
13 changes: 8 additions & 5 deletions src/uct/rocm/copy/rocm_copy_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#include <uct/base/uct_log.h>
#include <ucs/debug/memtrack.h>
#include <ucs/type/class.h>
#include <ucs/arch/cpu.h>

/*
 * Copy helpers named after the transfer direction. Both take
 * (destination, source, length).
 *  - memcpy_h2d: used on the put paths (local buffer -> remote_addr),
 *    presumably "host to device"; plain memcpy().
 *  - memcpy_d2h: used on the get paths (remote_addr -> local buffer),
 *    i.e. device to host; goes through ucs_memcpy_nontemporal().
 */
#define memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l))
#define memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l))

static UCS_CLASS_INIT_FUNC(uct_rocm_copy_ep_t, const uct_ep_params_t *params)
{
Expand Down Expand Up @@ -44,9 +48,9 @@ uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep,
}

if (is_put)
memcpy((void *)remote_addr, iov->buffer, size);
memcpy_h2d((void *)remote_addr, iov->buffer, size);
else
memcpy(iov->buffer, (void *)remote_addr, size);
memcpy_d2h(iov->buffer, (void *)remote_addr, size);

return UCS_OK;
}
Expand Down Expand Up @@ -87,7 +91,7 @@ ucs_status_t uct_rocm_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer,
unsigned length, uint64_t remote_addr,
uct_rkey_t rkey)
{
memcpy((void *)remote_addr, buffer, length);
memcpy_h2d((void *)remote_addr, buffer, length);

UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length);
ucs_trace_data("PUT_SHORT size %d from %p to %p",
Expand All @@ -99,8 +103,7 @@ ucs_status_t uct_rocm_copy_ep_get_short(uct_ep_h tl_ep, void *buffer,
unsigned length, uint64_t remote_addr,
uct_rkey_t rkey)
{
/* device to host */
memcpy(buffer, (void *)remote_addr, length);
memcpy_d2h(buffer, (void *)remote_addr, length);

UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length);
ucs_trace_data("GET_SHORT size %d from %p to %p",
Expand Down

0 comments on commit ce7608e

Please sign in to comment.