Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/ROCM/COPY: Use faster memcpy for device to host copies #4532

Merged
merged 1 commit into from
Dec 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/ucs/arch/aarch64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Copy @a len bytes from @a src to @a dst.
 *
 * In this header the routine is implemented as a plain memcpy(); the
 * memcpy() return value is intentionally discarded.
 *
 * @param [out] dst  Destination buffer.
 * @param [in]  src  Source buffer.
 * @param [in]  len  Number of bytes to copy.
 */
static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
return UCS_ERR_UNSUPPORTED;
Expand Down
6 changes: 6 additions & 0 deletions src/ucs/arch/ppc64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Copy @a len bytes from @a src to @a dst.
 *
 * In this header the routine is implemented as a plain memcpy(); the
 * memcpy() return value is intentionally discarded.
 *
 * @param [out] dst  Destination buffer.
 * @param [in]  src  Source buffer.
 * @param [in]  len  Number of bytes to copy.
 */
static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
return UCS_ERR_UNSUPPORTED;
Expand Down
62 changes: 62 additions & 0 deletions src/ucs/arch/x86_64/cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
#define X86_CPU_CACHE_TAG_L1_ONLY 0x40
#define X86_CPU_CACHE_TAG_LEAF4 0xff

#if defined (__SSE4_1__)
/*
 * Short-hands used by ucs_x86_memcpy_sse_movntdqa(); only available when the
 * compiler targets SSE4.1.
 *
 * _mm_load:  16-byte streaming (non-temporal) load from address 'a'.
 *            NOTE(review): per the Intel intrinsics documentation,
 *            _mm_stream_load_si128 expects 'a' to be 16-byte aligned.
 */
#define _mm_load(a) _mm_stream_load_si128((__m128i *) (a))
/* _mm_store: 16-byte store of 'v' to address 'a'; the unaligned store
 *            intrinsic is used, so 'a' does not have to be aligned. */
#define _mm_store(a,v) _mm_storeu_si128((__m128i *) (a), (v))
#endif


typedef enum ucs_x86_cpu_cache_type {
X86_CPU_CACHE_TYPE_DATA = 1,
Expand Down Expand Up @@ -586,4 +591,61 @@ ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
return cache_count == UCS_CPU_CACHE_LAST ? UCS_OK : UCS_ERR_UNSUPPORTED;
}

/**
 * Copy @a len bytes from @a src to @a dst.
 *
 * When the compiler targets SSE4.1, the source is read in 16-byte units
 * through the _mm_load() macro and the destination is written through the
 * _mm_store() macro (or memcpy() for partial units). Otherwise the routine
 * is a plain memcpy().
 *
 * The SSE4.1 path works in four stages:
 *  1. If @a src is not 16-byte aligned, load the 16-byte unit that contains
 *     it and copy out only the requested bytes, which brings @a src to a
 *     16-byte boundary (or exhausts @a len).
 *  2. Copy 64 bytes per iteration: four loads followed by four stores.
 *  3. Copy 16 bytes per iteration.
 *  4. Load one more 16-byte unit and copy the remaining (< 16) bytes.
 *
 * Note that stages 1 and 4 load a full 16-byte unit from the source even
 * though fewer bytes are handed to the destination.
 *
 * @param [out] dst  Destination buffer; no alignment is assumed.
 * @param [in]  src  Source buffer.
 * @param [in]  len  Number of bytes to copy.
 */
void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len)
{
#if defined (__SSE4_1__)
    uintptr_t head_offset = (uintptr_t)src & 15;
    uintptr_t head_len;
    __m128i *in;
    __m128i *out;
    __m128i quad[4];
    __m128i unit;

    /* Stage 1: leading bytes up to the next 16-byte boundary of src */
    if (head_offset != 0) {
        head_len = ucs_min(len, 16 - head_offset);

        /* Load the aligned unit that contains src, then pick the bytes
         * starting at src's position inside that unit */
        unit = _mm_load((uintptr_t)src - head_offset);
        memcpy(dst, UCS_PTR_BYTE_OFFSET(&unit, head_offset), head_len);

        src  = UCS_PTR_BYTE_OFFSET(src, head_len);
        dst  = UCS_PTR_BYTE_OFFSET(dst, head_len);
        len -= head_len;
    }

    /* Stage 2: 64-byte chunks - issue all four loads before the stores */
    for (; len >= 64; len -= 64) {
        in  = (__m128i *)src;
        out = (__m128i *)dst;

        quad[0] = _mm_load(in + 0);
        quad[1] = _mm_load(in + 1);
        quad[2] = _mm_load(in + 2);
        quad[3] = _mm_load(in + 3);

        _mm_store(out + 0, quad[0]);
        _mm_store(out + 1, quad[1]);
        _mm_store(out + 2, quad[2]);
        _mm_store(out + 3, quad[3]);

        src = UCS_PTR_BYTE_OFFSET(src, 64);
        dst = UCS_PTR_BYTE_OFFSET(dst, 64);
    }

    /* Stage 3: 16-byte chunks */
    for (; len >= 16; len -= 16) {
        _mm_store(dst, _mm_load(src));

        src = UCS_PTR_BYTE_OFFSET(src, 16);
        dst = UCS_PTR_BYTE_OFFSET(dst, 16);
    }

    /* Stage 4: trailing bytes - load a whole unit, copy only what is left */
    if (len != 0) {
        unit = _mm_load(src);
        memcpy(dst, &unit, len);
    }
#else
    memcpy(dst, src, len);
#endif
}

#endif
7 changes: 7 additions & 0 deletions src/ucs/arch/x86_64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ ucs_cpu_flag_t ucs_arch_get_cpu_flag() UCS_F_NOOPTIMIZE;
ucs_cpu_vendor_t ucs_arch_get_cpu_vendor();
void ucs_cpu_init();
ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes);
void ucs_x86_memcpy_sse_movntdqa(void *dst, const void *src, size_t len);

static inline int ucs_arch_x86_rdtsc_enabled()
{
Expand Down Expand Up @@ -109,6 +110,12 @@ static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
return memcpy(dst, src, len);
}

/**
 * Copy @a len bytes from @a src to @a dst.
 *
 * In this header the copy is delegated to ucs_x86_memcpy_sse_movntdqa().
 *
 * @param [out] dst  Destination buffer.
 * @param [in]  src  Source buffer.
 * @param [in]  len  Number of bytes to copy.
 */
static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
ucs_x86_memcpy_sse_movntdqa(dst, src, len);
}

END_C_DECLS

#endif
Expand Down
13 changes: 8 additions & 5 deletions src/uct/rocm/copy/rocm_copy_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#include <uct/base/uct_log.h>
#include <ucs/debug/memtrack.h>
#include <ucs/type/class.h>
#include <ucs/arch/cpu.h>

/*
 * Copy helpers for this endpoint, named by direction:
 *  - uct_rocm_memcpy_h2d: used on the put paths (destination is remote_addr);
 *    a plain memcpy().
 *  - uct_rocm_memcpy_d2h: used on the get paths (source is remote_addr);
 *    goes through ucs_memcpy_nontemporal() from <ucs/arch/cpu.h>.
 * Arguments are (destination, source, length in bytes).
 */
#define uct_rocm_memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l))
#define uct_rocm_memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l))

static UCS_CLASS_INIT_FUNC(uct_rocm_copy_ep_t, const uct_ep_params_t *params)
{
Expand Down Expand Up @@ -44,9 +48,9 @@ uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep,
}

if (is_put)
memcpy((void *)remote_addr, iov->buffer, size);
uct_rocm_memcpy_h2d((void *)remote_addr, iov->buffer, size);
else
memcpy(iov->buffer, (void *)remote_addr, size);
uct_rocm_memcpy_d2h(iov->buffer, (void *)remote_addr, size);

return UCS_OK;
}
Expand Down Expand Up @@ -87,7 +91,7 @@ ucs_status_t uct_rocm_copy_ep_put_short(uct_ep_h tl_ep, const void *buffer,
unsigned length, uint64_t remote_addr,
uct_rkey_t rkey)
{
memcpy((void *)remote_addr, buffer, length);
uct_rocm_memcpy_h2d((void *)remote_addr, buffer, length);

UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, SHORT, length);
ucs_trace_data("PUT_SHORT size %d from %p to %p",
Expand All @@ -99,8 +103,7 @@ ucs_status_t uct_rocm_copy_ep_get_short(uct_ep_h tl_ep, void *buffer,
unsigned length, uint64_t remote_addr,
uct_rkey_t rkey)
{
/* device to host */
memcpy(buffer, (void *)remote_addr, length);
uct_rocm_memcpy_d2h(buffer, (void *)remote_addr, length);

UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, SHORT, length);
ucs_trace_data("GET_SHORT size %d from %p to %p",
Expand Down