Fix performance bugs in scalar reductions (#509) (#543)
* Unify the template for device reduction tree and do some cleanup

* Fix performance bugs in scalar reduction kernels:

* Use unsigned 64-bit integers instead of signed integers wherever
  possible; CUDA hasn't added an atomic intrinsic for the latter yet
  (see the sketch below the commit message).

* Move reduction buffers from zero-copy memory to framebuffer. This
  makes the slow atomic update code path in reduction operators
  run much more efficiently.

* Use the new scalar reduction buffer in binary reductions as well

* Use only the RHS type in the reduction buffer as we never call apply

* Minor cleanup per review

* Rename the buffer class and method to make the intent explicit

* Flip the polarity of reduce's template parameter

Co-authored-by: Wonchan Lee <wonchanl@nvidia.com>
marcinz and magnatelee authored Aug 17, 2022
1 parent ffb37b4 commit 2959f0a
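The first performance fix in the commit message leans on a general CUDA limitation: the hardware atomics (atomicAdd, atomicCAS, ...) have native overloads for unsigned long long but none for signed long long, so signed 64-bit updates must be emulated with a compare-and-swap retry loop. The sketch below is purely illustrative and is not taken from this commit; the kernel names, launch shape, and the add-one update are hypothetical, chosen only to contrast the two code paths.

// Illustrative sketch only (not from this commit): why unsigned 64-bit
// counters can use a native atomic while signed 64-bit ones need a CAS loop.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void count_native(unsigned long long* counter)
{
  // atomicAdd has a native unsigned long long overload.
  atomicAdd(counter, 1ULL);
}

__global__ void count_emulated(long long* counter)
{
  // There is no atomicAdd overload for signed long long, so the update is
  // emulated with atomicCAS on the unsigned bit pattern -- a retry loop.
  unsigned long long* bits = reinterpret_cast<unsigned long long*>(counter);
  unsigned long long old  = *bits, assumed;
  do {
    assumed        = old;
    long long next = static_cast<long long>(assumed) + 1;
    old            = atomicCAS(bits, assumed, static_cast<unsigned long long>(next));
  } while (old != assumed);
}

int main()
{
  unsigned long long* u = nullptr;
  long long* s          = nullptr;
  cudaMalloc(&u, sizeof(*u));
  cudaMalloc(&s, sizeof(*s));
  cudaMemset(u, 0, sizeof(*u));
  cudaMemset(s, 0, sizeof(*s));
  count_native<<<64, 256>>>(u);
  count_emulated<<<64, 256>>>(s);
  unsigned long long hu = 0;
  long long hs          = 0;
  cudaMemcpy(&hu, u, sizeof(hu), cudaMemcpyDeviceToHost);
  cudaMemcpy(&hs, s, sizeof(hs), cudaMemcpyDeviceToHost);
  printf("native: %llu  emulated: %lld\n", hu, hs);
  cudaFree(u);
  cudaFree(s);
  return 0;
}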
Showing 8 changed files with 103 additions and 262 deletions.
8 changes: 4 additions & 4 deletions src/cunumeric/binary/binary_red.cu
@@ -29,7 +29,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
if (!func(in1[idx], in2[idx])) out <<= false;
if (!func(in1[idx], in2[idx])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Function, typename RES, typename ReadAcc, typename Pitches, typename Rect>
@@ -39,7 +39,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) gen
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto point = pitches.unflatten(idx, rect.lo);
if (!func(in1[point], in2[point])) out <<= false;
if (!func(in1[point], in2[point])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Buffer, typename RedAcc>
@@ -64,8 +64,8 @@ struct BinaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
{
size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<ProdReduction<bool>> result;
auto stream = get_cached_stream();
auto stream = get_cached_stream();
DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);
if (dense) {
auto in1ptr = in1.ptr(rect);
auto in2ptr = in2.ptr(rect);
257 changes: 22 additions & 235 deletions src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
#include "core/cuda/stream_pool.h"
#include "cunumeric/arg.h"
#include "cunumeric/arg.inl"
#include "cunumeric/device_scalar_reduction_buffer.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
@@ -211,71 +212,35 @@ __device__ __forceinline__ T shuffle(unsigned mask, T var, int laneMask, int wid
return var;
}

// Overload for complex
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
complex<T> value)
{
__shared__ complex<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const complex<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle {
static constexpr bool value = true;
};

// Overload for argval
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
Argval<T> value)
{
__shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle<complex<T>> {
static constexpr bool value = false;
};

template <typename T>
struct HasNativeShuffle<Argval<T>> {
static constexpr bool value = false;
};

template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
__device__ __forceinline__ void reduce_output(DeviceScalarReductionBuffer<REDUCTION> result,
T value)
{
__shared__ T trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const T shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
T shuffle_value;
if constexpr (HasNativeShuffle<T>::value)
shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
else
shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
@@ -285,190 +250,12 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReductio
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
result.reduce<false /*EXCLUSIVE*/>(value);
// Make sure the result is visible externally
__threadfence_system();
}
}

__device__ __forceinline__ void reduce_bool(Legion::DeferredValue<bool> result, int value)
{
__shared__ int trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const int shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
if (shuffle_value == 0) value = 0;
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
if (trampoline[i] == 0) {
value = 0;
break;
}
if (value == 0) {
result = false;
// Make sure the result is visible externally
__threadfence_system();
}
}
}

template <typename T>
__device__ __forceinline__ T load_cached(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// all the input data in as many caches as we can
// Use .ca qualifier to cache at all levels
template <>
__device__ __forceinline__ uint16_t load_cached<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_cached<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_cached<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_cached<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_cached<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_cached<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_cached<float>(const float* ptr)
{
float value;
asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_cached<double>(const double* ptr)
{
double value;
asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_l2(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// data loaded into L2 but no higher in the hierarchy
// Use .cg qualifier to cache at L2
template <>
__device__ __forceinline__ uint16_t load_l2<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_l2<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_l2<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_l2<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_l2<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_l2<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_l2<float>(const float* ptr)
{
float value;
asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_l2<double>(const double* ptr)
{
double value;
asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_streaming(const T* ptr)
{
59 changes: 59 additions & 0 deletions src/cunumeric/device_scalar_reduction_buffer.h
@@ -0,0 +1,59 @@
/* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#pragma once

#include "core/cuda/cuda_help.h"
#include "core/data/buffer.h"

namespace cunumeric {

template <typename REDOP>
class DeviceScalarReductionBuffer {
private:
using VAL = typename REDOP::RHS;

public:
DeviceScalarReductionBuffer(cudaStream_t stream)
: buffer_(legate::create_buffer<VAL>(1, Legion::Memory::Kind::GPU_FB_MEM))
{
VAL identity{REDOP::identity};
ptr_ = buffer_.ptr(0);
CHECK_CUDA(cudaMemcpyAsync(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice, stream));
}

template <bool EXCLUSIVE>
__device__ void reduce(const VAL& value) const
{
REDOP::template fold<EXCLUSIVE /*exclusive*/>(*ptr_, value);
}

__host__ VAL read(cudaStream_t stream) const
{
VAL result{REDOP::identity};
CHECK_CUDA(cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA(cudaStreamSynchronize(stream));
return result;
}

__device__ VAL read() const { return *ptr_; }

private:
legate::Buffer<VAL> buffer_;
VAL* ptr_;
};

} // namespace cunumeric
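For reference, a minimal usage sketch of the new class, modeled on the binary_red.cu change above: the host constructs the buffer on a stream (which places it in framebuffer memory initialized to the reduction identity), device threads fold into it with the non-exclusive reduce(), and the host reads the scalar back with read(stream). The kernel name, the host wrapper, and the int32_t input are hypothetical; only the DeviceScalarReductionBuffer API and the ProdReduction<bool> fold (used in binary_red.cu above) come from this commit.

// Hypothetical example, not part of the commit: an "all elements non-zero"
// check written in the style of binary_red.cu.
#include "cunumeric/cuda_help.h"  // also pulls in device_scalar_reduction_buffer.h
#include <cstdint>

using namespace cunumeric;

static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
  check_all_nonzero(DeviceScalarReductionBuffer<ProdReduction<bool>> out,
                    const int32_t* in, size_t volume)
{
  const size_t idx = global_tid_1d();
  if (idx >= volume) return;
  // Non-exclusive fold: many threads may update the same framebuffer slot,
  // so this takes the atomic (EXCLUSIVE == false) code path.
  if (in[idx] == 0) out.reduce<false /*EXCLUSIVE*/>(false);
}

bool all_nonzero(const int32_t* in, size_t volume)
{
  auto stream = get_cached_stream();
  // Buffer lives in GPU framebuffer memory, starting at ProdReduction's identity (true).
  DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);
  const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  check_all_nonzero<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(result, in, volume);
  // Copies the scalar back to the host and synchronizes the stream.
  return result.read(stream);
}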
