Fix performance bugs in scalar reductions #509

Merged
4 changes: 2 additions & 2 deletions src/cunumeric/binary/binary_red.cu
@@ -64,8 +64,8 @@ struct BinaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
{
size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<ProdReduction<bool>> result;
auto stream = get_cached_stream();
auto stream = get_cached_stream();
ScalarReductionBuffer<ProdReduction<bool>> result(stream);
if (dense) {
auto in1ptr = in1.ptr(rect);
auto in2ptr = in2.ptr(rect);
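Note: `ScalarReductionBuffer` comes from the newly included `cunumeric/scalar_reduction_buffer.h`, which is not part of this diff. The call sites only show its contract: it is constructed on a CUDA stream, folded into from device code by `reduce_output`, and read back on the host with `read(stream)`. The sketch below is a hypothetical minimal implementation of that contract, written purely for illustration; the `RHS`/`identity` members assumed on the reduction operator and the plain CUDA allocation calls are assumptions, not the actual cunumeric code.

```cpp
// Hypothetical sketch only -- not the contents of scalar_reduction_buffer.h.
// Assumes a Legion-style reduction operator REDUCTION that provides a value
// type RHS, a static identity, and a fold<EXCLUSIVE>(RHS&, RHS) method.
#include <cuda_runtime.h>

template <typename REDUCTION>
class ScalarReductionBuffer {
 public:
  using VAL = typename REDUCTION::RHS;

  explicit ScalarReductionBuffer(cudaStream_t stream)
  {
    // Plain synchronous setup for simplicity; the real class presumably
    // allocates and initializes the accumulator in a stream-ordered fashion.
    (void)stream;
    cudaMalloc(reinterpret_cast<void**>(&ptr_), sizeof(VAL));
    VAL identity = REDUCTION::identity;
    cudaMemcpy(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice);
  }

  // Device side: fold one per-block partial result into the accumulator.
  // The non-exclusive fold is expected to use atomics internally.
  __device__ void operator<<=(const VAL& value) const
  {
    REDUCTION::template fold<false /*exclusive*/>(*ptr_, value);
  }

  // Host side: copy the scalar back and wait only on the owning stream,
  // so no device-wide or task-wide synchronization is needed.
  VAL read(cudaStream_t stream) const
  {
    VAL result;
    cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return result;
  }

 private:
  VAL* ptr_{nullptr};  // never freed in this sketch; a real class would manage lifetime
};
```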
254 changes: 20 additions & 234 deletions src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
#include "core/cuda/stream_pool.h"
#include "cunumeric/arg.h"
#include "cunumeric/arg.inl"
#include "cunumeric/scalar_reduction_buffer.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
@@ -211,71 +212,34 @@ __device__ __forceinline__ T shuffle(unsigned mask, T var, int laneMask, int wid
return var;
}

// Overload for complex
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
complex<T> value)
{
__shared__ complex<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const complex<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle {
static constexpr bool value = true;
};

// Overload for argval
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
Argval<T> value)
{
__shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle<complex<T>> {
static constexpr bool value = false;
};

template <typename T>
struct HasNativeShuffle<Argval<T>> {
static constexpr bool value = false;
};

template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
__device__ __forceinline__ void reduce_output(ScalarReductionBuffer<REDUCTION> result, T value)
{
__shared__ T trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const T shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
T shuffle_value;
if constexpr (HasNativeShuffle<T>::value)
shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
else
shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
@@ -291,184 +255,6 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReductio
}
}

__device__ __forceinline__ void reduce_bool(Legion::DeferredValue<bool> result, int value)
{
__shared__ int trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const int shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
if (shuffle_value == 0) value = 0;
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
if (trampoline[i] == 0) {
value = 0;
break;
}
if (value == 0) {
result = false;
// Make sure the result is visible externally
__threadfence_system();
}
}
}

template <typename T>
__device__ __forceinline__ T load_cached(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// all the input data in as many caches as we can
// Use .ca qualifier to cache at all levels
template <>
__device__ __forceinline__ uint16_t load_cached<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_cached<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_cached<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_cached<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_cached<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_cached<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_cached<float>(const float* ptr)
{
float value;
asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_cached<double>(const double* ptr)
{
double value;
asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_l2(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// data loaded into L2 but no higher in the hierarchy
// Use .cg qualifier to cache at L2
template <>
__device__ __forceinline__ uint16_t load_l2<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_l2<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_l2<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_l2<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_l2<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_l2<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_l2<float>(const float* ptr)
{
float value;
asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_l2<double>(const double* ptr)
{
double value;
asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_streaming(const T* ptr)
{
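The net effect of the cuda_help.h changes: the three nearly identical `reduce_output` overloads (generic, `complex<T>`, `Argval<T>`) collapse into one template, where a `HasNativeShuffle<T>` trait plus `if constexpr` selects `__shfl_xor_sync` for natively shuffleable types and falls back to the hand-written `shuffle` helper otherwise; the `.ca`/`.cg` PTX load helpers are dropped. The fallback helper itself is only partially visible above, so the sketch below shows one common way to shuffle an arbitrary trivially copyable value by moving it one 32-bit word at a time. It illustrates the idea; it is not a copy of the real helper.

```cpp
// Illustration only, not the helper defined in cuda_help.h: a butterfly
// shuffle for trivially copyable types that __shfl_xor_sync cannot handle
// natively (e.g. complex<T> or Argval<T>), done one 32-bit word at a time.
#include <cstring>

template <typename T>
__device__ __forceinline__ T fallback_shuffle(unsigned mask, T var, int laneMask, int width)
{
  static_assert(sizeof(T) % sizeof(int) == 0, "sketch assumes T is a whole number of 32-bit words");
  int words[sizeof(T) / sizeof(int)];
  memcpy(words, &var, sizeof(T));
  for (size_t i = 0; i < sizeof(T) / sizeof(int); ++i)
    words[i] = __shfl_xor_sync(mask, words[i], laneMask, width);
  memcpy(&var, words, sizeof(T));
  return var;
}
```

With a helper of this shape available, the single `reduce_output` template above only needs the trait to decide which shuffle to call per type.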
10 changes: 5 additions & 5 deletions src/cunumeric/index/advanced_indexing.cu
@@ -37,14 +37,14 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const size_t skip_size,
const size_t key_dim)
{
size_t value = 0;
uint64_t value = 0;
for (size_t i = 0; i < iters; i++) {
size_t idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (idx > volume) break;
auto point = pitches.unflatten(idx, origin);
bool val = (index[point] && ((idx + 1) % skip_size == 0));
offsets[idx] = static_cast<int64_t>(val);
SumReduction<size_t>::fold<true>(value, val);
SumReduction<uint64_t>::fold<true>(value, val);
}
// Every thread in the thread block must participate in the exchange to get correct results
reduce_output(out, value);
@@ -90,7 +90,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
const size_t skip_size,
const size_t key_dim) const
{
DeferredReduction<SumReduction<size_t>> size;
ScalarReductionBuffer<SumReduction<uint64_t>> size(stream);

const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

@@ -104,12 +104,12 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
count_nonzero_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
volume, size, offsets, in, pitches, rect.lo, 1, skip_size, key_dim);

cudaStreamSynchronize(stream);
CHECK_CUDA_STREAM(stream);

auto off_ptr = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr);

return size.read();
return size.read(stream);
}

void operator()(Array& out_arr,
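Besides the buffer type, two things change in this task: the accumulator becomes `uint64_t`, and the blocking `cudaStreamSynchronize(stream)` between the kernel and the scan is gone. The kernel and `thrust::exclusive_scan` are issued to the same stream, so they are already ordered, and the count only reaches the host in `size.read(stream)`. The program below is a small, self-contained illustration (not cunumeric code) of the flags-then-exclusive-scan pattern the kernel and the scan implement together: the scan turns 0/1 flags into output positions, while the sum of the flags is the compacted output size.

```cpp
// Self-contained illustration of the flags -> exclusive scan -> offsets pattern.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

int main()
{
  const std::vector<int64_t> host_flags{1, 0, 1, 1, 0, 1};  // 1 = element selected
  thrust::device_vector<int64_t> flags(host_flags.begin(), host_flags.end());
  thrust::device_vector<int64_t> offsets(flags.size());

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // offsets[i] = how many selected elements precede i, i.e. where element i
  // lands in the compacted output when flags[i] == 1.
  thrust::exclusive_scan(
    thrust::cuda::par.on(stream), flags.begin(), flags.end(), offsets.begin());

  // Total number of selected elements; in the task above this value comes from
  // the scalar reduction written by count_nonzero_kernel rather than a second pass.
  const int64_t total =
    thrust::reduce(thrust::cuda::par.on(stream), flags.begin(), flags.end());

  cudaStreamSynchronize(stream);
  std::printf("compacted output size = %lld\n", static_cast<long long>(total));
  return 0;
}
```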
10 changes: 4 additions & 6 deletions src/cunumeric/index/repeat.cu
@@ -35,15 +35,15 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const size_t iters,
Buffer<int64_t> offsets)
{
int64_t value = 0;
uint64_t value = 0;
for (size_t idx = 0; idx < iters; idx++) {
const int64_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (offset < extent) {
auto p = origin;
p[axis] += offset;
auto val = repeats[p];
offsets[offset] = val;
SumReduction<int64_t>::fold<true>(value, val);
SumReduction<uint64_t>::fold<true>(value, val);
}
}
// Every thread in the thread block must participate in the exchange to get correct results
@@ -137,7 +137,7 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
int64_t extent = in_rect.hi[axis] - in_rect.lo[axis] + 1;
auto offsets = create_buffer<int64_t>(Point<1>(extent), Memory::Kind::Z_COPY_MEM);

DeferredReduction<SumReduction<int64_t>> sum;
ScalarReductionBuffer<SumReduction<uint64_t>> sum(stream);
const size_t blocks_count = (extent + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);

@@ -151,10 +151,8 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
}
CHECK_CUDA_STREAM(stream);

cudaStreamSynchronize(stream);

Point<DIM> out_extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
out_extents[axis] = sum.read();
out_extents[axis] = static_cast<Legion::coord_t>(sum.read(stream));

auto out = out_array.create_output_buffer<VAL, DIM>(out_extents, true);

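As in the other tasks, the explicit `cudaStreamSynchronize(stream)` disappears here because `sum.read(stream)` is the only point where the host actually needs the value. For readers unfamiliar with the device-side pattern that `reduce_output` and these buffers rely on, the standalone program below shows the same shape in plain CUDA: a per-block warp-shuffle reduction through a shared-memory trampoline, one atomic fold per block into a scalar in ordinary device memory, and a copy back that waits only on the owning stream. It illustrates the technique; it is not cunumeric code.

```cpp
// Standalone illustration of the scalar-reduction technique used by this PR.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int THREADS_PER_BLOCK = 128;

__global__ void block_sum(const float* in, size_t n, float* out)
{
  __shared__ float trampoline[THREADS_PER_BLOCK / 32];
  const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  float value = (idx < n) ? in[idx] : 0.0f;

  const int laneid = threadIdx.x & 0x1f;
  const int warpid = threadIdx.x >> 5;
  // Butterfly reduction across the warp.
  for (int i = 16; i >= 1; i /= 2) value += __shfl_xor_sync(0xffffffff, value, i, 32);
  if (laneid == 0) trampoline[warpid] = value;
  __syncthreads();
  // The first thread combines the per-warp results and folds once per block.
  if (threadIdx.x == 0) {
    for (int i = 1; i < THREADS_PER_BLOCK / 32; ++i) value += trampoline[i];
    atomicAdd(out, value);
  }
}

int main()
{
  const size_t n = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *in, *out;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, sizeof(float));
  cudaMemsetAsync(in, 0, n * sizeof(float), stream);  // all zeros, so the sum is 0
  cudaMemsetAsync(out, 0, sizeof(float), stream);     // identity for a sum reduction

  const size_t blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  block_sum<<<static_cast<unsigned>(blocks), THREADS_PER_BLOCK, 0, stream>>>(in, n, out);

  float result;
  cudaMemcpyAsync(&result, out, sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // wait only on this stream, not the whole device
  std::printf("sum = %f\n", result);
  return 0;
}
```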
2 changes: 1 addition & 1 deletion src/cunumeric/matrix/dot.cu
@@ -61,7 +61,7 @@ struct DotImplBody<VariantKind::GPU, CODE> {

const auto volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<SumReduction<ACC>> result;
ScalarReductionBuffer<SumReduction<ACC>> result(stream);
size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(ACC);

if (blocks >= MAX_REDUCTION_CTAS) {
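`MAX_REDUCTION_CTAS` caps the grid size for these reduction kernels; when the capped grid cannot cover the volume in one pass, each thread loops `iters` times using the `(i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x` indexing visible in `count_nonzero_kernel` and `count_repeats_kernel` above. The arithmetic is easy to get wrong, so here it is in isolation; the constant values are assumptions for illustration, not the ones defined in cuda_help.h.

```cpp
// Launch-size arithmetic for a capped-grid reduction, shown in isolation.
#include <cstddef>
#include <cstdio>

int main()
{
  constexpr size_t THREADS_PER_BLOCK  = 128;
  constexpr size_t MAX_REDUCTION_CTAS = 1024;  // assumed value; the real constant is in cuda_help.h
  const size_t volume = 10'000'000;

  const size_t blocks_needed = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  // When the grid would be too large, cap it and loop `iters` times per thread;
  // pass i of that loop touches index (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x,
  // so grid * iters * THREADS_PER_BLOCK always covers the full volume.
  const size_t grid  = blocks_needed >= MAX_REDUCTION_CTAS ? MAX_REDUCTION_CTAS : blocks_needed;
  const size_t iters = (blocks_needed + grid - 1) / grid;

  std::printf("volume=%zu blocks_needed=%zu grid=%zu iters=%zu\n",
              volume, blocks_needed, grid, iters);
  return 0;
}
```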