Fix performance bugs in scalar reductions (#509) (#543)
* Unify the template for device reduction tree and do some cleanup

* Fix performance bugs in scalar reduction kernels:

* Use unsigned 64-bit integers instead of signed integers wherever
  possible; CUDA hasn't added an atomic intrinsic for the latter yet
  (see the sketch below the commit message).

* Move reduction buffers from zero-copy memory to framebuffer. This
  makes the slow atomic update code path in reduction operators
  run much more efficiently.

* Use the new scalar reduction buffer in binary reductions as well

* Use only the RHS type in the reduction buffer as we never call apply

* Minor cleanup per review

* Rename the buffer class and method to make the intent explicit

* Flip the polarity of reduce's template parameter

Co-authored-by: Wonchan Lee <wonchanl@nvidia.com>
marcinz and magnatelee authored Aug 17, 2022
1 parent ffb37b4 commit 2959f0a
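The first performance fix in the commit message leans on a general CUDA limitation: the hardware atomics (atomicAdd, atomicCAS, ...) have native overloads for unsigned long long but none for signed long long, so signed 64-bit updates must be emulated with a compare-and-swap retry loop. The sketch below is purely illustrative and is not taken from this commit; the kernel names, launch shape, and the add-one update are hypothetical, chosen only to contrast the two code paths.

// Illustrative sketch only (not from this commit): why unsigned 64-bit
// counters can use a native atomic while signed 64-bit ones need a CAS loop.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void count_native(unsigned long long* counter)
{
  // atomicAdd has a native unsigned long long overload.
  atomicAdd(counter, 1ULL);
}

__global__ void count_emulated(long long* counter)
{
  // There is no atomicAdd overload for signed long long, so the update is
  // emulated with atomicCAS on the unsigned bit pattern -- a retry loop.
  unsigned long long* bits = reinterpret_cast<unsigned long long*>(counter);
  unsigned long long old  = *bits, assumed;
  do {
    assumed        = old;
    long long next = static_cast<long long>(assumed) + 1;
    old            = atomicCAS(bits, assumed, static_cast<unsigned long long>(next));
  } while (old != assumed);
}

int main()
{
  unsigned long long* u = nullptr;
  long long* s          = nullptr;
  cudaMalloc(&u, sizeof(*u));
  cudaMalloc(&s, sizeof(*s));
  cudaMemset(u, 0, sizeof(*u));
  cudaMemset(s, 0, sizeof(*s));
  count_native<<<64, 256>>>(u);
  count_emulated<<<64, 256>>>(s);
  unsigned long long hu = 0;
  long long hs          = 0;
  cudaMemcpy(&hu, u, sizeof(hu), cudaMemcpyDeviceToHost);
  cudaMemcpy(&hs, s, sizeof(hs), cudaMemcpyDeviceToHost);
  printf("native: %llu  emulated: %lld\n", hu, hs);
  cudaFree(u);
  cudaFree(s);
  return 0;
}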
Showing 8 changed files with 103 additions and 262 deletions.
8 changes: 4 additions & 4 deletions src/cunumeric/binary/binary_red.cu
@@ -29,7 +29,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
if (!func(in1[idx], in2[idx])) out <<= false;
if (!func(in1[idx], in2[idx])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Function, typename RES, typename ReadAcc, typename Pitches, typename Rect>
@@ -39,7 +39,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) gen
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto point = pitches.unflatten(idx, rect.lo);
if (!func(in1[point], in2[point])) out <<= false;
if (!func(in1[point], in2[point])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Buffer, typename RedAcc>
@@ -64,8 +64,8 @@ struct BinaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
{
size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<ProdReduction<bool>> result;
auto stream = get_cached_stream();
auto stream = get_cached_stream();
DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);
if (dense) {
auto in1ptr = in1.ptr(rect);
auto in2ptr = in2.ptr(rect);
257 changes: 22 additions & 235 deletions src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
#include "core/cuda/stream_pool.h"
#include "cunumeric/arg.h"
#include "cunumeric/arg.inl"
#include "cunumeric/device_scalar_reduction_buffer.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
@@ -211,71 +212,35 @@ __device__ __forceinline__ T shuffle(unsigned mask, T var, int laneMask, int wid
return var;
}

// Overload for complex
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
complex<T> value)
{
__shared__ complex<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const complex<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle {
static constexpr bool value = true;
};

// Overload for argval
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
Argval<T> value)
{
__shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle<complex<T>> {
static constexpr bool value = false;
};

template <typename T>
struct HasNativeShuffle<Argval<T>> {
static constexpr bool value = false;
};

template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
__device__ __forceinline__ void reduce_output(DeviceScalarReductionBuffer<REDUCTION> result,
T value)
{
__shared__ T trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const T shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
T shuffle_value;
if constexpr (HasNativeShuffle<T>::value)
shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
else
shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
@@ -285,190 +250,12 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReductio
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
result.reduce<false /*EXCLUSIVE*/>(value);
// Make sure the result is visible externally
__threadfence_system();
}
}

__device__ __forceinline__ void reduce_bool(Legion::DeferredValue<bool> result, int value)
{
__shared__ int trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const int shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
if (shuffle_value == 0) value = 0;
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
if (trampoline[i] == 0) {
value = 0;
break;
}
if (value == 0) {
result = false;
// Make sure the result is visible externally
__threadfence_system();
}
}
}

template <typename T>
__device__ __forceinline__ T load_cached(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// all the input data in as many caches as we can
// Use .ca qualifier to cache at all levels
template <>
__device__ __forceinline__ uint16_t load_cached<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_cached<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_cached<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_cached<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_cached<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_cached<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_cached<float>(const float* ptr)
{
float value;
asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_cached<double>(const double* ptr)
{
double value;
asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_l2(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// data loaded into L2 but no higher in the hierarchy
// Use .cg qualifier to cache at L2
template <>
__device__ __forceinline__ uint16_t load_l2<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_l2<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_l2<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_l2<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_l2<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_l2<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_l2<float>(const float* ptr)
{
float value;
asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_l2<double>(const double* ptr)
{
double value;
asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_streaming(const T* ptr)
{
59 changes: 59 additions & 0 deletions src/cunumeric/device_scalar_reduction_buffer.h
@@ -0,0 +1,59 @@
/* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#pragma once

#include "core/cuda/cuda_help.h"
#include "core/data/buffer.h"

namespace cunumeric {

template <typename REDOP>
class DeviceScalarReductionBuffer {
private:
using VAL = typename REDOP::RHS;

public:
DeviceScalarReductionBuffer(cudaStream_t stream)
: buffer_(legate::create_buffer<VAL>(1, Legion::Memory::Kind::GPU_FB_MEM))
{
VAL identity{REDOP::identity};
ptr_ = buffer_.ptr(0);
CHECK_CUDA(cudaMemcpyAsync(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice, stream));
}

template <bool EXCLUSIVE>
__device__ void reduce(const VAL& value) const
{
REDOP::template fold<EXCLUSIVE /*exclusive*/>(*ptr_, value);
}

__host__ VAL read(cudaStream_t stream) const
{
VAL result{REDOP::identity};
CHECK_CUDA(cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA(cudaStreamSynchronize(stream));
return result;
}

__device__ VAL read() const { return *ptr_; }

private:
legate::Buffer<VAL> buffer_;
VAL* ptr_;
};

} // namespace cunumeric
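For reference, a minimal usage sketch of the new class, modeled on the binary_red.cu change above: the host constructs the buffer on a stream (which places it in framebuffer memory initialized to the reduction identity), device threads fold into it with the non-exclusive reduce(), and the host reads the scalar back with read(stream). The kernel name, the host wrapper, and the int32_t input are hypothetical; only the DeviceScalarReductionBuffer API and the ProdReduction<bool> fold (used in binary_red.cu above) come from this commit.

// Hypothetical example, not part of the commit: an "all elements non-zero"
// check written in the style of binary_red.cu.
#include "cunumeric/cuda_help.h"  // also pulls in device_scalar_reduction_buffer.h
#include <cstdint>

using namespace cunumeric;

static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
  check_all_nonzero(DeviceScalarReductionBuffer<ProdReduction<bool>> out,
                    const int32_t* in, size_t volume)
{
  const size_t idx = global_tid_1d();
  if (idx >= volume) return;
  // Non-exclusive fold: many threads may update the same framebuffer slot,
  // so this takes the atomic (EXCLUSIVE == false) code path.
  if (in[idx] == 0) out.reduce<false /*EXCLUSIVE*/>(false);
}

bool all_nonzero(const int32_t* in, size_t volume)
{
  auto stream = get_cached_stream();
  // Buffer lives in GPU framebuffer memory, starting at ProdReduction's identity (true).
  DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);
  const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  check_all_nonzero<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(result, in, volume);
  // Copies the scalar back to the host and synchronizes the stream.
  return result.read(stream);
}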
