Fix performance bugs in scalar reductions #509

Merged
4 changes: 2 additions & 2 deletions src/cunumeric/binary/binary_red.cu
@@ -64,8 +64,8 @@ struct BinaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
{
size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<ProdReduction<bool>> result;
auto stream = get_cached_stream();
auto stream = get_cached_stream();
ScalarReductionBuffer<ProdReduction<bool>> result(stream);
if (dense) {
auto in1ptr = in1.ptr(rect);
auto in2ptr = in2.ptr(rect);
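Note: `ScalarReductionBuffer` comes from the newly included `cunumeric/scalar_reduction_buffer.h`, which is not part of this diff. The call sites only show its contract: it is constructed on a CUDA stream, folded into from device code by `reduce_output`, and read back on the host with `read(stream)`. The sketch below is a hypothetical minimal implementation of that contract, written purely for illustration; the `RHS`/`identity` members assumed on the reduction operator and the plain CUDA allocation calls are assumptions, not the actual cunumeric code.

```cpp
// Hypothetical sketch only -- not the contents of scalar_reduction_buffer.h.
// Assumes a Legion-style reduction operator REDUCTION that provides a value
// type RHS, a static identity, and a fold<EXCLUSIVE>(RHS&, RHS) method.
#include <cuda_runtime.h>

template <typename REDUCTION>
class ScalarReductionBuffer {
 public:
  using VAL = typename REDUCTION::RHS;

  explicit ScalarReductionBuffer(cudaStream_t stream)
  {
    // Plain synchronous setup for simplicity; the real class presumably
    // allocates and initializes the accumulator in a stream-ordered fashion.
    (void)stream;
    cudaMalloc(reinterpret_cast<void**>(&ptr_), sizeof(VAL));
    VAL identity = REDUCTION::identity;
    cudaMemcpy(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice);
  }

  // Device side: fold one per-block partial result into the accumulator.
  // The non-exclusive fold is expected to use atomics internally.
  __device__ void operator<<=(const VAL& value) const
  {
    REDUCTION::template fold<false /*exclusive*/>(*ptr_, value);
  }

  // Host side: copy the scalar back and wait only on the owning stream,
  // so no device-wide or task-wide synchronization is needed.
  VAL read(cudaStream_t stream) const
  {
    VAL result;
    cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return result;
  }

 private:
  VAL* ptr_{nullptr};  // never freed in this sketch; a real class would manage lifetime
};
```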
254 changes: 20 additions & 234 deletions src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
#include "core/cuda/stream_pool.h"
#include "cunumeric/arg.h"
#include "cunumeric/arg.inl"
#include "cunumeric/scalar_reduction_buffer.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
@@ -211,71 +212,34 @@ __device__ __forceinline__ T shuffle(unsigned mask, T var, int laneMask, int wid
return var;
}

// Overload for complex
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
complex<T> value)
{
__shared__ complex<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const complex<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle {
static constexpr bool value = true;
};

// Overload for argval
// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
// version
template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
Argval<T> value)
{
__shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
result <<= value;
// Make sure the result is visible externally
__threadfence_system();
}
}
template <typename T>
struct HasNativeShuffle<complex<T>> {
static constexpr bool value = false;
};

template <typename T>
struct HasNativeShuffle<Argval<T>> {
static constexpr bool value = false;
};

template <typename T, typename REDUCTION>
__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
__device__ __forceinline__ void reduce_output(ScalarReductionBuffer<REDUCTION> result, T value)
{
__shared__ T trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const T shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
T shuffle_value;
if constexpr (HasNativeShuffle<T>::value)
shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
else
shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
@@ -291,184 +255,6 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReductio
}
}

__device__ __forceinline__ void reduce_bool(Legion::DeferredValue<bool> result, int value)
{
__shared__ int trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
const int shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
if (shuffle_value == 0) value = 0;
}
// Write warp values into shared memory
if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
__syncthreads();
// Output reduction
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
if (trampoline[i] == 0) {
value = 0;
break;
}
if (value == 0) {
result = false;
// Make sure the result is visible externally
__threadfence_system();
}
}
}

template <typename T>
__device__ __forceinline__ T load_cached(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// all the input data in as many caches as we can
// Use .ca qualifier to cache at all levels
template <>
__device__ __forceinline__ uint16_t load_cached<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_cached<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_cached<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_cached<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_cached<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_cached<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_cached<float>(const float* ptr)
{
float value;
asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_cached<double>(const double* ptr)
{
double value;
asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_l2(const T* ptr)
{
return *ptr;
}

// Specializations to use PTX cache qualifiers to keep
// data loaded into L2 but no higher in the hierarchy
// Use .cg qualifier to cache at L2
template <>
__device__ __forceinline__ uint16_t load_l2<uint16_t>(const uint16_t* ptr)
{
uint16_t value;
asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint32_t load_l2<uint32_t>(const uint32_t* ptr)
{
uint32_t value;
asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ uint64_t load_l2<uint64_t>(const uint64_t* ptr)
{
uint64_t value;
asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int16_t load_l2<int16_t>(const int16_t* ptr)
{
int16_t value;
asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int32_t load_l2<int32_t>(const int32_t* ptr)
{
int32_t value;
asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ int64_t load_l2<int64_t>(const int64_t* ptr)
{
int64_t value;
asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
return value;
}

// No half because inline ptx is dumb about the type

template <>
__device__ __forceinline__ float load_l2<float>(const float* ptr)
{
float value;
asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
return value;
}

template <>
__device__ __forceinline__ double load_l2<double>(const double* ptr)
{
double value;
asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
return value;
}

template <typename T>
__device__ __forceinline__ T load_streaming(const T* ptr)
{
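The net effect of the cuda_help.h changes: the three nearly identical `reduce_output` overloads (generic, `complex<T>`, `Argval<T>`) collapse into one template, where a `HasNativeShuffle<T>` trait plus `if constexpr` selects `__shfl_xor_sync` for natively shuffleable types and falls back to the hand-written `shuffle` helper otherwise; the `.ca`/`.cg` PTX load helpers are dropped. The fallback helper itself is only partially visible above, so the sketch below shows one common way to shuffle an arbitrary trivially copyable value by moving it one 32-bit word at a time. It illustrates the idea; it is not a copy of the real helper.

```cpp
// Illustration only, not the helper defined in cuda_help.h: a butterfly
// shuffle for trivially copyable types that __shfl_xor_sync cannot handle
// natively (e.g. complex<T> or Argval<T>), done one 32-bit word at a time.
#include <cstring>

template <typename T>
__device__ __forceinline__ T fallback_shuffle(unsigned mask, T var, int laneMask, int width)
{
  static_assert(sizeof(T) % sizeof(int) == 0, "sketch assumes T is a whole number of 32-bit words");
  int words[sizeof(T) / sizeof(int)];
  memcpy(words, &var, sizeof(T));
  for (size_t i = 0; i < sizeof(T) / sizeof(int); ++i)
    words[i] = __shfl_xor_sync(mask, words[i], laneMask, width);
  memcpy(&var, words, sizeof(T));
  return var;
}
```

With a helper of this shape available, the single `reduce_output` template above only needs the trait to decide which shuffle to call per type.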
10 changes: 5 additions & 5 deletions src/cunumeric/index/advanced_indexing.cu
@@ -37,14 +37,14 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const size_t skip_size,
const size_t key_dim)
{
size_t value = 0;
uint64_t value = 0;
for (size_t i = 0; i < iters; i++) {
size_t idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (idx > volume) break;
auto point = pitches.unflatten(idx, origin);
bool val = (index[point] && ((idx + 1) % skip_size == 0));
offsets[idx] = static_cast<int64_t>(val);
SumReduction<size_t>::fold<true>(value, val);
SumReduction<uint64_t>::fold<true>(value, val);
}
// Every thread in the thread block must participate in the exchange to get correct results
reduce_output(out, value);
@@ -90,7 +90,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
const size_t skip_size,
const size_t key_dim) const
{
DeferredReduction<SumReduction<size_t>> size;
ScalarReductionBuffer<SumReduction<uint64_t>> size(stream);

const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

@@ -104,12 +104,12 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
count_nonzero_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
volume, size, offsets, in, pitches, rect.lo, 1, skip_size, key_dim);

cudaStreamSynchronize(stream);
CHECK_CUDA_STREAM(stream);

auto off_ptr = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr);

return size.read();
return size.read(stream);
}

void operator()(Array& out_arr,
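Besides the buffer type, two things change in this task: the accumulator becomes `uint64_t`, and the blocking `cudaStreamSynchronize(stream)` between the kernel and the scan is gone. The kernel and `thrust::exclusive_scan` are issued to the same stream, so they are already ordered, and the count only reaches the host in `size.read(stream)`. The program below is a small, self-contained illustration (not cunumeric code) of the flags-then-exclusive-scan pattern the kernel and the scan implement together: the scan turns 0/1 flags into output positions, while the sum of the flags is the compacted output size.

```cpp
// Self-contained illustration of the flags -> exclusive scan -> offsets pattern.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

int main()
{
  const std::vector<int64_t> host_flags{1, 0, 1, 1, 0, 1};  // 1 = element selected
  thrust::device_vector<int64_t> flags(host_flags.begin(), host_flags.end());
  thrust::device_vector<int64_t> offsets(flags.size());

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // offsets[i] = how many selected elements precede i, i.e. where element i
  // lands in the compacted output when flags[i] == 1.
  thrust::exclusive_scan(
    thrust::cuda::par.on(stream), flags.begin(), flags.end(), offsets.begin());

  // Total number of selected elements; in the task above this value comes from
  // the scalar reduction written by count_nonzero_kernel rather than a second pass.
  const int64_t total =
    thrust::reduce(thrust::cuda::par.on(stream), flags.begin(), flags.end());

  cudaStreamSynchronize(stream);
  std::printf("compacted output size = %lld\n", static_cast<long long>(total));
  return 0;
}
```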
10 changes: 4 additions & 6 deletions src/cunumeric/index/repeat.cu
@@ -35,15 +35,15 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const size_t iters,
Buffer<int64_t> offsets)
{
int64_t value = 0;
uint64_t value = 0;
for (size_t idx = 0; idx < iters; idx++) {
const int64_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
if (offset < extent) {
auto p = origin;
p[axis] += offset;
auto val = repeats[p];
offsets[offset] = val;
SumReduction<int64_t>::fold<true>(value, val);
SumReduction<uint64_t>::fold<true>(value, val);
}
}
// Every thread in the thread block must participate in the exchange to get correct results
@@ -137,7 +137,7 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
int64_t extent = in_rect.hi[axis] - in_rect.lo[axis] + 1;
auto offsets = create_buffer<int64_t>(Point<1>(extent), Memory::Kind::Z_COPY_MEM);

DeferredReduction<SumReduction<int64_t>> sum;
ScalarReductionBuffer<SumReduction<uint64_t>> sum(stream);
const size_t blocks_count = (extent + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);

@@ -151,10 +151,8 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
}
CHECK_CUDA_STREAM(stream);

cudaStreamSynchronize(stream);

Point<DIM> out_extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
out_extents[axis] = sum.read();
out_extents[axis] = static_cast<Legion::coord_t>(sum.read(stream));

auto out = out_array.create_output_buffer<VAL, DIM>(out_extents, true);

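As in the other tasks, the explicit `cudaStreamSynchronize(stream)` disappears here because `sum.read(stream)` is the only point where the host actually needs the value. For readers unfamiliar with the device-side pattern that `reduce_output` and these buffers rely on, the standalone program below shows the same shape in plain CUDA: a per-block warp-shuffle reduction through a shared-memory trampoline, one atomic fold per block into a scalar in ordinary device memory, and a copy back that waits only on the owning stream. It illustrates the technique; it is not cunumeric code.

```cpp
// Standalone illustration of the scalar-reduction technique used by this PR.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int THREADS_PER_BLOCK = 128;

__global__ void block_sum(const float* in, size_t n, float* out)
{
  __shared__ float trampoline[THREADS_PER_BLOCK / 32];
  const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  float value = (idx < n) ? in[idx] : 0.0f;

  const int laneid = threadIdx.x & 0x1f;
  const int warpid = threadIdx.x >> 5;
  // Butterfly reduction across the warp.
  for (int i = 16; i >= 1; i /= 2) value += __shfl_xor_sync(0xffffffff, value, i, 32);
  if (laneid == 0) trampoline[warpid] = value;
  __syncthreads();
  // The first thread combines the per-warp results and folds once per block.
  if (threadIdx.x == 0) {
    for (int i = 1; i < THREADS_PER_BLOCK / 32; ++i) value += trampoline[i];
    atomicAdd(out, value);
  }
}

int main()
{
  const size_t n = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *in, *out;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, sizeof(float));
  cudaMemsetAsync(in, 0, n * sizeof(float), stream);  // all zeros, so the sum is 0
  cudaMemsetAsync(out, 0, sizeof(float), stream);     // identity for a sum reduction

  const size_t blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  block_sum<<<static_cast<unsigned>(blocks), THREADS_PER_BLOCK, 0, stream>>>(in, n, out);

  float result;
  cudaMemcpyAsync(&result, out, sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // wait only on this stream, not the whole device
  std::printf("sum = %f\n", result);
  return 0;
}
```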
2 changes: 1 addition & 1 deletion src/cunumeric/matrix/dot.cu
@@ -61,7 +61,7 @@ struct DotImplBody<VariantKind::GPU, CODE> {

const auto volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
DeferredReduction<SumReduction<ACC>> result;
ScalarReductionBuffer<SumReduction<ACC>> result(stream);
size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(ACC);

if (blocks >= MAX_REDUCTION_CTAS) {
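`MAX_REDUCTION_CTAS` caps the grid size for these reduction kernels; when the capped grid cannot cover the volume in one pass, each thread loops `iters` times using the `(i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x` indexing visible in `count_nonzero_kernel` and `count_repeats_kernel` above. The arithmetic is easy to get wrong, so here it is in isolation; the constant values are assumptions for illustration, not the ones defined in cuda_help.h.

```cpp
// Launch-size arithmetic for a capped-grid reduction, shown in isolation.
#include <cstddef>
#include <cstdio>

int main()
{
  constexpr size_t THREADS_PER_BLOCK  = 128;
  constexpr size_t MAX_REDUCTION_CTAS = 1024;  // assumed value; the real constant is in cuda_help.h
  const size_t volume = 10'000'000;

  const size_t blocks_needed = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  // When the grid would be too large, cap it and loop `iters` times per thread;
  // pass i of that loop touches index (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x,
  // so grid * iters * THREADS_PER_BLOCK always covers the full volume.
  const size_t grid  = blocks_needed >= MAX_REDUCTION_CTAS ? MAX_REDUCTION_CTAS : blocks_needed;
  const size_t iters = (blocks_needed + grid - 1) / grid;

  std::printf("volume=%zu blocks_needed=%zu grid=%zu iters=%zu\n",
              volume, blocks_needed, grid, iters);
  return 0;
}
```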