Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use thrust::cuda::par_nosync if available #780

Merged
merged 3 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/cunumeric/index/advanced_indexing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#include "cunumeric/index/advanced_indexing.h"
#include "cunumeric/index/advanced_indexing_template.inl"
#include "cunumeric/utilities/thrust_util.h"
#include "cunumeric/cuda_help.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -107,7 +107,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
CHECK_CUDA_STREAM(stream);

auto off_ptr = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), off_ptr, off_ptr + volume, off_ptr);

return size.read(stream);
}
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#include "cunumeric/index/repeat.h"
#include "cunumeric/index/repeat_template.inl"
#include "cunumeric/utilities/thrust_util.h"
#include "cunumeric/cuda_help.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -157,7 +157,7 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
auto out = out_array.create_output_buffer<VAL, DIM>(out_extents, true);

auto p_offsets = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), p_offsets, p_offsets + extent, p_offsets);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), p_offsets, p_offsets + extent, p_offsets);

const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
repeat_kernel<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/scan/scan_global.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

#include "cunumeric/scan/scan_global.h"
#include "cunumeric/scan/scan_global_template.inl"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "cunumeric/cuda_help.h"

Expand Down Expand Up @@ -69,7 +69,7 @@ struct ScanGlobalImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
sum_valsp[DIM - 1] = 0;
auto sum_valsp_end = sum_valsp;
sum_valsp_end[DIM - 1] = partition_index[DIM - 1];
auto global_prefix = thrust::reduce(thrust::cuda::par.on(stream),
auto global_prefix = thrust::reduce(DEFAULT_POLICY.on(stream),
&sum_vals[sum_valsp],
&sum_vals[sum_valsp_end],
(VAL)ScanOp<OP_CODE, CODE>::nan_identity,
Expand Down
6 changes: 3 additions & 3 deletions src/cunumeric/scan/scan_local.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#include "cunumeric/scan/scan_local.h"
#include "cunumeric/scan/scan_local_template.inl"
#include "cunumeric/unary/isnan.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/transform_iterator.h>

#include "cunumeric/cuda_help.h"
Expand Down Expand Up @@ -65,7 +65,7 @@ struct ScanLocalImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {

for (uint64_t index = 0; index < volume; index += stride) {
thrust::inclusive_scan(
thrust::cuda::par.on(stream), inptr + index, inptr + index + stride, outptr + index, func);
DEFAULT_POLICY.on(stream), inptr + index, inptr + index + stride, outptr + index, func);
// get the corresponding ND index with base zero to use for sum_val
auto sum_valp = pitches.unflatten(index, Point<DIM>::ZEROES());
// only one element on scan axis
Expand Down Expand Up @@ -112,7 +112,7 @@ struct ScanLocalNanImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {

for (uint64_t index = 0; index < volume; index += stride) {
thrust::inclusive_scan(
thrust::cuda::par.on(stream),
DEFAULT_POLICY.on(stream),
thrust::make_transform_iterator(inptr + index, convert_nan_func()),
thrust::make_transform_iterator(inptr + index + stride, convert_nan_func()),
outptr + index,
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/search/nonzero.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#include "cunumeric/cuda_help.h"
#include "cunumeric/utilities/thrust_util.h"

namespace cunumeric {

Expand Down Expand Up @@ -49,7 +49,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)

static void exclusive_sum(int64_t* offsets, size_t volume, cudaStream_t stream)
{
thrust::exclusive_scan(thrust::cuda::par.on(stream), offsets, offsets + volume, offsets);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), offsets, offsets + volume, offsets);
}

template <typename VAL, int32_t DIM>
Expand Down
10 changes: 5 additions & 5 deletions src/cunumeric/set/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@

#include "cunumeric/set/unique.h"
#include "cunumeric/set/unique_template.inl"
#include "cunumeric/utilities/thrust_util.h"

#include "cunumeric/cuda_help.h"

#include <thrust/merge.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -102,13 +102,13 @@ static Piece<VAL> tree_reduce(Array& output,
auto p_mine = my_piece.first.ptr(0);
auto p_other = other_piece.first.ptr(0);

thrust::merge(thrust::cuda::par.on(stream),
thrust::merge(DEFAULT_POLICY.on(stream),
p_mine,
p_mine + my_piece.second,
p_other,
p_other + other_piece.second,
p_merged);
auto* end = thrust::unique(thrust::cuda::par.on(stream), p_merged, p_merged + merged_size);
auto* end = thrust::unique(DEFAULT_POLICY.on(stream), p_merged, p_merged + merged_size);

// Make sure we release the memory so that we can reuse it
manopapad marked this conversation as resolved.
Show resolved Hide resolved
my_piece.first.destroy();
manopapad marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -172,8 +172,8 @@ struct UniqueImplBody<VariantKind::GPU, CODE, DIM> {
CHECK_CUDA_STREAM(stream);

// Find unique values
thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume);
end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume);
thrust::sort(DEFAULT_POLICY.on(stream), ptr, ptr + volume);
end = thrust::unique(DEFAULT_POLICY.on(stream), ptr, ptr + volume);
}

Piece<VAL> result;
Expand Down
11 changes: 6 additions & 5 deletions src/cunumeric/sort/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "cunumeric/sort/cub_sort.h"
#include "cunumeric/sort/thrust_sort.h"
#include "cunumeric/utilities/thrust_allocator.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/scan.h>
manopapad marked this conversation as resolved.
Show resolved Hide resolved
#include <thrust/sort.h>
Expand Down Expand Up @@ -643,7 +644,7 @@ SegmentMergePiece<legate_type_of<CODE>> merge_all_buffers(
return result;
} else {
// maybe k-way merge is more efficient here...
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would need to add synchronization before the cleanup loop in L729 in order to protect buffer destruction.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mfoerste4 can you elaborate why we need to protect buffer destructions?

size_t num_sort_ranks = merge_buffers.size();
std::vector<SegmentMergePiece<VAL>> destroy_queue;
for (size_t stride = 1; stride < num_sort_ranks; stride *= 2) {
Expand Down Expand Up @@ -774,7 +775,7 @@ void rebalance_data(SegmentMergePiece<VAL>& merge_buffer,
output_values = static_cast<VAL*>(output_ptr);
}

auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);

{
// compute diff for each segment
Expand Down Expand Up @@ -1334,7 +1335,7 @@ void sample_sort_nccl_nd(SortPiece<legate_type_of<CODE>> local_sorted,

// sort samples on device
auto alloc = ThrustAllocator(Memory::GPU_FB_MEM);
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
thrust::stable_sort(
exec_policy, samples.ptr(0), samples.ptr(0) + num_samples_g, SegmentSampleComparator<VAL>());

Expand Down Expand Up @@ -1706,9 +1707,9 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
size_t offset = rect.lo[DIM - 1];
if (volume > 0) {
if (DIM == 1) {
thrust::sequence(thrust::cuda::par.on(stream), indices_ptr, indices_ptr + volume, offset);
thrust::sequence(DEFAULT_POLICY.on(stream), indices_ptr, indices_ptr + volume, offset);
} else {
thrust::transform(thrust::cuda::par.on(stream),
thrust::transform(DEFAULT_POLICY.on(stream),
thrust::make_counting_iterator<int64_t>(0),
thrust::make_counting_iterator<int64_t>(volume),
thrust::make_constant_iterator<int64_t>(segment_size_l),
Expand Down
4 changes: 3 additions & 1 deletion src/cunumeric/sort/thrust_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

#include "core/data/buffer.h"
#include "cunumeric/utilities/thrust_allocator.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/sort.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/execution_policy.h>

Expand All @@ -42,7 +44,7 @@ void thrust_local_sort(const VAL* values_in,
cudaStream_t stream)
{
auto alloc = ThrustAllocator(Memory::GPU_FB_MEM);
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
manopapad marked this conversation as resolved.
Show resolved Hide resolved

if (values_in != values_out) {
// not in-place --> need a copy
Expand Down
26 changes: 26 additions & 0 deletions src/cunumeric/utilities/thrust_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* Copyright 2023 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Selects the default Thrust execution policy used throughout cuNumeric's
// CUDA code. Call sites chain `.on(stream)` (and optionally pass an
// allocator) onto this macro, e.g. `DEFAULT_POLICY.on(stream)` or
// `DEFAULT_POLICY(alloc).on(stream)`.

#pragma once

// <thrust/version.h> defines THRUST_VERSION as MMMmmmpp (major/minor/patch),
// so 101600 corresponds to Thrust 1.16.0 — the first release that provides
// thrust::cuda::par_nosync.
#include <thrust/version.h>
#include <thrust/execution_policy.h>

#if THRUST_VERSION >= 101600
// par_nosync skips the blocking stream synchronization that thrust::cuda::par
// performs after each algorithm, letting work stay asynchronous on the
// provided stream. NOTE(review): callers are then responsible for
// synchronizing before reading results or destroying buffers on the host.
#define DEFAULT_POLICY thrust::cuda::par_nosync
#else
// Older Thrust: fall back to the synchronous policy.
#define DEFAULT_POLICY thrust::cuda::par
#endif