From 6ca627a29f8f0b497d4c17df844a0d7383d985ff Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 3 Feb 2023 14:25:52 -0800 Subject: [PATCH 1/3] Use thrust::cuda::par_nosync if available --- src/cunumeric/index/advanced_indexing.cu | 4 ++-- src/cunumeric/index/repeat.cu | 4 ++-- src/cunumeric/scan/scan_global.cu | 4 ++-- src/cunumeric/scan/scan_local.cu | 6 +++--- src/cunumeric/search/nonzero.cuh | 4 ++-- src/cunumeric/set/unique.cu | 6 +++--- src/cunumeric/sort/sort.cu | 11 +++++----- src/cunumeric/sort/thrust_sort.cuh | 4 +++- src/cunumeric/utilities/thrust_util.h | 26 ++++++++++++++++++++++++ 9 files changed, 49 insertions(+), 20 deletions(-) create mode 100644 src/cunumeric/utilities/thrust_util.h diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu index a7d3f2f94..b2f578a68 100644 --- a/src/cunumeric/index/advanced_indexing.cu +++ b/src/cunumeric/index/advanced_indexing.cu @@ -16,10 +16,10 @@ #include "cunumeric/index/advanced_indexing.h" #include "cunumeric/index/advanced_indexing_template.inl" +#include "cunumeric/utilities/thrust_util.h" #include "cunumeric/cuda_help.h" #include -#include namespace cunumeric { @@ -107,7 +107,7 @@ struct AdvancedIndexingImplBody { CHECK_CUDA_STREAM(stream); auto off_ptr = offsets.ptr(0); - thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr); + thrust::exclusive_scan(DEFAULT_POLICY.on(stream), off_ptr, off_ptr + volume, off_ptr); return size.read(stream); } diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu index 1b658874a..85f2032a5 100644 --- a/src/cunumeric/index/repeat.cu +++ b/src/cunumeric/index/repeat.cu @@ -16,10 +16,10 @@ #include "cunumeric/index/repeat.h" #include "cunumeric/index/repeat_template.inl" +#include "cunumeric/utilities/thrust_util.h" #include "cunumeric/cuda_help.h" #include -#include namespace cunumeric { @@ -157,7 +157,7 @@ struct RepeatImplBody { auto out = out_array.create_output_buffer(out_extents, true); auto p_offsets = offsets.ptr(0); - thrust::exclusive_scan(thrust::cuda::par.on(stream), p_offsets, p_offsets + extent, p_offsets); + thrust::exclusive_scan(DEFAULT_POLICY.on(stream), p_offsets, p_offsets + extent, p_offsets); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; repeat_kernel<<>>( diff --git a/src/cunumeric/scan/scan_global.cu b/src/cunumeric/scan/scan_global.cu index 6b2afdcb0..424bde0cb 100644 --- a/src/cunumeric/scan/scan_global.cu +++ b/src/cunumeric/scan/scan_global.cu @@ -16,9 +16,9 @@ #include "cunumeric/scan/scan_global.h" #include "cunumeric/scan/scan_global_template.inl" +#include "cunumeric/utilities/thrust_util.h" #include -#include #include "cunumeric/cuda_help.h" @@ -69,7 +69,7 @@ struct ScanGlobalImplBody { sum_valsp[DIM - 1] = 0; auto sum_valsp_end = sum_valsp; sum_valsp_end[DIM - 1] = partition_index[DIM - 1]; - auto global_prefix = thrust::reduce(thrust::cuda::par.on(stream), + auto global_prefix = thrust::reduce(DEFAULT_POLICY.on(stream), &sum_vals[sum_valsp], &sum_vals[sum_valsp_end], (VAL)ScanOp::nan_identity, diff --git a/src/cunumeric/scan/scan_local.cu b/src/cunumeric/scan/scan_local.cu index ddc073951..b27678697 100644 --- a/src/cunumeric/scan/scan_local.cu +++ b/src/cunumeric/scan/scan_local.cu @@ -17,9 +17,9 @@ #include "cunumeric/scan/scan_local.h" #include "cunumeric/scan/scan_local_template.inl" #include "cunumeric/unary/isnan.h" +#include "cunumeric/utilities/thrust_util.h" #include -#include #include #include "cunumeric/cuda_help.h" @@ -65,7 +65,7 @@ struct ScanLocalImplBody { for (uint64_t index = 0; index < volume; index += stride) { thrust::inclusive_scan( - thrust::cuda::par.on(stream), inptr + index, inptr + index + stride, outptr + index, func); + DEFAULT_POLICY.on(stream), inptr + index, inptr + index + stride, outptr + index, func); // get the corresponding ND index with base zero to use for sum_val auto sum_valp = pitches.unflatten(index, Point::ZEROES()); // only one element on scan axis @@ -112,7 +112,7 @@ struct ScanLocalNanImplBody { for (uint64_t index = 0; index < volume; index += stride) { thrust::inclusive_scan( - thrust::cuda::par.on(stream), + DEFAULT_POLICY.on(stream), thrust::make_transform_iterator(inptr + index, convert_nan_func()), thrust::make_transform_iterator(inptr + index + stride, convert_nan_func()), outptr + index, diff --git a/src/cunumeric/search/nonzero.cuh b/src/cunumeric/search/nonzero.cuh index e9af92578..732bafac8 100644 --- a/src/cunumeric/search/nonzero.cuh +++ b/src/cunumeric/search/nonzero.cuh @@ -15,9 +15,9 @@ */ #include -#include #include "cunumeric/cuda_help.h" +#include "cunumeric/utilities/thrust_util.h" namespace cunumeric { @@ -49,7 +49,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) static void exclusive_sum(int64_t* offsets, size_t volume, cudaStream_t stream) { - thrust::exclusive_scan(thrust::cuda::par.on(stream), offsets, offsets + volume, offsets); + thrust::exclusive_scan(DEFAULT_POLICY.on(stream), offsets, offsets + volume, offsets); } template diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 9104474ef..147345c0c 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -16,13 +16,13 @@ #include "cunumeric/set/unique.h" #include "cunumeric/set/unique_template.inl" +#include "cunumeric/utilities/thrust_util.h" #include "cunumeric/cuda_help.h" #include #include #include -#include namespace cunumeric { @@ -102,13 +102,13 @@ static Piece tree_reduce(Array& output, auto p_mine = my_piece.first.ptr(0); auto p_other = other_piece.first.ptr(0); - thrust::merge(thrust::cuda::par.on(stream), + thrust::merge(DEFAULT_POLICY.on(stream), p_mine, p_mine + my_piece.second, p_other, p_other + other_piece.second, p_merged); - auto* end = thrust::unique(thrust::cuda::par.on(stream), p_merged, p_merged + merged_size); + auto* end = thrust::unique(DEFAULT_POLICY.on(stream), p_merged, p_merged + merged_size); // Make sure we release the memory so that we can reuse it my_piece.first.destroy(); diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index c303eb1ab..55b9fd68a 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -19,6 +19,7 @@ #include "cunumeric/sort/cub_sort.h" #include "cunumeric/sort/thrust_sort.h" #include "cunumeric/utilities/thrust_allocator.h" +#include "cunumeric/utilities/thrust_util.h" #include #include @@ -643,7 +644,7 @@ SegmentMergePiece> merge_all_buffers( return result; } else { // maybe k-way merge is more efficient here... - auto exec_policy = thrust::cuda::par(alloc).on(stream); + auto exec_policy = DEFAULT_POLICY(alloc).on(stream); size_t num_sort_ranks = merge_buffers.size(); std::vector> destroy_queue; for (size_t stride = 1; stride < num_sort_ranks; stride *= 2) { @@ -774,7 +775,7 @@ void rebalance_data(SegmentMergePiece& merge_buffer, output_values = static_cast(output_ptr); } - auto exec_policy = thrust::cuda::par(alloc).on(stream); + auto exec_policy = DEFAULT_POLICY(alloc).on(stream); { // compute diff for each segment @@ -1334,7 +1335,7 @@ void sample_sort_nccl_nd(SortPiece> local_sorted, // sort samples on device auto alloc = ThrustAllocator(Memory::GPU_FB_MEM); - auto exec_policy = thrust::cuda::par(alloc).on(stream); + auto exec_policy = DEFAULT_POLICY(alloc).on(stream); thrust::stable_sort( exec_policy, samples.ptr(0), samples.ptr(0) + num_samples_g, SegmentSampleComparator()); @@ -1706,9 +1707,9 @@ struct SortImplBody { size_t offset = rect.lo[DIM - 1]; if (volume > 0) { if (DIM == 1) { - thrust::sequence(thrust::cuda::par.on(stream), indices_ptr, indices_ptr + volume, offset); + thrust::sequence(DEFAULT_POLICY.on(stream), indices_ptr, indices_ptr + volume, offset); } else { - thrust::transform(thrust::cuda::par.on(stream), + thrust::transform(DEFAULT_POLICY.on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(volume), thrust::make_constant_iterator(segment_size_l), diff --git a/src/cunumeric/sort/thrust_sort.cuh b/src/cunumeric/sort/thrust_sort.cuh index eee97e199..66c922d1e 100644 --- a/src/cunumeric/sort/thrust_sort.cuh +++ b/src/cunumeric/sort/thrust_sort.cuh @@ -18,8 +18,10 @@ #include "core/data/buffer.h" #include "cunumeric/utilities/thrust_allocator.h" +#include "cunumeric/utilities/thrust_util.h" #include +#include #include #include @@ -42,7 +44,7 @@ void thrust_local_sort(const VAL* values_in, cudaStream_t stream) { auto alloc = ThrustAllocator(Memory::GPU_FB_MEM); - auto exec_policy = thrust::cuda::par(alloc).on(stream); + auto exec_policy = DEFAULT_POLICY(alloc).on(stream); if (values_in != values_out) { // not in-place --> need a copy diff --git a/src/cunumeric/utilities/thrust_util.h b/src/cunumeric/utilities/thrust_util.h new file mode 100644 index 000000000..b121dd6d4 --- /dev/null +++ b/src/cunumeric/utilities/thrust_util.h @@ -0,0 +1,26 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include +#include + +#if (THRUST_MAJOR_VERSION >= 1) && (THRUST_MINOR_VERSION >= 16) +#define DEFAULT_POLICY thrust::cuda::par_nosync +#else +#define DEFAULT_POLICY thrust::cuda::par +#endif From 41953ae7d852417a46fbbe1e47b48103cfcd3dbe Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 10 Feb 2023 11:47:26 -0800 Subject: [PATCH 2/3] Fix the thrust version check --- src/cunumeric/utilities/thrust_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cunumeric/utilities/thrust_util.h b/src/cunumeric/utilities/thrust_util.h index b121dd6d4..99ecbd644 100644 --- a/src/cunumeric/utilities/thrust_util.h +++ b/src/cunumeric/utilities/thrust_util.h @@ -1,4 +1,4 @@ -/* Copyright 2022 NVIDIA Corporation +/* Copyright 2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include -#if (THRUST_MAJOR_VERSION >= 1) && (THRUST_MINOR_VERSION >= 16) +#if THRUST_VERSION >= 101600 #define DEFAULT_POLICY thrust::cuda::par_nosync #else #define DEFAULT_POLICY thrust::cuda::par From 8cba3cdec8e92745e5b299f79d3fb357d4ea3c92 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 10 Feb 2023 11:48:29 -0800 Subject: [PATCH 3/3] Fix the remaining execution policy creations --- src/cunumeric/set/unique.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 147345c0c..1756aad1a 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -172,8 +172,8 @@ struct UniqueImplBody { CHECK_CUDA_STREAM(stream); // Find unique values - thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume); - end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume); + thrust::sort(DEFAULT_POLICY.on(stream), ptr, ptr + volume); + end = thrust::unique(DEFAULT_POLICY.on(stream), ptr, ptr + volume); } Piece result;