Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use thrust::cuda::par_nosync if available #780

Merged
merged 3 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/cunumeric/index/advanced_indexing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#include "cunumeric/index/advanced_indexing.h"
#include "cunumeric/index/advanced_indexing_template.inl"
#include "cunumeric/utilities/thrust_util.h"
#include "cunumeric/cuda_help.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -107,7 +107,7 @@ struct AdvancedIndexingImplBody<VariantKind::GPU, CODE, DIM, OUT_TYPE> {
CHECK_CUDA_STREAM(stream);

auto off_ptr = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), off_ptr, off_ptr + volume, off_ptr);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), off_ptr, off_ptr + volume, off_ptr);

return size.read(stream);
}
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#include "cunumeric/index/repeat.h"
#include "cunumeric/index/repeat_template.inl"
#include "cunumeric/utilities/thrust_util.h"
#include "cunumeric/cuda_help.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -157,7 +157,7 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
auto out = out_array.create_output_buffer<VAL, DIM>(out_extents, true);

auto p_offsets = offsets.ptr(0);
thrust::exclusive_scan(thrust::cuda::par.on(stream), p_offsets, p_offsets + extent, p_offsets);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), p_offsets, p_offsets + extent, p_offsets);

const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
repeat_kernel<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/scan/scan_global.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

#include "cunumeric/scan/scan_global.h"
#include "cunumeric/scan/scan_global_template.inl"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "cunumeric/cuda_help.h"

Expand Down Expand Up @@ -69,7 +69,7 @@ struct ScanGlobalImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
sum_valsp[DIM - 1] = 0;
auto sum_valsp_end = sum_valsp;
sum_valsp_end[DIM - 1] = partition_index[DIM - 1];
auto global_prefix = thrust::reduce(thrust::cuda::par.on(stream),
auto global_prefix = thrust::reduce(DEFAULT_POLICY.on(stream),
&sum_vals[sum_valsp],
&sum_vals[sum_valsp_end],
(VAL)ScanOp<OP_CODE, CODE>::nan_identity,
Expand Down
6 changes: 3 additions & 3 deletions src/cunumeric/scan/scan_local.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#include "cunumeric/scan/scan_local.h"
#include "cunumeric/scan/scan_local_template.inl"
#include "cunumeric/unary/isnan.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/transform_iterator.h>

#include "cunumeric/cuda_help.h"
Expand Down Expand Up @@ -65,7 +65,7 @@ struct ScanLocalImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {

for (uint64_t index = 0; index < volume; index += stride) {
thrust::inclusive_scan(
thrust::cuda::par.on(stream), inptr + index, inptr + index + stride, outptr + index, func);
DEFAULT_POLICY.on(stream), inptr + index, inptr + index + stride, outptr + index, func);
// get the corresponding ND index with base zero to use for sum_val
auto sum_valp = pitches.unflatten(index, Point<DIM>::ZEROES());
// only one element on scan axis
Expand Down Expand Up @@ -112,7 +112,7 @@ struct ScanLocalNanImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {

for (uint64_t index = 0; index < volume; index += stride) {
thrust::inclusive_scan(
thrust::cuda::par.on(stream),
DEFAULT_POLICY.on(stream),
thrust::make_transform_iterator(inptr + index, convert_nan_func()),
thrust::make_transform_iterator(inptr + index + stride, convert_nan_func()),
outptr + index,
Expand Down
4 changes: 2 additions & 2 deletions src/cunumeric/search/nonzero.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
*/

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#include "cunumeric/cuda_help.h"
#include "cunumeric/utilities/thrust_util.h"

namespace cunumeric {

Expand Down Expand Up @@ -49,7 +49,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)

static void exclusive_sum(int64_t* offsets, size_t volume, cudaStream_t stream)
{
thrust::exclusive_scan(thrust::cuda::par.on(stream), offsets, offsets + volume, offsets);
thrust::exclusive_scan(DEFAULT_POLICY.on(stream), offsets, offsets + volume, offsets);
}

template <typename VAL, int32_t DIM>
Expand Down
10 changes: 5 additions & 5 deletions src/cunumeric/set/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@

#include "cunumeric/set/unique.h"
#include "cunumeric/set/unique_template.inl"
#include "cunumeric/utilities/thrust_util.h"

#include "cunumeric/cuda_help.h"

#include <thrust/merge.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/execution_policy.h>

namespace cunumeric {

Expand Down Expand Up @@ -102,13 +102,13 @@ static Piece<VAL> tree_reduce(Array& output,
auto p_mine = my_piece.first.ptr(0);
auto p_other = other_piece.first.ptr(0);

thrust::merge(thrust::cuda::par.on(stream),
thrust::merge(DEFAULT_POLICY.on(stream),
p_mine,
p_mine + my_piece.second,
p_other,
p_other + other_piece.second,
p_merged);
auto* end = thrust::unique(thrust::cuda::par.on(stream), p_merged, p_merged + merged_size);
auto* end = thrust::unique(DEFAULT_POLICY.on(stream), p_merged, p_merged + merged_size);

// Make sure we release the memory so that we can reuse it
manopapad marked this conversation as resolved.
Show resolved Hide resolved
my_piece.first.destroy();
manopapad marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -172,8 +172,8 @@ struct UniqueImplBody<VariantKind::GPU, CODE, DIM> {
CHECK_CUDA_STREAM(stream);

// Find unique values
thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume);
end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume);
thrust::sort(DEFAULT_POLICY.on(stream), ptr, ptr + volume);
end = thrust::unique(DEFAULT_POLICY.on(stream), ptr, ptr + volume);
}

Piece<VAL> result;
Expand Down
11 changes: 6 additions & 5 deletions src/cunumeric/sort/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "cunumeric/sort/cub_sort.h"
#include "cunumeric/sort/thrust_sort.h"
#include "cunumeric/utilities/thrust_allocator.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/scan.h>
manopapad marked this conversation as resolved.
Show resolved Hide resolved
#include <thrust/sort.h>
Expand Down Expand Up @@ -643,7 +644,7 @@ SegmentMergePiece<legate_type_of<CODE>> merge_all_buffers(
return result;
} else {
// maybe k-way merge is more efficient here...
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would need to add synchronization before the cleanup loop in L729 in order to protect buffer destruction.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mfoerste4 can you elaborate why we need to protect buffer destructions?

size_t num_sort_ranks = merge_buffers.size();
std::vector<SegmentMergePiece<VAL>> destroy_queue;
for (size_t stride = 1; stride < num_sort_ranks; stride *= 2) {
Expand Down Expand Up @@ -774,7 +775,7 @@ void rebalance_data(SegmentMergePiece<VAL>& merge_buffer,
output_values = static_cast<VAL*>(output_ptr);
}

auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);

{
// compute diff for each segment
Expand Down Expand Up @@ -1334,7 +1335,7 @@ void sample_sort_nccl_nd(SortPiece<legate_type_of<CODE>> local_sorted,

// sort samples on device
auto alloc = ThrustAllocator(Memory::GPU_FB_MEM);
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
thrust::stable_sort(
exec_policy, samples.ptr(0), samples.ptr(0) + num_samples_g, SegmentSampleComparator<VAL>());

Expand Down Expand Up @@ -1706,9 +1707,9 @@ struct SortImplBody<VariantKind::GPU, CODE, DIM> {
size_t offset = rect.lo[DIM - 1];
if (volume > 0) {
if (DIM == 1) {
thrust::sequence(thrust::cuda::par.on(stream), indices_ptr, indices_ptr + volume, offset);
thrust::sequence(DEFAULT_POLICY.on(stream), indices_ptr, indices_ptr + volume, offset);
} else {
thrust::transform(thrust::cuda::par.on(stream),
thrust::transform(DEFAULT_POLICY.on(stream),
thrust::make_counting_iterator<int64_t>(0),
thrust::make_counting_iterator<int64_t>(volume),
thrust::make_constant_iterator<int64_t>(segment_size_l),
Expand Down
4 changes: 3 additions & 1 deletion src/cunumeric/sort/thrust_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

#include "core/data/buffer.h"
#include "cunumeric/utilities/thrust_allocator.h"
#include "cunumeric/utilities/thrust_util.h"

#include <thrust/sort.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/execution_policy.h>

Expand All @@ -42,7 +44,7 @@ void thrust_local_sort(const VAL* values_in,
cudaStream_t stream)
{
auto alloc = ThrustAllocator(Memory::GPU_FB_MEM);
auto exec_policy = thrust::cuda::par(alloc).on(stream);
auto exec_policy = DEFAULT_POLICY(alloc).on(stream);
manopapad marked this conversation as resolved.
Show resolved Hide resolved

if (values_in != values_out) {
// not in-place --> need a copy
Expand Down
26 changes: 26 additions & 0 deletions src/cunumeric/utilities/thrust_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* Copyright 2023 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Selects the default Thrust execution policy used throughout cuNumeric's
// CUDA code. Call sites chain `.on(stream)` (and optionally pass an
// allocator) onto this macro, e.g. `DEFAULT_POLICY.on(stream)` or
// `DEFAULT_POLICY(alloc).on(stream)`.

#pragma once

// <thrust/version.h> defines THRUST_VERSION as MMMmmmpp (major/minor/patch),
// so 101600 corresponds to Thrust 1.16.0 — the first release that provides
// thrust::cuda::par_nosync.
#include <thrust/version.h>
#include <thrust/execution_policy.h>

#if THRUST_VERSION >= 101600
// par_nosync skips the blocking stream synchronization that thrust::cuda::par
// performs after each algorithm, letting work stay asynchronous on the
// provided stream. NOTE(review): callers are then responsible for
// synchronizing before reading results or destroying buffers on the host.
#define DEFAULT_POLICY thrust::cuda::par_nosync
#else
// Older Thrust: fall back to the synchronous policy.
#define DEFAULT_POLICY thrust::cuda::par
#endif