Batch of fixes for index overflows in grid stride loops. #10448

Merged: 10 commits, Mar 23, 2022
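The changes below all follow the same recipe: the per-thread index of a grid-stride loop is widened from the 32-bit signed cudf::size_type to std::size_t so that blockIdx.x * blockDim.x can no longer overflow on very large inputs. As a rough before/after sketch (illustrative kernels with invented names, not code from this PR):

#include <cstddef>

// Before (sketch): the 32-bit index overflows once blockIdx.x * blockDim.x
// exceeds INT_MAX, so the bounds check no longer protects the access.
__global__ void scale_overflowing(int n, float a, float* data)
{
  int i      = threadIdx.x + blockIdx.x * blockDim.x;  // can overflow
  int stride = blockDim.x * gridDim.x;
  while (i < n) {
    data[i] *= a;
    i += stride;
  }
}

// After (sketch): a 64-bit index, with the block/grid products forced into
// 64-bit arithmetic, stays valid for element counts well past 2^31.
__global__ void scale_fixed(std::size_t n, float a, float* data)
{
  std::size_t i      = std::size_t{blockIdx.x} * blockDim.x + threadIdx.x;
  std::size_t stride = std::size_t{blockDim.x} * gridDim.x;
  while (i < n) {
    data[i] *= a;
    i += stride;
  }
}

Where the widened index later feeds signed arithmetic (the rolling-window kernels below), it is cast back to a signed type first.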
cpp/include/cudf/detail/valid_if.cuh (2 changes: 1 addition & 1 deletion)

@@ -48,7 +48,7 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  size_type i = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   size_type warp_valid_count{0};

   auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
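Several of the kernels touched here (valid_if, replace_nulls) pair the grid-stride loop with a warp ballot, so the widened index also has to keep the lanes' loop exits consistent. A standalone sketch of that pattern with a 64-bit index (count_positive is a made-up kernel, not the cudf one):

#include <cstddef>

// Illustrative warp-ballot grid-stride count. Every lane re-ballots with the
// set of lanes still inside the loop, and the leader lane (lane 0, which has
// the smallest index in its warp and therefore iterates at least as long as
// its warp-mates) publishes the warp's total.
__global__ void count_positive(float const* data, std::size_t size, int* count)
{
  std::size_t i      = std::size_t{blockIdx.x} * blockDim.x + threadIdx.x;
  std::size_t stride = std::size_t{blockDim.x} * gridDim.x;
  int warp_count     = 0;

  auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
  while (i < size) {
    warp_count += __popc(__ballot_sync(active_mask, data[i] > 0.0f));
    i += stride;
    active_mask = __ballot_sync(active_mask, i < size);
  }
  if (threadIdx.x % 32 == 0) { atomicAdd(count, warp_count); }
}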
cpp/src/copying/scatter.cu (11 changes: 8 additions & 3 deletions)

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -47,7 +47,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  size_type row = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t row = threadIdx.x + blockIdx.x * blockDim.x;

   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];

@@ -351,8 +351,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>

   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
+  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
+  // > (2^31)/2, but the end result after the final (% n_rows) will fit. so we'll do the computation
+  // using a signed 64 bit value.
   auto scatter_iter = thrust::make_transform_iterator(
-    map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
+    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
+      return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
+    });

   // Dispatch over data type per column
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
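The comment added above is easiest to see with concrete numbers. A host-side sketch of the same wrap-around logic (wrap_index is a made-up helper, not a cudf function):

#include <cassert>
#include <cstdint>

// Negative scatter indices count from the end of the target, so -1 maps to
// n_rows - 1. The intermediate (in % n_rows) + n_rows can exceed INT32_MAX
// when n_rows is close to 2^31, which is why the arithmetic is widened to
// int64_t before the final % n_rows brings the value back into [0, n_rows).
int32_t wrap_index(int32_t in, int32_t n_rows_narrow)
{
  auto const n_rows = static_cast<int64_t>(n_rows_narrow);
  return static_cast<int32_t>(((in % n_rows) + n_rows) % n_rows);
}

int main()
{
  assert(wrap_index(3, 10) == 3);                      // non-negative: unchanged
  assert(wrap_index(-1, 10) == 9);                     // last row
  assert(wrap_index(-1, INT32_MAX) == INT32_MAX - 1);  // no intermediate overflow
  return 0;
}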
cpp/src/replace/nulls.cu (5 changes: 3 additions & 2 deletions)

@@ -63,7 +63,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;

   uint32_t active_mask = 0xffffffff;
   active_mask = __ballot_sync(active_mask, i < nrows);

@@ -115,7 +115,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;

   uint32_t active_mask = 0xffffffff;
   active_mask = __ballot_sync(active_mask, i < nrows);

@@ -247,6 +247,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<

   std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
     sizes_view.begin<int32_t>(), sizes_view.end<int32_t>(), stream, mr);
+
   auto offsets_view = offsets->mutable_view();

   auto const bytes =
cpp/src/rolling/jit/kernel.cu (6 changes: 3 additions & 3 deletions)

@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::size_type stride = blockDim.x * gridDim.x;

   cudf::size_type warp_valid_count{0};

@@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
   cudf::size_type following_window = get_window(following_window_begin, i);

   // compute bounds
-  cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
-  cudf::size_type end = min(nrows, max(0, i + following_window + 1));
+  cudf::size_type start = min(nrows, max(0, static_cast<size_type>(i) - preceding_window + 1));
+  cudf::size_type end = min(nrows, max(0, static_cast<size_type>(i) + following_window + 1));
   cudf::size_type start_index = min(start, end);
   cudf::size_type end_index = max(start, end);
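The cast back to size_type in the bounds above is what keeps max(0, ...) meaningful: with an unsigned 64-bit i, the expression i - preceding_window + 1 would wrap to a huge positive value instead of going negative. A small host-side illustration of that pitfall (standalone sketch, not cudf code):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t i        = 0;  // first row
  int preceding_window = 2;

  // Unsigned arithmetic wraps: 0 - 2 + 1 becomes a huge positive value,
  // so the clamp to zero never triggers.
  std::size_t wrong = std::max<std::size_t>(0, i - preceding_window + 1);

  // Casting the index back to a signed type keeps the intended clamp.
  long long ok = std::max<long long>(0, static_cast<long long>(i) - preceding_window + 1);

  std::printf("wrong = %zu, ok = %lld\n", wrong, ok);  // wrong = 18446744073709551615, ok = 0
  return 0;
}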
cpp/src/rolling/rolling_detail.cuh (12 changes: 7 additions & 5 deletions)

@@ -1008,7 +1008,7 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  size_type i = blockIdx.x * block_size + threadIdx.x;
+  std::size_t i = blockIdx.x * block_size + threadIdx.x;
   size_type stride = block_size * gridDim.x;

   size_type warp_valid_count{0};

@@ -1020,10 +1020,12 @@ __launch_bounds__(block_size) __global__
   int64_t following_window = following_window_begin[i];

   // compute bounds
-  auto start = static_cast<size_type>(
-    min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
-  auto end = static_cast<size_type>(
-    min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
+  auto start =
+    static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                               max(0L, static_cast<cudf::size_type>(i) - preceding_window + 1)));
+  auto end =
+    static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                               max(0L, static_cast<cudf::size_type>(i) + following_window + 1)));
   size_type start_index = min(start, end);
   size_type end_index = max(start, end);
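The rolling kernels do the window-bound arithmetic in int64_t and only narrow to size_type after clamping into [0, input.size()], so neither the negative intermediate nor a large index can overflow. A self-contained sketch of that clamp-and-narrow step (clamped_bound is a hypothetical helper, not the cudf function):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Compute a clamped window bound: widen everything to int64_t, clamp into
// [0, nrows], then narrow. The narrowing cast is safe because the clamped
// value is bounded by nrows, which itself fits in 32 bits.
int32_t clamped_bound(std::size_t row, int64_t window_offset, int32_t nrows)
{
  int64_t const raw     = static_cast<int64_t>(row) + window_offset + 1;
  int64_t const clamped = std::min<int64_t>(nrows, std::max<int64_t>(0, raw));
  return static_cast<int32_t>(clamped);
}

int main()
{
  assert(clamped_bound(0, -5, 100) == 0);     // window reaches before the start: clamp to 0
  assert(clamped_bound(99, +5, 100) == 100);  // window reaches past the end: clamp to nrows
  assert(clamped_bound(10, -3, 100) == 8);    // interior row: 10 - 3 + 1
  return 0;
}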
cpp/src/transform/compute_column.cu (2 changes: 1 addition & 1 deletion)

@@ -72,7 +72,7 @@ __launch_bounds__(max_block_size) __global__
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

-  for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
+  for (std::size_t row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
     evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }
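One thing the widened loop index relies on here: comparing a std::size_t against the signed row count is safe only because that count is never negative, so its implicit conversion to the unsigned type cannot change the comparison's meaning. A tiny illustration of that conversion rule (standalone sketch, not cudf code):

#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t row_index = 0;

  // A non-negative signed count converts to std::size_t without changing
  // the result of the comparison used by the grid-stride loop.
  int num_rows = 1000;
  std::printf("%d\n", row_index < static_cast<std::size_t>(num_rows));  // prints 1

  // With a negative signed operand the same conversion would be wrong:
  // -1 becomes SIZE_MAX, so an "empty" bound looks enormous.
  int negative = -1;
  std::printf("%d\n", row_index < static_cast<std::size_t>(negative));  // prints 1, not 0
  return 0;
}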