From 248ccab7e6546d45f111384a1aec09b3403dea2e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 13 Sep 2023 15:11:37 -0500 Subject: [PATCH 1/4] Some additional kernel thread index refactoring. --- cpp/benchmarks/join/generate_input_tables.cuh | 17 ++++++---- .../type_dispatcher/type_dispatcher.cu | 32 +++++++++++-------- cpp/include/cudf/detail/copy_if_else.cuh | 17 +++++----- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 84e607a9f28..ef2e6370760 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -33,7 +34,7 @@ __global__ static void init_curand(curandState* state, int const nstates) { - int ithread = threadIdx.x + blockIdx.x * blockDim.x; + int ithread = cudf::detail::grid_1d::global_thread_id(); if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); } } @@ -45,13 +46,14 @@ __global__ static void init_build_tbl(key_type* const build_tbl, curandState* state, int const num_states) { - auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const stride = blockDim.x * gridDim.x; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); assert(start_idx < num_states); curandState localState = state[start_idx]; - for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { + for (thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) { + auto const idx = static_cast(tidx); double const x = curand_uniform_double(&localState); build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); @@ -70,13 +72,14 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, curandState* state, int const num_states) { - auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; - auto const stride = blockDim.x * gridDim.x; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); assert(start_idx < num_states); curandState localState = state[start_idx]; - for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) { + for (thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) { + auto const idx = static_cast(tidx); key_type val; double x = curand_uniform_double(&localState); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 3f985cffb1f..5a2cbe5a395 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -60,13 +60,15 @@ constexpr int block_size = 256; template __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) { - using F = Functor; - cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; - while (index < n_rows) { + using F = Functor; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < n_rows) { + auto const index = static_cast(tid); for (int c = 0; c < n_cols; c++) { A[c][index] = F::f(A[c][index]); } - index += blockDim.x * gridDim.x; + tidx += stride; } } @@ -74,12 +76,14 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ template __global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { - using F = Functor; - T* A = source_column.data(); - cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; - while (index < source_column.size()) { - A[index] = F::f(A[index]); - index += blockDim.x * gridDim.x; + using F = Functor; + T* A = source_column.data(); + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < source_column.size()) { + auto const index = static_cast(tid); + A[index] = F::f(A[index]); + tidx += stride; } } @@ -127,14 +131,14 @@ template __global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) { cudf::size_type const n_rows = source.num_rows(); - cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; - - while (index < n_rows) { + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + while (tidx < n_rows) { for (cudf::size_type i = 0; i < source.num_columns(); i++) { cudf::type_dispatcher( source.column(i).type(), RowHandle{}, source.column(i), index); } - index += blockDim.x * gridDim.x; + tidx += stride; } // while } diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 04ad1f20196..48c70df1862 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,18 +44,19 @@ __launch_bounds__(block_size) __global__ mutable_column_device_view out, size_type* __restrict__ const valid_count) { - size_type const tid = threadIdx.x + blockIdx.x * block_size; - int const warp_id = tid / warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + int const warp_id = tidx / warp_size; size_type const warps_per_grid = gridDim.x * block_size / warp_size; // begin/end indices for the column data - size_type begin = 0; - size_type end = out.size(); + size_type const begin = 0; + size_type const end = out.size(); // warp indices. since 1 warp == 32 threads == sizeof(bitmask_type) * 8, // each warp will process one (32 bit) of the validity mask via // __ballot_sync() - size_type warp_begin = cudf::word_index(begin); - size_type warp_end = cudf::word_index(end - 1); + size_type const warp_begin = cudf::word_index(begin); + size_type const warp_end = cudf::word_index(end - 1); // lane id within the current warp constexpr size_type leader_lane{0}; @@ -65,8 +66,8 @@ __launch_bounds__(block_size) __global__ // current warp. size_type warp_cur = warp_begin + warp_id; - size_type index = tid; while (warp_cur <= warp_end) { + auto const index = static_cast(tidx); auto const opt_value = (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt; if (opt_value) { out.element(index) = static_cast(*opt_value); } @@ -84,7 +85,7 @@ __launch_bounds__(block_size) __global__ // next grid warp_cur += warps_per_grid; - index += block_size * gridDim.x; + tidx += stride; } if (has_nulls) { From 9e4154dd176c677ad7168c9f0770a7e2363c6970 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 May 2024 15:15:58 -0700 Subject: [PATCH 2/4] Apply review suggestions. --- cpp/include/cudf/detail/copy_if_else.cuh | 12 +++++----- cpp/include/cudf/detail/utilities/cuda.cuh | 26 ++++++++++++++++++++++ cpp/include/cudf/detail/valid_if.cuh | 4 ++-- cpp/src/bitmask/null_mask.cu | 4 ++-- cpp/src/copying/concatenate.cu | 4 ++-- cpp/src/join/conditional_join_kernels.cuh | 10 ++++----- cpp/src/strings/convert/convert_urls.cu | 18 ++++++++------- 7 files changed, 53 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 1f45f6e4258..8418e279ce7 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -45,10 +45,10 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / warp_size; - size_type const warps_per_grid = gridDim.x * block_size / warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + int const warp_id = tidx / cudf::detail::warp_size; + size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -61,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; @@ -160,7 +160,7 @@ std::unique_ptr copy_if_else(bool nullable, using Element = typename thrust::iterator_traits::value_type::value_type; size_type size = std::distance(lhs_begin, lhs_end); - size_type num_els = cudf::util::round_up_safe(size, warp_size); + size_type num_els = cudf::util::round_up_safe(size, cudf::detail::warp_size); constexpr int block_size = 256; cudf::detail::grid_1d grid{num_els, block_size, 1}; diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 86c85ca8d06..f1775c6d6d7 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -93,6 +93,19 @@ class grid_1d { return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); } + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The global thread index + */ + template + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block); + } + /** * @brief Returns the stride of a 1D grid. * @@ -115,6 +128,19 @@ class grid_1d { * @return thread_index_type The number of threads in the grid. */ static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @tparam num_threads_per_block The number of threads per block + * + * @return thread_index_type The number of threads in the grid. + */ + template + static __device__ thread_index_type grid_stride() + { + return grid_stride(num_threads_per_block, gridDim.x); + } }; /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 66163d6059a..64a3c4edf78 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel( { constexpr size_type leader_lane{0}; auto const lane_id{threadIdx.x % warp_size}; - auto i = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4da2e502ce6..d0faeea8336 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, auto const first_word_index{word_index(first_bit_index)}; auto const last_word_index{word_index(last_bit_index)}; - thread_index_type const tid = grid_1d::global_thread_id(); - thread_index_type const stride = grid_1d::grid_stride(); + thread_index_type const tid = grid_1d::global_thread_id(); + thread_index_type const stride = grid_1d::grid_stride(); thread_index_type thread_word_index = tid + first_word_index; size_type thread_count{0}; diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b1136a9eeb3..47e74a5cb48 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -121,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, size_type number_of_mask_bits, size_type* out_valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); + auto tidx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits); size_type warp_valid_count = 0; diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 5e190eb2b27..1e16c451f5a 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size( &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; std::size_t thread_counter{0}; - auto const start_idx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); cudf::thread_index_type const left_num_rows = left_table.num_rows(); cudf::thread_index_type const right_num_rows = right_table.num_rows(); @@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table, __syncwarp(); - auto outer_row_index = cudf::detail::grid_1d::global_thread_id(); + auto outer_row_index = cudf::detail::grid_1d::global_thread_id(); unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows); @@ -295,8 +295,8 @@ CUDF_KERNEL void conditional_join_anti_semi( int const lane_id = threadIdx.x % detail::warp_size; cudf::thread_index_type const outer_num_rows = left_table.num_rows(); cudf::thread_index_type const inner_num_rows = right_table.num_rows(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 459c3e88a4e..d9920be045f 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -202,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; __shared__ typename cub::WarpReduce::TempStorage cub_storage[num_warps_per_threadblock]; - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); + auto const global_thread_id = + cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); + auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); + auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; @@ -287,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; __shared__ size_type out_idx[num_warps_per_threadblock]; - auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); - auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); - auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); - auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); + auto const global_thread_id = + cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); + auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); + auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; From e8245f77a8d1f738aa2ac329b8c504e44b0a27e1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 6 May 2024 16:54:36 -0500 Subject: [PATCH 3/4] Fix benchmark builds. --- cpp/benchmarks/join/generate_input_tables.cuh | 4 ++-- cpp/benchmarks/type_dispatcher/type_dispatcher.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index a8c3433c9ba..f7984b29d6b 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -53,7 +53,7 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl, curandState localState = state[start_idx]; - for (thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) { + for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) { auto const idx = static_cast(tidx); double const x = curand_uniform_double(&localState); @@ -79,7 +79,7 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl, curandState localState = state[start_idx]; - for (thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) { + for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) { auto const idx = static_cast(tidx); key_type val; double x = curand_uniform_double(&localState); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 2708ced0779..4d07e292620 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -64,7 +64,7 @@ CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size auto tidx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); while (tidx < n_rows) { - auto const index = static_cast(tid); + auto const index = static_cast(tidx); for (int c = 0; c < n_cols; c++) { A[c][index] = F::f(A[c][index]); } @@ -81,7 +81,7 @@ CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source auto tidx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); while (tidx < source_column.size()) { - auto const index = static_cast(tid); + auto const index = static_cast(tidx); A[index] = F::f(A[index]); tidx += stride; } From 5f6713fc139947149ce0a8ca75148b4b3c841957 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 7 May 2024 12:31:23 -0500 Subject: [PATCH 4/4] Fix type_dispatcher benchmark. --- cpp/benchmarks/type_dispatcher/type_dispatcher.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 4d07e292620..3aff75d840e 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -134,6 +134,7 @@ CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view sourc auto tidx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); while (tidx < n_rows) { + auto const index = static_cast(tidx); for (cudf::size_type i = 0; i < source.num_columns(); i++) { cudf::type_dispatcher( source.column(i).type(), RowHandle{}, source.column(i), index);