Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cudf::size_type instead of int32 where appropriate in nvtext functions #13915

Merged
merged 9 commits into from
Aug 25, 2023
18 changes: 9 additions & 9 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000;
struct codepoint_to_utf8_fn {
cudf::column_device_view const d_strings; // input strings
uint32_t const* cp_data; // full code-point array
int32_t const* d_cp_offsets{}; // offsets to each string's code-point array
int32_t* d_offsets{}; // offsets for the output strings
cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array
cudf::size_type* d_offsets{}; // offsets for the output strings
char* d_chars{}; // buffer for the output strings column

/**
Expand All @@ -118,15 +118,15 @@ struct codepoint_to_utf8_fn {
* @param count number of code-points in `str_cps`
* @return Number of bytes required for the output
*/
__device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count)
__device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count)
{
return thrust::transform_reduce(
thrust::seq,
str_cps,
str_cps + count,
[](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); },
0,
thrust::plus<int32_t>());
thrust::plus());
}

__device__ void operator()(cudf::size_type idx)
Expand Down Expand Up @@ -208,9 +208,9 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
auto const aux_table = get_aux_codepoint_data(stream);
auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
return normalizer.normalize(d_chars, d_offsets, strings.size(), stream);
}();

Expand All @@ -222,8 +222,8 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
// convert the result into a strings column
// - the cp_chars are the new 4-byte code-point values for all the characters in the output
// - the cp_offsets identify which code-points go with which strings
uint32_t const* cp_chars = result.first->data();
int32_t const* cp_offsets = reinterpret_cast<int32_t const*>(result.second->data());
uint32_t const* cp_chars = result.first->data();
cudf::size_type const* cp_offsets = result.second->data();

auto d_strings = cudf::column_device_view::create(strings.parent(), stream);

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/text/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
int32_t* d_offsets{}; ///< for locating output string in d_chars
cudf::size_type* d_offsets{}; ///< for locating output string in d_chars
char* d_chars{}; ///< output buffer

/**
Expand Down
26 changes: 15 additions & 11 deletions cpp/src/text/stemmer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -184,17 +184,19 @@ struct dispatch_is_letter_fn {
struct porter_stemmer_measure_fn {
cudf::column_device_view const d_strings; // strings to measure

__device__ int32_t operator()(cudf::size_type idx) const
__device__ cudf::size_type operator()(cudf::size_type idx) const
{
if (d_strings.is_null(idx)) return 0;
if (d_strings.is_null(idx)) { return 0; }
cudf::string_view d_str = d_strings.element<cudf::string_view>(idx);
if (d_str.empty()) return 0;
int32_t measure = 0;
auto itr = d_str.begin();
bool vowel_run = !is_consonant(itr);
if (d_str.empty()) { return 0; }

cudf::size_type measure = 0;

auto itr = d_str.begin();
bool vowel_run = !is_consonant(itr);
while (itr != d_str.end()) {
if (is_consonant(itr)) {
if (vowel_run) measure++;
if (vowel_run) { measure++; }
vowel_run = false;
} else {
vowel_run = true;
Expand All @@ -211,11 +213,13 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
if (strings.is_empty()) {
return cudf::make_empty_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()});
}

// create empty output column
auto results =
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32},
cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
Expand All @@ -226,7 +230,7 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view c
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings.size()),
results->mutable_view().data<int32_t>(),
results->mutable_view().data<cudf::size_type>(),
porter_stemmer_measure_fn{*strings_column});
results->set_null_count(strings.null_count());
return results;
Expand Down
45 changes: 25 additions & 20 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6)
* @param start_byte_for_thread Which byte to start analyzing
* @return New code point value for this byte.
*/
__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings,
size_t const total_bytes,
uint32_t const start_byte_for_thread)
__device__ uint32_t
extract_code_points_from_utf8(unsigned char const* strings,
size_t const total_bytes,
cudf::thread_index_type const start_byte_for_thread)
{
constexpr uint8_t max_utf8_blocks_for_char = 4;
uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};
Expand Down Expand Up @@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,
constexpr uint32_t init_val = (1 << FILTER_BIT);
uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
uint32_t num_new_chars = 0;
cudf::thread_index_type const char_for_thread =
threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
uint32_t num_new_chars = 0;

if (char_for_thread < total_bytes) {
auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);
Expand Down Expand Up @@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
}

uvector_pair data_normalizer::normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
cudf::size_type const* d_offsets,
cudf::size_type num_strings,
rmm::cuda_stream_view stream) const
{
if (num_strings == 0)
return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
if (num_strings == 0) {
return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
}

// copy offsets to working memory
size_t const num_offsets = num_strings + 1;
auto d_strings_offsets = std::make_unique<rmm::device_uvector<uint32_t>>(num_offsets, stream);
auto const num_offsets = num_strings + 1;
auto d_strings_offsets =
std::make_unique<rmm::device_uvector<cudf::size_type>>(num_offsets, stream);
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<uint32_t>(0),
thrust::make_counting_iterator<uint32_t>(num_offsets),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(num_offsets),
d_strings_offsets->begin(),
[d_offsets] __device__(auto idx) {
auto const offset = d_offsets[0]; // adjust for any offset to the offsets
return d_offsets[idx] - offset;
});
uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream);
if (bytes_count == 0) // if no bytes, nothing to do
return std::pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
std::make_unique<rmm::device_uvector<uint32_t>>(0, stream));
auto const bytes_count = d_strings_offsets->element(num_strings, stream);
if (bytes_count == 0) { // if no bytes, nothing to do
return uvector_pair{std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream)};
}

cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(bytes_count), THREADS_PER_BLOCK, 1};
cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1};
size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks;
size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device;

Expand Down Expand Up @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings,
num_strings,
update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()});

uint32_t const num_chars = d_strings_offsets->element(num_strings, stream);
auto const num_chars = d_strings_offsets->element(num_strings, stream);
d_code_points->resize(num_chars, stream); // should be smaller than original allocated size

// return the normalized code points and the new offsets
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/text/subword/detail/data_normalizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,11 +18,13 @@

#include <text/subword/detail/cp_data.h>

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

using uvector_pair = std::pair<std::unique_ptr<rmm::device_uvector<uint32_t>>,
std::unique_ptr<rmm::device_uvector<uint32_t>>>;
std::unique_ptr<rmm::device_uvector<cudf::size_type>>>;

namespace nvtext {
namespace detail {
Expand Down Expand Up @@ -85,8 +87,8 @@ class data_normalizer {
* used to locate the code points for each string.
*/
uvector_pair normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
cudf::size_type const* d_offsets,
cudf::size_type num_strings,
rmm::cuda_stream_view stream) const;

private:
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,8 @@

#include <text/subword/detail/cp_data.h>

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

Expand Down Expand Up @@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64;
*/
struct update_strings_lengths_fn {
uint32_t const* d_chars_up_to_idx;
uint32_t* d_offsets;
__device__ void operator()(uint32_t idx)
cudf::size_type* d_offsets;

__device__ void operator()(cudf::size_type idx)
{
auto const offset = d_offsets[idx];
d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class wordpiece_tokenizer {
* @return Pointer to token-ids and token-id offsets
*/
uvector_pair tokenize(char const* d_strings,
uint32_t const* d_offsets,
cudf::size_type const* d_offsets,
uint32_t num_strings,
rmm::cuda_stream_view stream);

Expand Down
20 changes: 12 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ namespace {
__global__ void kernel_compute_tensor_metadata(
// input
uint32_t const* token_ids,
uint32_t const* offsets,
cudf::size_type const* offsets,
uint32_t const* row2tensor,
uint32_t const* row2row_within_tensor,
uint32_t max_sequence_length,
Expand All @@ -71,8 +71,12 @@ __global__ void kernel_compute_tensor_metadata(
uint32_t* attn_mask,
uint32_t* metadata)
{
uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x;
if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return;
cudf::thread_index_type const output_idx =
threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
if (output_idx >= (cudf::thread_index_type(nrows_tensor_token_ids) *
cudf::thread_index_type(max_sequence_length))) {
return;
}

uint32_t const absolute_row_id = output_idx / max_sequence_length;
uint32_t const tensor_id = row2tensor[absolute_row_id];
Expand Down Expand Up @@ -179,18 +183,18 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
std::overflow_error);

auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
auto const d_offsets = offsets.data<cudf::size_type>() + strings.offset();
auto const offset = cudf::detail::get_value<cudf::size_type>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
uint32_t const* device_token_ids = tokens.first->data();
uint32_t const* device_offsets = tokens.second->data();
auto device_token_ids = tokens.first->data();
auto device_offsets = tokens.second->data();

// Format output from tokenizer
// Each string can create 1 or more tensor entries.
Expand Down
24 changes: 15 additions & 9 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x;
cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
Expand Down Expand Up @@ -130,12 +132,14 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin
* @param num_strings The total number of strings to be processed.
*/
__global__ void mark_string_start_and_ends(uint32_t const* code_points,
uint32_t const* strings_offsets,
cudf::size_type const* strings_offsets,
uint32_t* start_word_indices,
uint32_t* end_word_indices,
uint32_t num_strings)
{
uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x;
cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;
// Ensure the starting character of each strings is written to the word start array.
if (idx <= num_strings) {
auto const offset = strings_offsets[idx];
Expand Down Expand Up @@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points,
uint32_t* token_ids,
uint8_t* tokens_per_word)
{
uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x;
cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
static_cast<cudf::thread_index_type>(blockIdx.x) +
threadIdx.x;

if (word_to_tokenize >= total_words) return;
// Each thread gets the start code_point offset for each word and resets the token_id memory to
Expand Down Expand Up @@ -414,7 +420,7 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
}

uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings,
uint32_t const* d_offsets,
cudf::size_type const* d_offsets,
uint32_t num_strings,
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
rmm::cuda_stream_view stream)
{
Expand All @@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32

void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream)
{
uint32_t* device_code_points = cps_and_offsets.first->data();
size_t const num_code_points = cps_and_offsets.first->size();
uint32_t* device_strings_offsets = cps_and_offsets.second->data();
uint32_t const num_strings = cps_and_offsets.second->size() - 1;
auto device_code_points = cps_and_offsets.first->data();
auto const num_code_points = cps_and_offsets.first->size();
auto device_strings_offsets = cps_and_offsets.second->data();
auto const num_strings = cps_and_offsets.second->size() - 1;

size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t);
size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks;
Expand Down
Loading
Loading