diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index bc2b0607193..78dfb6bf1a6 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -107,8 +107,8 @@ constexpr uint32_t UTF8_3BYTE = 0x01'0000; struct codepoint_to_utf8_fn { cudf::column_device_view const d_strings; // input strings uint32_t const* cp_data; // full code-point array - int32_t const* d_cp_offsets{}; // offsets to each string's code-point array - int32_t* d_offsets{}; // offsets for the output strings + cudf::size_type const* d_cp_offsets{}; // offsets to each string's code-point array + cudf::size_type* d_offsets{}; // offsets for the output strings char* d_chars{}; // buffer for the output strings column /** @@ -118,7 +118,7 @@ struct codepoint_to_utf8_fn { * @param count number of code-points in `str_cps` * @return Number of bytes required for the output */ - __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count) + __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count) { return thrust::transform_reduce( thrust::seq, @@ -126,7 +126,7 @@ struct codepoint_to_utf8_fn { str_cps + count, [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); }, 0, - thrust::plus()); + thrust::plus()); } __device__ void operator()(cudf::size_type idx) @@ -208,9 +208,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); @@ -222,8 +222,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // convert the result into a strings column // - the cp_chars are the new 4-byte code-point values for all the characters in the output // - the cp_offsets identify which code-points go with which strings - uint32_t const* cp_chars = result.first->data(); - int32_t const* cp_offsets = reinterpret_cast(result.second->data()); + uint32_t const* cp_chars = result.first->data(); + cudf::size_type const* cp_offsets = result.second->data(); auto d_strings = cudf::column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 3cfaece64d7..d122f048a4e 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -47,7 +47,7 @@ using replace_result = thrust::pair; struct base_token_replacer_fn { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing - int32_t* d_offsets{}; ///< for locating output string in d_chars + cudf::size_type* d_offsets{}; ///< for locating output string in d_chars char* d_chars{}; ///< output buffer /** diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 6aad75bef71..2b2b8429d9c 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,17 +184,19 @@ struct dispatch_is_letter_fn { struct porter_stemmer_measure_fn { cudf::column_device_view const d_strings; // strings to measure - __device__ int32_t operator()(cudf::size_type idx) const + __device__ cudf::size_type operator()(cudf::size_type idx) const { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) { return 0; } cudf::string_view d_str = d_strings.element(idx); - if (d_str.empty()) return 0; - int32_t measure = 0; - auto itr = d_str.begin(); - bool vowel_run = !is_consonant(itr); + if (d_str.empty()) { return 0; } + + cudf::size_type measure = 0; + + auto itr = d_str.begin(); + bool vowel_run = !is_consonant(itr); while (itr != d_str.end()) { if (is_consonant(itr)) { - if (vowel_run) measure++; + if (vowel_run) { measure++; } vowel_run = false; } else { vowel_run = true; @@ -211,11 +213,13 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + if (strings.is_empty()) { + return cudf::make_empty_column(cudf::data_type{cudf::type_to_id()}); + } // create empty output column auto results = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, strings.size(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), @@ -226,7 +230,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), - results->mutable_view().data(), + results->mutable_view().data(), porter_stemmer_measure_fn{*strings_column}); results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 596c8d26e65..34eb95bea5c 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -124,9 +124,10 @@ __device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) * @param start_byte_for_thread Which byte to start analyzing * @return New code point value for this byte. */ -__device__ uint32_t extract_code_points_from_utf8(unsigned char const* strings, - size_t const total_bytes, - uint32_t const start_byte_for_thread) +__device__ uint32_t +extract_code_points_from_utf8(unsigned char const* strings, + size_t const total_bytes, + cudf::thread_index_type const start_byte_for_thread) { constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; @@ -214,8 +215,9 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - uint32_t const char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; - uint32_t num_new_chars = 0; + cudf::thread_index_type const char_for_thread = + threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x); + uint32_t num_new_chars = 0; if (char_for_thread < total_bytes) { auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); @@ -273,31 +275,34 @@ data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, } uvector_pair data_normalizer::normalize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const { - if (num_strings == 0) - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + if (num_strings == 0) { + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } // copy offsets to working memory - size_t const num_offsets = num_strings + 1; - auto d_strings_offsets = std::make_unique>(num_offsets, stream); + auto const num_offsets = num_strings + 1; + auto d_strings_offsets = + std::make_unique>(num_offsets, stream); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_offsets), + thrust::counting_iterator(0), + thrust::counting_iterator(num_offsets), d_strings_offsets->begin(), [d_offsets] __device__(auto idx) { auto const offset = d_offsets[0]; // adjust for any offset to the offsets return d_offsets[idx] - offset; }); - uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); - if (bytes_count == 0) // if no bytes, nothing to do - return std::pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + auto const bytes_count = d_strings_offsets->element(num_strings, stream); + if (bytes_count == 0) { // if no bytes, nothing to do + return uvector_pair{std::make_unique>(0, stream), + std::make_unique>(0, stream)}; + } - cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; + cudf::detail::grid_1d const grid{bytes_count, THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device; @@ -333,7 +338,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, num_strings, update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - uint32_t const num_chars = d_strings_offsets->element(num_strings, stream); + auto const num_chars = d_strings_offsets->element(num_strings, stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size // return the normalized code points and the new offsets diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 927de5a74f9..fb507b88e7e 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,13 @@ #include +#include + #include #include using uvector_pair = std::pair>, - std::unique_ptr>>; + std::unique_ptr>>; namespace nvtext { namespace detail { @@ -85,8 +87,8 @@ class data_normalizer { * used to locate the code points for each string. */ uvector_pair normalize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) const; private: diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 5e8de1ba244..7cc0e7c0e24 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -47,8 +49,9 @@ constexpr int THREADS_PER_BLOCK = 64; */ struct update_strings_lengths_fn { uint32_t const* d_chars_up_to_idx; - uint32_t* d_offsets; - __device__ void operator()(uint32_t idx) + cudf::size_type* d_offsets; + + __device__ void operator()(cudf::size_type idx) { auto const offset = d_offsets[idx]; d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0; diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 2f528dce897..e191890eeca 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -82,8 +82,8 @@ class wordpiece_tokenizer { * @return Pointer to token-ids and token-id offsets */ uvector_pair tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream); private: diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 47f602362f2..1a3084a257f 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -59,7 +59,7 @@ namespace { __global__ void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, - uint32_t const* offsets, + cudf::size_type const* offsets, uint32_t const* row2tensor, uint32_t const* row2row_within_tensor, uint32_t max_sequence_length, @@ -71,8 +71,13 @@ __global__ void kernel_compute_tensor_metadata( uint32_t* attn_mask, uint32_t* metadata) { - uint32_t const output_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (output_idx >= (nrows_tensor_token_ids * max_sequence_length)) return; + cudf::thread_index_type const output_idx = + threadIdx.x + static_cast(blockIdx.x) * + static_cast(blockDim.x); + if (output_idx >= (static_cast(nrows_tensor_token_ids) * + static_cast(max_sequence_length))) { + return; + } uint32_t const absolute_row_id = output_idx / max_sequence_length; uint32_t const tensor_id = row2tensor[absolute_row_id]; @@ -179,9 +184,9 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, std::overflow_error); auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; // Create tokenizer wordpiece_tokenizer tokenizer( @@ -189,8 +194,8 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components - uint32_t const* device_token_ids = tokens.first->data(); - uint32_t const* device_offsets = tokens.second->data(); + auto device_token_ids = tokens.first->data(); + auto device_offsets = tokens.second->data(); // Format output from tokenizer // Each string can create 1 or more tensor entries. diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index b6f6b7eda25..3b912017320 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -82,7 +82,9 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t char_for_thread = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Deal with the start_word_indices array if (char_for_thread < num_code_points) { @@ -130,12 +132,14 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * @param num_strings The total number of strings to be processed. */ __global__ void mark_string_start_and_ends(uint32_t const* code_points, - uint32_t const* strings_offsets, + cudf::size_type const* strings_offsets, uint32_t* start_word_indices, uint32_t* end_word_indices, uint32_t num_strings) { - uint32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type idx = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; // Ensure the starting character of each strings is written to the word start array. if (idx <= num_strings) { auto const offset = strings_offsets[idx]; @@ -330,7 +334,9 @@ __global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, uint32_t* token_ids, uint8_t* tokens_per_word) { - uint32_t const word_to_tokenize = blockDim.x * blockIdx.x + threadIdx.x; + cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * + static_cast(blockIdx.x) + + threadIdx.x; if (word_to_tokenize >= total_words) return; // Each thread gets the start code_point offset for each word and resets the token_id memory to @@ -414,8 +420,8 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, } uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings, - uint32_t const* d_offsets, - uint32_t num_strings, + cudf::size_type const* d_offsets, + cudf::size_type num_strings, rmm::cuda_stream_view stream) { auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream); @@ -433,10 +439,10 @@ struct tranform_fn { // just converting uint8 value to uint32 void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) { - uint32_t* device_code_points = cps_and_offsets.first->data(); - size_t const num_code_points = cps_and_offsets.first->size(); - uint32_t* device_strings_offsets = cps_and_offsets.second->data(); - uint32_t const num_strings = cps_and_offsets.second->size() - 1; + auto device_code_points = cps_and_offsets.first->data(); + auto const num_code_points = cps_and_offsets.first->size(); + auto device_strings_offsets = cps_and_offsets.second->data(); + auto const num_strings = cps_and_offsets.second->size() - 1; size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t); size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 8604152099c..16b9f25b802 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -52,12 +52,13 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, rmm::mr::device_memory_resource* mr) { // create output column - auto token_counts = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - strings_count, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_token_counts = token_counts->mutable_view().data(); + auto token_counts = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + strings_count, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_token_counts = token_counts->mutable_view().data(); // add the counts to the column thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -79,10 +80,10 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource()); auto d_token_counts = token_counts->view(); // create token-index offsets from the counts - rmm::device_uvector token_offsets(strings_count + 1, stream); + rmm::device_uvector token_offsets(strings_count + 1, stream); thrust::inclusive_scan(rmm::exec_policy(stream), - d_token_counts.template begin(), - d_token_counts.template end(), + d_token_counts.template begin(), + d_token_counts.template end(), token_offsets.begin() + 1); token_offsets.set_element_to_zero_async(0, stream); auto const total_tokens = token_offsets.back_element(stream); @@ -177,10 +178,10 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const } auto offsets = strings_column.offsets(); - auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); - auto chars_bytes = - cudf::detail::get_value(offsets, strings_column.offset() + strings_count, stream) - - offset; + auto offset = cudf::detail::get_value(offsets, strings_column.offset(), stream); + auto chars_bytes = cudf::detail::get_value( + offsets, strings_column.offset() + strings_count, stream) - + offset; auto d_chars = strings_column.chars().data(); // unsigned is necessary for checking bits d_chars += offset; @@ -200,16 +201,17 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // create output offsets column // -- conditionally copy a counting iterator where // the first byte of each character is located - auto offsets_column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, - num_characters + 1, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_new_offsets = offsets_column->mutable_view().begin(); + auto offsets_column = + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + num_characters + 1, + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_new_offsets = offsets_column->mutable_view().begin(); thrust::copy_if( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes + 1), + thrust::counting_iterator(0), + thrust::counting_iterator(chars_bytes + 1), d_new_offsets, [d_chars, chars_bytes] __device__(auto idx) { // this will also set the final value to the size chars_bytes diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh index 89825e31e5c..fbd2d1efcff 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -149,7 +149,7 @@ struct characters_tokenizer { struct strings_tokenizer { cudf::column_device_view const d_strings; ///< strings to tokenize cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around - int32_t* d_offsets{}; ///< offsets into the d_tokens vector for each string + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens vector for each string string_index_pair* d_tokens{}; ///< token positions in device memory /** @@ -194,7 +194,7 @@ struct multi_delimiter_strings_tokenizer { cudf::column_device_view const d_strings; ///< strings column to tokenize delimiterator delimiters_begin; ///< first delimiter delimiterator delimiters_end; ///< last delimiter - int32_t* d_offsets{}; ///< offsets into the d_tokens output vector + cudf::size_type* d_offsets{}; ///< offsets into the d_tokens output vector string_index_pair* d_tokens{}; ///< token positions found for each string /**