Skip to content

Commit

Permalink
Support UTF-8 BOM in CSV reader (#13516)
Browse files Browse the repository at this point in the history
Adds support to the CSV reader to skip the UTF-8 BOM bytes at the beginning of a text source.

Some text files may contain a BOM (byte order mark) at the beginning of the file to identify its encoding.
https://en.wikipedia.org/wiki/Byte_order_mark

cuDF only supports UTF-8 encoding, so skipping these bytes allows us to support these types of files.

Closes #12516

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #13516
  • Loading branch information
davidwendt authored Jun 7, 2023
1 parent a99f313 commit 0c68ca1
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 3 deletions.
13 changes: 10 additions & 3 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -413,9 +413,16 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> select_data_and_row_
auto data_size = (range_size_padded != 0) ? range_size_padded : source->size();
auto buffer = source->host_read(range_offset, data_size);

auto h_data = host_span<char const>( //
reinterpret_cast<const char*>(buffer->data()),
buffer->size());
// check for and skip UTF-8 BOM
auto buffer_data = buffer->data();
auto buffer_size = buffer->size();
uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF};
if (buffer_size > sizeof(UTF8_BOM) && memcmp(buffer_data, UTF8_BOM, sizeof(UTF8_BOM)) == 0) {
buffer_data += sizeof(UTF8_BOM);
buffer_size -= sizeof(UTF8_BOM);
}

auto h_data = host_span<char const>(reinterpret_cast<char const*>(buffer_data), buffer_size);

std::vector<uint8_t> h_uncomp_data_owner;

Expand Down
18 changes: 18 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2480,4 +2480,22 @@ TEST_F(CsvReaderTest, NullCount)
EXPECT_EQ(result_view.column(2).null_count(), 8);
}

// Reads a CSV held in host memory whose first three bytes are the UTF-8 BOM
// (EF BB BF) and checks that the parsed header and data are unaffected by it.
TEST_F(CsvReaderTest, UTF8BOM)
{
  // BOM bytes, then a header row followed by three data rows.
  std::string csv_text = "\xEF\xBB\xBFMonth,Day,Year\nJune,6,2023\nAugust,25,1990\nMay,1,2000\n";

  cudf::io::csv_reader_options read_opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_text.c_str(), csv_text.size()});
  auto const parsed      = cudf::io::read_csv(read_opts);
  auto const parsed_view = parsed.tbl->view();

  // Three data rows are expected; the first column name must be exactly
  // "Month", i.e. the BOM bytes must not end up in the header.
  EXPECT_EQ(parsed_view.num_rows(), 3);
  EXPECT_EQ(parsed.metadata.schema_info.front().name, "Month");

  // Expected table contents, one wrapper per CSV column.
  auto months          = cudf::test::strings_column_wrapper({"June", "August", "May"});
  auto days            = cudf::test::fixed_width_column_wrapper<int64_t>({6, 25, 1});
  auto years           = cudf::test::fixed_width_column_wrapper<int64_t>({2023, 1990, 2000});
  auto expected_table  = cudf::table_view({months, days, years});

  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(parsed_view, expected_table);
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit 0c68ca1

Please sign in to comment.