From 20f5064fb350302047e41b07005e39d0271cde6f Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 30 Jan 2024 23:29:28 -0800 Subject: [PATCH 1/6] Initial implementation --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 206 ++++++++++++++++-- .../ext/intel/experimental/esimd/memory.hpp | 16 +- 2 files changed, 192 insertions(+), 30 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7af684ee99cfb..d3910ce944977 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -2505,11 +2505,13 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, namespace detail { template ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t< - (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) && - (std::is_same_v || - is_accessor_with_v)> + std::is_same_v || + is_accessor_with_v> scatter_impl(AccessorTy acc, simd vals, simd offsets, uint32_t glob_offset, simd_mask mask) { + + static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32), + "Unexpected type or vector length"); constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); // TODO (performance) use hardware-supported scale once BE supports it constexpr int16_t scale = 0; @@ -2662,6 +2664,41 @@ slm_gather_impl(__ESIMD_NS::simd offsets, return lsc_format_ret(Result); } +/// SLM scatter implementation. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.slm +/// +/// Scatters elements located to slm. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam N is the number of channels (platform dependent). +/// @param offsets is the zero-based offsets for SLM buffer in bytes. +/// @param vals is values to store. +/// @param pred is predicates. +/// +template +__ESIMD_API void slm_scatter_impl(__ESIMD_NS::simd offsets, + __ESIMD_NS::simd vals, + __ESIMD_NS::simd_mask pred) { + detail::check_lsc_vector_size(); + detail::check_lsc_data_size(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = + detail::expand_data_size(detail::finalize_data_size()); + constexpr detail::lsc_vector_size LSCVS = detail::to_lsc_vector_size(); + constexpr detail::lsc_data_order Transposed = + detail::lsc_data_order::nontranspose; + using MsgT = typename detail::lsc_expand_type::type; + using CstT = __ESIMD_DNS::uint_type_t; + __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); + __esimd_lsc_store_slm( + pred.data(), offsets.data(), Tmp.data()); +} + } // namespace detail /// @endcond ESIMD_DETAIL @@ -3938,7 +3975,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// @param props The optional compile-time properties. Only 'alignment' -/// and cache hint properties are used. +/// property is used. /// @return A vector of elements read. template __ESIMD_API T slm_scalar_load(uint32_t offset) { return Res[0]; } -/// Scatter operation over the Shared Local Memory. -/// This API has almost the same interface as the @ref accessor_scatter -/// "accessor-based scatter", except that it does not have the accessor and -/// the global offset parameters. -/// -template -__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) && - (sizeof(T) <= 4)> -slm_scatter(simd offsets, simd vals, simd_mask mask = 1) { - detail::LocalAccessorMarker acc; - detail::scatter_impl(acc, vals, offsets, 0, mask); +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// void slm_scatter(simd byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-2) +/// +/// The next 2 functions are variations of the first 2 above (slm-sc-1,2) +/// and were added only to support simd_view instead of simd for byte_offsets. +/// template +/// void slm_scatter(simd_view byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-3) +/// void slm_scatter(simd_view byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-4) + +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. Storage of any element +/// can be disabled via the input vector of predicates \p mask. +/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' property +/// is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "slm_scatter() requires at least element-size alignment"); + + // Use LSC lowering if VS > 1. + if constexpr (VS > 1) { + __ESIMD_DNS::slm_scatter_impl( + byte_offsets, vals, mask); + } else { + detail::LocalAccessorMarker acc; + detail::scatter_impl(acc, vals, byte_offsets, 0, mask); + } +} + +/// template +/// void slm_scatter(simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-2) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + slm_scatter(byte_offsets, vals, Mask, props); +} + +/// template +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (slm-sc-3) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// Storage to any element's memory location can be disabled via the +/// input vector of predicates \p mask. If mask[i] is unset, then the storage to +/// (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + slm_scatter(byte_offsets.read(), vals, mask, props); +} + +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-4) +/// Loads ("gathers") elements of the type 'T' from Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets, and returns the loaded +/// elements. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + return slm_scatter(byte_offsets.read(), vals, props); } /// Store a scalar value into the Shared Local Memory. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 069898f14e747..42b77137c1a91 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1424,21 +1424,7 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using CstT = typename detail::lsc_bitcast_type::type; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); - __esimd_lsc_store_slm( - pred.data(), offsets.data(), Tmp.data()); + __ESIMD_DNS::slm_scatter_impl(offsets, vals, pred); } /// Transposed SLM scatter with 1 channel. From 19c3dd611cdfd8521fb39dea790284d6222b2b98 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Thu, 1 Feb 2024 22:09:36 -0800 Subject: [PATCH 2/6] Add tests --- .../unified_memory_api/Inputs/scatter.hpp | 219 ++++++++++++++++++ .../ESIMD/unified_memory_api/slm_scatter.cpp | 33 +++ .../slm_scatter_dg2_pvc.cpp | 34 +++ sycl/test/esimd/memory_properties.cpp | 39 ++++ 4 files changed, 325 insertions(+) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 270ecc4e698af..f61bab0e8cc27 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -225,3 +225,222 @@ template bool testUSM(queue Q) { return Passed; } + +template +bool testSLM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 1; + constexpr size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "SLM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + uint16_t LocalID = ndi.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + slm_init(); + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(Out + GlobalElemOffset + I); + simd offsets(I * sizeof(T), sizeof(T)); + slm_scatter(offsets, InVec); + } + } + barrier(); + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + + simd Vals = slm_gather(ByteOffsets, Props); + Vals *= 2; + + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } + barrier(); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd offsets(I * sizeof(T), sizeof(T)); + simd OutVec = slm_gather(offsets); + OutVec.copy_to(Out + GlobalElemOffset + I); + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testSLM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // Test scatter() that is available on Gen12 and PVC. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + + Passed &= testSLM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testSLM(Q, 2, AlignElemProps) + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp new file mode 100644 index 0000000000000..96c49fcdae8af --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp @@ -0,0 +1,33 @@ +//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting +// optional compile-time esimd::properties. +// The scatter() calls in this test do not use DG2/PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp new file mode 100644 index 0000000000000..26106a5d9b3af --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp @@ -0,0 +1,34 @@ +//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting +// optional compile-time esimd::properties. +// The slm_scatter() calls in this test use DG2 or PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 31dbc3e889f4b..87f43ef7f3553 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1303,4 +1303,43 @@ test_slm_gather_scatter(int byte_offset32) { props_align4); slm = slm_gather(ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // Test SLM scatter using this plan: + // 1) slm_scatter(offsets): offsets is simd or simd_view + // 2) slm_scatter(offsets, mask): offsets is simd or simd_view + // 4) slm_scatter(...): same as (1), (2) above, but with VS > 1. + + // 1) slm_scatter(offsets): offsets is simd or simd_view + // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm); + slm_scatter(ioffset_n32_view, slm); + + // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, props_align8); + slm_scatter(ioffset_n32_view, slm, props_align8); + + // 2) slm_gather(offsets, mask): offsets is simd or simd_view + // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32); + slm_scatter(ioffset_n32_view, slm, mask_n32); + + // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); + + // 4) slm_gather(...): same as (1), (2), above, but with VS > 1. + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + // 4a) check VS > 1. no 'mask' operand first. + slm_scatter(ioffset_n16, slm); + slm_scatter(ioffset_n16_view, slm); + + slm_scatter(ioffset_n16, slm, props_align4); + slm_scatter(ioffset_n16_view, slm, props_align4); + + // 4b) check VS > 1. Pass the 'mask' operand this time. + slm_scatter(ioffset_n16, slm, mask_n16); + slm_scatter(ioffset_n16_view, slm, mask_n16); + + slm_scatter(ioffset_n16, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); } From 04451837f1d2888b7f831965a5bff651618f965d Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Thu, 1 Feb 2024 22:21:01 -0800 Subject: [PATCH 3/6] Update the test --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index f61bab0e8cc27..07d471d1fc19a 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -265,8 +265,8 @@ bool testSLM(queue Q, uint32_t MaskStride, if (LocalID == 0) { for (int I = 0; I < Threads * N; I += 8) { simd InVec(Out + GlobalElemOffset + I); - simd offsets(I * sizeof(T), sizeof(T)); - slm_scatter(offsets, InVec); + simd Offsets(I * sizeof(T), sizeof(T)); + slm_scatter(Offsets, InVec); } } barrier(); @@ -370,8 +370,8 @@ bool testSLM(queue Q, uint32_t MaskStride, barrier(); if (LocalID == 0) { for (int I = 0; I < Threads * N; I += 8) { - simd offsets(I * sizeof(T), sizeof(T)); - simd OutVec = slm_gather(offsets); + simd Offsets(I * sizeof(T), sizeof(T)); + simd OutVec = slm_gather(Offsets); OutVec.copy_to(Out + GlobalElemOffset + I); } } From 76223e5005dbc578d371942adece3955ae37821f Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Fri, 2 Feb 2024 12:24:10 -0800 Subject: [PATCH 4/6] Fix test failure for Gen12 and adress PR comments --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 4 ++-- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 6 +++--- .../ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index d173069502082..14390fba7b7cd 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3958,7 +3958,7 @@ slm_gather(simd byte_offsets, simd_mask mask, static_assert(Alignment >= sizeof(T), "slm_gather() requires at least element-size alignment"); - if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) && + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && !detail::isMaskedGatherScatterLLVMAvailable())) { simd PassThru; // Intentionally undefined return detail::slm_gather_impl( @@ -4251,7 +4251,7 @@ slm_scatter(simd byte_offsets, simd vals, "slm_scatter() requires at least element-size alignment"); // Use LSC lowering if VS > 1. - if constexpr (VS > 1) { + if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) { __ESIMD_DNS::slm_scatter_impl( byte_offsets, vals, mask); } else { diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 07d471d1fc19a..3a5c624328688 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -369,9 +369,9 @@ bool testSLM(queue Q, uint32_t MaskStride, } barrier(); if (LocalID == 0) { - for (int I = 0; I < Threads * N; I += 8) { - simd Offsets(I * sizeof(T), sizeof(T)); - simd OutVec = slm_gather(Offsets); + for (int I = 0; I < Threads * N; I++) { + simd Offsets(I * sizeof(T), sizeof(T)); + simd OutVec = slm_gather(Offsets); OutVec.copy_to(Out + GlobalElemOffset + I); } } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp index 26106a5d9b3af..4ebe20e376cf1 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp @@ -28,6 +28,9 @@ int main() { Passed &= testSLM(Q); Passed &= testSLM(Q); Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testSLM(Q); std::cout << (Passed ? "Passed\n" : "FAILED\n"); return Passed ? 0 : 1; From bb2f87838ec3921666c1615152fa6d1b27acc715 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 6 Feb 2024 11:27:06 -0800 Subject: [PATCH 5/6] Address PR comments --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 42 +++++++++----------- sycl/test/esimd/memory_properties.cpp | 33 +++++++++++---- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index f57b7b1f66e74..3dafe61e3256a 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -2822,10 +2822,9 @@ gather_impl(AccessorT acc, simd byte_offsets, /// @return is a vector of type T and size N * NElts. /// template -__ESIMD_API __ESIMD_NS::simd -slm_gather_impl(__ESIMD_NS::simd offsets, - __ESIMD_NS::simd_mask pred, - __ESIMD_NS::simd pass_thru) { +__ESIMD_API simd slm_gather_impl(simd offsets, + simd_mask pred, + simd pass_thru) { check_lsc_vector_size(); check_lsc_data_size(); constexpr uint16_t AddressScale = 1; @@ -2834,9 +2833,8 @@ slm_gather_impl(__ESIMD_NS::simd offsets, constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; - __ESIMD_NS::simd PassThruExpanded = - lsc_format_input(pass_thru); - __ESIMD_NS::simd Result = + simd PassThruExpanded = lsc_format_input(pass_thru); + simd Result = __esimd_lsc_load_merge_slm(pred.data(), offsets.data(), @@ -2859,21 +2857,17 @@ slm_gather_impl(__ESIMD_NS::simd offsets, /// @param pred is predicates. /// template -__ESIMD_API void slm_scatter_impl(__ESIMD_NS::simd offsets, - __ESIMD_NS::simd vals, - __ESIMD_NS::simd_mask pred) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); +__ESIMD_API void slm_scatter_impl(simd offsets, + simd vals, simd_mask pred) { + check_lsc_vector_size(); + check_lsc_data_size(); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; - constexpr lsc_data_size EDS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size LSCVS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using CstT = __ESIMD_DNS::uint_type_t; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd Tmp = lsc_format_input(vals); __esimd_lsc_store_slm( pred.data(), offsets.data(), Tmp.data()); @@ -4181,8 +4175,8 @@ template __ESIMD_API T slm_scalar_load(uint32_t offset) { /// template /// void slm_scatter(simd byte_offsets, -/// simd vals, simd_mask mask, -/// PropertyListT props = {}); // (slm-sc-1) +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) /// void slm_scatter(simd byte_offsets, /// simd vals, PropertyListT props = {}); // (slm-sc-2) /// @@ -4252,7 +4246,7 @@ slm_scatter(simd byte_offsets, simd vals, /// @tparam N Number of elements to read. /// @tparam VS Vector size. It can also be read as the number of reads per each /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported -/// only on DG2 and PVC. +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// @param vals The vector of values to store. @@ -4283,7 +4277,7 @@ slm_scatter(simd byte_offsets, simd vals, /// @tparam N Number of elements to read. /// @tparam VS Vector size. It can also be read as the number of reads per each /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported -/// only on DG2 and PVC. +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// If the alignment property is not passed, then it is assumed that each diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 1103ed11bc26d..eb629935347db 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1302,6 +1302,7 @@ test_slm_gather_scatter(int byte_offset32) { simd slm; simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto slm_view = slm.select<32, 1>(); // Test SLM gather using this plan: // 1) slm_gather(offsets): offsets is simd or simd_view @@ -1375,41 +1376,57 @@ test_slm_gather_scatter(int byte_offset32) { props_align4); // Test SLM scatter using this plan: - // 1) slm_scatter(offsets): offsets is simd or simd_view - // 2) slm_scatter(offsets, mask): offsets is simd or simd_view - // 4) slm_scatter(...): same as (1), (2) above, but with VS > 1. + // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view + // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view + // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1. // 1) slm_scatter(offsets): offsets is simd or simd_view - // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) slm_scatter(ioffset_n32, slm); slm_scatter(ioffset_n32_view, slm); + slm_scatter(ioffset_n32, slm_view); + slm_scatter(ioffset_n32_view, slm_view); - // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) slm_scatter(ioffset_n32, slm, props_align8); slm_scatter(ioffset_n32_view, slm, props_align8); + slm_scatter(ioffset_n32, slm_view, props_align8); + slm_scatter(ioffset_n32_view, slm_view, props_align8); // 2) slm_gather(offsets, mask): offsets is simd or simd_view - // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32); slm_scatter(ioffset_n32_view, slm, mask_n32); + slm_scatter(ioffset_n32, slm_view, mask_n32); + slm_scatter(ioffset_n32_view, slm_view, mask_n32); - // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32, props_align8); slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm_view, mask_n32, props_align8); // 4) slm_gather(...): same as (1), (2), above, but with VS > 1. - // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) // 4a) check VS > 1. no 'mask' operand first. slm_scatter(ioffset_n16, slm); slm_scatter(ioffset_n16_view, slm); + slm_scatter(ioffset_n16, slm_view); + slm_scatter(ioffset_n16_view, slm_view); slm_scatter(ioffset_n16, slm, props_align4); slm_scatter(ioffset_n16_view, slm, props_align4); + slm_scatter(ioffset_n16, slm_view, props_align4); + slm_scatter(ioffset_n16_view, slm_view, props_align4); // 4b) check VS > 1. Pass the 'mask' operand this time. slm_scatter(ioffset_n16, slm, mask_n16); slm_scatter(ioffset_n16_view, slm, mask_n16); + slm_scatter(ioffset_n16, slm_view, mask_n16); + slm_scatter(ioffset_n16_view, slm_view, mask_n16); slm_scatter(ioffset_n16, slm, mask_n16, props_align4); slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16, slm_view, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4); } From 4fa7497acf0ffad34b87ec1bfed23800655930d3 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 6 Feb 2024 13:13:19 -0800 Subject: [PATCH 6/6] Address PR comments --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 3dafe61e3256a..7f4833017d38a 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -4184,10 +4184,10 @@ template __ESIMD_API T slm_scalar_load(uint32_t offset) { /// and were added only to support simd_view instead of simd for byte_offsets. /// template -/// void slm_scatter(simd_view byte_offsets, +/// void slm_scatter(OffsetSimdViewT byte_offsets, /// simd vals, simd_mask mask, /// PropertyListT props = {}); // (slm-sc-3) -/// void slm_scatter(simd_view byte_offsets, +/// void slm_scatter(OffsetSimdViewT byte_offsets, /// simd vals, PropertyListT props = {}); // (slm-sc-4) /// template vals, /// @tparam N Number of elements to read. /// @tparam VS Vector size. It can also be read as the number of reads per each /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported -/// only on DG2 and PVC. +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// @param vals The vector of values to store.