From 20f5064fb350302047e41b07005e39d0271cde6f Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Tue, 30 Jan 2024 23:29:28 -0800
Subject: [PATCH 1/6] Initial implementation

---
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 206 ++++++++++++++++--
 .../ext/intel/experimental/esimd/memory.hpp   |  16 +-
 2 files changed, 192 insertions(+), 30 deletions(-)
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7af684ee99cfb..d3910ce944977 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2505,11 +2505,13 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
 namespace detail {
 template <typename T, int N, typename AccessorTy>
 ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
-    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
-    (std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
-     is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>)>
+    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
+    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
              uint32_t glob_offset, simd_mask<N> mask) {
+
+  static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
+                "Unexpected type or vector length");
   constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
   // TODO (performance) use hardware-supported scale once BE supports it
   constexpr int16_t scale = 0;
@@ -2662,6 +2664,41 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   return lsc_format_ret<T>(Result);
 }
 
+/// SLM scatter implementation.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_store.slm
+///
+/// Scatters elements located to slm.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to store per address.
+/// @tparam DS is the data size.
+/// @tparam N is the number of channels (platform dependent).
+/// @param offsets is the zero-based offsets for SLM buffer in bytes.
+/// @param vals is values to store.
+/// @param pred is predicates.
+///
+template <typename T, int NElts, lsc_data_size DS, int N>
+__ESIMD_API void slm_scatter_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
+                                  __ESIMD_NS::simd<T, N * NElts> vals,
+                                  __ESIMD_NS::simd_mask<N> pred) {
+  detail::check_lsc_vector_size<NElts>();
+  detail::check_lsc_data_size<T, DS>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS =
+      detail::expand_data_size(detail::finalize_data_size<T, DS>());
+  constexpr detail::lsc_vector_size LSCVS = detail::to_lsc_vector_size<NElts>();
+  constexpr detail::lsc_data_order Transposed =
+      detail::lsc_data_order::nontranspose;
+  using MsgT = typename detail::lsc_expand_type<T>::type;
+  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
+  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
+  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
+                        ImmOffset, EDS, LSCVS, Transposed, N>(
+      pred.data(), offsets.data(), Tmp.data());
+}
+
 } // namespace detail
 
 /// @endcond ESIMD_DETAIL
@@ -3938,7 +3975,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param props The optional compile-time properties. Only 'alignment'
-/// and cache hint properties are used.
+/// property is used.
 /// @return A vector of elements read.
 template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
           typename PropertyListT =
@@ -3961,17 +3998,156 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
   return Res[0];
 }
 
-/// Scatter operation over the Shared Local Memory.
-/// This API has almost the same interface as the @ref accessor_scatter
-/// "accessor-based scatter", except that it does not have the accessor and
-/// the global offset parameters.
-///
-template <typename T, int N>
-__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
-                             (sizeof(T) <= 4)>
-slm_scatter(simd<uint32_t, N> offsets, simd<T, N> vals, simd_mask<N> mask = 1) {
-  detail::LocalAccessorMarker acc;
-  detail::scatter_impl<T, N>(acc, vals, offsets, 0, mask);
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, simd_mask<N / VS> mask,
+///                   PropertyListT props = {});                   // (slm-sc-1)
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, PropertyListT props = {});  // (slm-sc-2)
+///
+/// The next 2 functions are variations of the first 2 above (slm-sc-1,2)
+/// and were added only to support simd_view instead of simd for byte_offsets.
+/// template <typename T, int N, int VS = 1, typename OffsetObjT,
+///           typename OffsetRegionT, typename PropertyListT = empty_props_t>
+/// void slm_scatter(simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
+///             simd<T, N> vals, simd_mask<N / VS> mask,
+///             PropertyListT props = {});                         // (slm-sc-3)
+/// void slm_scatter(simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
+///             simd<T, N> vals, PropertyListT props = {});        // (slm-sc-4)
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
+///                   simd<T, N> vals, simd_mask<N / VS> mask,
+///                   PropertyListT props = {});                   // (slm-sc-1)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets. Storage of any element
+/// can be disabled via the input vector of predicates \p mask.
+/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of writes per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask, defaults to all 1s.
+/// @param props The optional compile-time properties. Only 'alignment' property
+/// is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
+
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(T));
+  static_assert(Alignment >= sizeof(T),
+                "slm_scatter() requires at least element-size alignment");
+
+  // Use LSC lowering if VS > 1.
+  if constexpr (VS > 1) {
+    __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
+        byte_offsets, vals, mask);
+  } else {
+    detail::LocalAccessorMarker acc;
+    detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
+  }
+}
+
+/// template <typename T, int N, int VS = 1,
+///           typename PropertyListT = empty_properties_t>
+/// void slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+///                   PropertyListT props = {});                   // (slm-sc-2)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  slm_scatter<T, N, VS>(byte_offsets, vals, Mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+///           typename PropertyListT = empty_props_t>
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             simd_mask<N / VS> mask, PropertyListT props = {}); // (slm-sc-3)
+/// Stores ("scatters") elements of the type 'T' to Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets.
+/// Storage to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the storage to
+/// (byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// If the alignment property is not passed, then it is assumed that each
+/// accessed address is aligned by element-size.
+/// @param vals The vector of values to store.
+/// @param mask The access mask, defaults to all 1s.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            simd_mask<N / VS> mask, PropertyListT props = {}) {
+  slm_scatter<T, N, VS>(byte_offsets.read(), vals, mask, props);
+}
+
+/// void slm_scatter(
+///             OffsetSimdViewT byte_offsets, simd<T, N> vals,
+///             PropertyListT props = {});                         // (slm-sc-4)
+/// Loads ("gathers") elements of the type 'T' from Shared Local Memory
+/// locations addressed by byte offsets \p byte_offsets, and returns the loaded
+/// elements.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per each
+/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
+/// only on DG2 and PVC.
+/// @param byte_offsets the vector of 32-bit offsets in bytes.
+/// For each i, (byte_offsets[i]) must be element size aligned.
+/// @param vals The vector of values to store.
+/// @param props The optional compile-time properties. Only 'alignment'
+/// property is used.
+template <typename T, int N, int VS = 1, typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::detail::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
+            PropertyListT props = {}) {
+  return slm_scatter<T, N, VS>(byte_offsets.read(), vals, props);
 }
 
 /// Store a scalar value into the Shared Local Memory.
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 069898f14e747..42b77137c1a91 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1424,21 +1424,7 @@ template <typename T, int NElts = 1,
 __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd<uint32_t, N> offsets,
                                  __ESIMD_NS::simd<T, N * NElts> vals,
                                  __ESIMD_NS::simd_mask<N> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = typename detail::lsc_bitcast_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
-  __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, _AddressScale,
-                        _ImmOffset, _DS, _VS, _Transposed, N>(
-      pred.data(), offsets.data(), Tmp.data());
+  __ESIMD_DNS::slm_scatter_impl<T, NElts, DS>(offsets, vals, pred);
 }
 
 /// Transposed SLM scatter with 1 channel.

From 19c3dd611cdfd8521fb39dea790284d6222b2b98 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Thu, 1 Feb 2024 22:09:36 -0800
Subject: [PATCH 2/6] Add tests

---
 .../unified_memory_api/Inputs/scatter.hpp     | 219 ++++++++++++++++++
 .../ESIMD/unified_memory_api/slm_scatter.cpp  |  33 +++
 .../slm_scatter_dg2_pvc.cpp                   |  34 +++
 sycl/test/esimd/memory_properties.cpp         |  39 ++++
 4 files changed, 325 insertions(+)
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
 create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index 270ecc4e698af..f61bab0e8cc27 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -225,3 +225,222 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
 
   return Passed;
 }
+
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UseProperties,
+          typename ScatterPropertiesT>
+bool testSLM(queue Q, uint32_t MaskStride,
+             ScatterPropertiesT ScatterProperties) {
+  constexpr uint32_t Groups = 8;
+  constexpr uint32_t Threads = 1;
+  constexpr size_t Size = Groups * Threads * N;
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
+
+  std::cout << "SLM case: T=" << esimd_test::type_name<T>() << ",N=" << N
+            << ", VS=" << VS << ",UseMask=" << UseMask
+            << ",UseProperties=" << UseProperties << std::endl;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = static_cast<T *>(sycl::malloc_shared(Size * sizeof(T), Q));
+  for (size_t i = 0; i < Size; i++)
+    Out[i] = i;
+
+  try {
+    Q.submit([&](handler &cgh) {
+       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+         ScatterPropertiesT Props{};
+         uint16_t GlobalID = ndi.get_global_id(0);
+         uint16_t LocalID = ndi.get_local_id(0);
+         uint32_t GlobalElemOffset = GlobalID * N;
+         uint32_t LocalElemOffset = LocalID * N;
+
+         constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
+         slm_init<SLMSize>();
+
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I += 8) {
+             simd<T, 8> InVec(Out + GlobalElemOffset + I);
+             simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
+             slm_scatter<T>(offsets, InVec);
+           }
+         }
+         barrier();
+
+         simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
+                                              VS * sizeof(T));
+         auto ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+
+         simd<T, N> Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
+         Vals *= 2;
+
+         auto ValsView = Vals.template select<N, 1>();
+         simd_mask<NOffsets> Pred = 0;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter<T, N, VS>(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N, VS>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N, VS>(ByteOffsetsView, ValsView);
+             }
+           }
+         } else { // VS == 1
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Pred, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Pred);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter<T, N>(ByteOffsetsView, Vals, Pred);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Pred);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Pred);
+             }
+           } else { // UseMask == false
+             if constexpr (UseProperties) {
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals, Props);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals, Props);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView, Props);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView, Props);
+             } else { // UseProperties == false
+               if (GlobalID % 4 == 0)
+                 slm_scatter(ByteOffsets, Vals);
+               else if (GlobalID % 4 == 1)
+                 slm_scatter(ByteOffsetsView, Vals);
+               else if (GlobalID % 4 == 2)
+                 slm_scatter<T, N>(ByteOffsets, ValsView);
+               else if (GlobalID % 4 == 3)
+                 slm_scatter<T, N>(ByteOffsetsView, ValsView);
+             }
+           }
+         }
+         barrier();
+         if (LocalID == 0) {
+           for (int I = 0; I < Threads * N; I += 8) {
+             simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
+             simd<T, 8> OutVec = slm_gather<T>(offsets);
+             OutVec.copy_to(Out + GlobalElemOffset + I);
+           }
+         }
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask);
+
+  sycl::free(Out, Q);
+
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testSLM(queue Q) {
+  constexpr bool CheckMask = true;
+  constexpr bool CheckProperties = true;
+  properties EmptyProps;
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+
+  // Test scatter() that is available on Gen12 and PVC.
+  Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, !CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 1, EmptyProps);
+  Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, EmptyProps);
+
+  // // Test scatter() without passing compile-time properties argument.
+  Passed &= testSLM<T, 16, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, !CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 16, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+  Passed &= testSLM<T, 32, 1, CheckMask, !CheckProperties>(Q, 2, EmptyProps);
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+    properties LSCProps{alignment<sizeof(T)>};
+    Passed &= testSLM<T, 1, 1, !CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 2, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 4, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+    Passed &= testSLM<T, 8, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    Passed &= testSLM<T, 32, 1, CheckMask, CheckProperties>(Q, 2, LSCProps);
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &=
+      //     testSLM<T, 16, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps)
+      Passed &=
+          testSLM<T, 32, 2, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+      Passed &=
+          testSLM<T, 32, 2, CheckMask, !CheckProperties>(Q, 2, AlignElemProps);
+    }
+  } // TestPVCFeatures
+
+  return Passed;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
new file mode 100644
index 0000000000000..96c49fcdae8af
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
@@ -0,0 +1,33 @@
+//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting
+// optional compile-time esimd::properties.
+// The scatter() calls in this test do not use DG2/PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
new file mode 100644
index 0000000000000..26106a5d9b3af
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
@@ -0,0 +1,34 @@
+//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_scatter() functions accepting
+// optional compile-time esimd::properties.
+// The slm_scatter() calls in this test use DG2 or PVC features.
+
+#include "Inputs/scatter.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::PVC;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 31dbc3e889f4b..87f43ef7f3553 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1303,4 +1303,43 @@ test_slm_gather_scatter(int byte_offset32) {
                                  props_align4);
   slm = slm_gather<float, 32, 2>(ioffset_n16_view, mask_n16, pass_thru_view,
                                  props_align4);
+
+  // Test SLM scatter using this plan:
+  // 1) slm_scatter(offsets): offsets is simd or simd_view
+  // 2) slm_scatter(offsets, mask): offsets is simd or simd_view
+  // 4) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+
+  // 1) slm_scatter(offsets): offsets is simd or simd_view
+  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm);
+  slm_scatter<float, 32>(ioffset_n32_view, slm);
+
+  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, props_align8);
+
+  // 2) slm_gather(offsets, mask): offsets is simd or simd_view
+  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32);
+
+  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  slm_scatter<float>(ioffset_n32, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32, props_align8);
+
+  // 4) slm_gather(...): same as (1), (2), above, but with VS > 1.
+  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
+  // 4a) check VS > 1. no 'mask' operand first.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, props_align4);
+
+  // 4b) check VS > 1. Pass the 'mask' operand this time.
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16);
+
+  slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
 }

From 04451837f1d2888b7f831965a5bff651618f965d Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Thu, 1 Feb 2024 22:21:01 -0800
Subject: [PATCH 3/6] Update the test

---
 sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index f61bab0e8cc27..07d471d1fc19a 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -265,8 +265,8 @@ bool testSLM(queue Q, uint32_t MaskStride,
          if (LocalID == 0) {
            for (int I = 0; I < Threads * N; I += 8) {
              simd<T, 8> InVec(Out + GlobalElemOffset + I);
-             simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
-             slm_scatter<T>(offsets, InVec);
+             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
+             slm_scatter<T>(Offsets, InVec);
            }
          }
          barrier();
@@ -370,8 +370,8 @@ bool testSLM(queue Q, uint32_t MaskStride,
          barrier();
          if (LocalID == 0) {
            for (int I = 0; I < Threads * N; I += 8) {
-             simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
-             simd<T, 8> OutVec = slm_gather<T>(offsets);
+             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
+             simd<T, 8> OutVec = slm_gather<T>(Offsets);
              OutVec.copy_to(Out + GlobalElemOffset + I);
            }
          }

From 76223e5005dbc578d371942adece3955ae37821f Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Fri, 2 Feb 2024 12:24:10 -0800
Subject: [PATCH 4/6] Fix test failure for Gen12 and adress PR comments

---
 sycl/include/sycl/ext/intel/esimd/memory.hpp                | 4 ++--
 sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp   | 6 +++---
 .../ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp        | 3 +++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index d173069502082..14390fba7b7cd 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -3958,7 +3958,7 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
   static_assert(Alignment >= sizeof(T),
                 "slm_gather() requires at least element-size alignment");
 
-  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
+  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
                            !detail::isMaskedGatherScatterLLVMAvailable())) {
     simd<T, N> PassThru; // Intentionally undefined
     return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
@@ -4251,7 +4251,7 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
                 "slm_scatter() requires at least element-size alignment");
 
   // Use LSC lowering if VS > 1.
-  if constexpr (VS > 1) {
+  if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) {
     __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
         byte_offsets, vals, mask);
   } else {
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
index 07d471d1fc19a..3a5c624328688 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp
@@ -369,9 +369,9 @@ bool testSLM(queue Q, uint32_t MaskStride,
          }
          barrier();
          if (LocalID == 0) {
-           for (int I = 0; I < Threads * N; I += 8) {
-             simd<uint32_t, 8> Offsets(I * sizeof(T), sizeof(T));
-             simd<T, 8> OutVec = slm_gather<T>(Offsets);
+           for (int I = 0; I < Threads * N; I++) {
+             simd<uint32_t, 1> Offsets(I * sizeof(T), sizeof(T));
+             simd<T, 1> OutVec = slm_gather<T>(Offsets);
              OutVec.copy_to(Out + GlobalElemOffset + I);
            }
          }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
index 26106a5d9b3af..4ebe20e376cf1 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp
@@ -28,6 +28,9 @@ int main() {
     Passed &= testSLM<sycl::half, TestFeatures>(Q);
   Passed &= testSLM<uint32_t, TestFeatures>(Q);
   Passed &= testSLM<float, TestFeatures>(Q);
+  Passed &= testSLM<uint64_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double, TestFeatures>(Q);
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;

From bb2f87838ec3921666c1615152fa6d1b27acc715 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Tue, 6 Feb 2024 11:27:06 -0800
Subject: [PATCH 5/6] Address PR comments

---
 sycl/include/sycl/ext/intel/esimd/memory.hpp | 42 +++++++++-----------
 sycl/test/esimd/memory_properties.cpp        | 33 +++++++++++----
 2 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index f57b7b1f66e74..3dafe61e3256a 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2822,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 /// @return is a vector of type T and size N * NElts.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
-slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                __ESIMD_NS::simd_mask<N> pred,
-                __ESIMD_NS::simd<T, N * NElts> pass_thru) {
+__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
+                                               simd_mask<N> pred,
+                                               simd<T, N * NElts> pass_thru) {
   check_lsc_vector_size<NElts>();
   check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
@@ -2834,9 +2833,8 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
-      lsc_format_input<MsgT>(pass_thru);
-  __ESIMD_NS::simd<MsgT, N * NElts> Result =
+  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
+  simd<MsgT, N * NElts> Result =
       __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, LSCVS,
                                  Transposed, N>(pred.data(), offsets.data(),
@@ -2859,21 +2857,17 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
 /// @param pred is predicates.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API void slm_scatter_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                                  __ESIMD_NS::simd<T, N * NElts> vals,
-                                  __ESIMD_NS::simd_mask<N> pred) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
+__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
+                                  simd<T, N * NElts> vals, simd_mask<N> pred) {
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
   constexpr int ImmOffset = 0;
-  constexpr lsc_data_size EDS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size LSCVS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT, T>(vals);
   __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
                         ImmOffset, EDS, LSCVS, Transposed, N>(
       pred.data(), offsets.data(), Tmp.data());
@@ -4181,8 +4175,8 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
 /// template <typename T, int N, int VS = 1,
 ///           typename PropertyListT = empty_properties_t>
 /// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
-///                   simd<T, N> vals, simd_mask<N / VS> mask,
-///                   PropertyListT props = {});                   // (slm-sc-1)
+///                  simd<T, N> vals, simd_mask<N / VS> mask,
+///                  PropertyListT props = {});                   // (slm-sc-1)
 /// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
 ///                   simd<T, N> vals, PropertyListT props = {});  // (slm-sc-2)
 ///
@@ -4252,7 +4246,7 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
 /// @tparam N Number of elements to read.
 /// @tparam VS Vector size. It can also be read as the number of reads per each
 /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
-/// only on DG2 and PVC.
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors..
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param vals The vector of values to store.
@@ -4283,7 +4277,7 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
 /// @tparam N Number of elements to read.
 /// @tparam VS Vector size. It can also be read as the number of reads per each
 /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
-/// only on DG2 and PVC.
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors..
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// If the alignment property is not passed, then it is assumed that each
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 1103ed11bc26d..eb629935347db 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1302,6 +1302,7 @@ test_slm_gather_scatter(int byte_offset32) {
   simd<float, 32> slm;
   simd<float, 32> pass_thru;
   auto pass_thru_view = pass_thru.select<32, 1>();
+  auto slm_view = slm.select<32, 1>();
 
   // Test SLM gather using this plan:
   // 1) slm_gather(offsets): offsets is simd or simd_view
@@ -1375,41 +1376,57 @@ test_slm_gather_scatter(int byte_offset32) {
                                  props_align4);
 
   // Test SLM scatter using this plan:
-  // 1) slm_scatter(offsets): offsets is simd or simd_view
-  // 2) slm_scatter(offsets, mask): offsets is simd or simd_view
-  // 4) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+  // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view
+  // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view
+  // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.
 
   // 1) slm_scatter(offsets): offsets is simd or simd_view
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm);
   slm_scatter<float, 32>(ioffset_n32_view, slm);
+  slm_scatter<float, 32>(ioffset_n32, slm_view);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view);
 
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, props_align8);
   slm_scatter<float, 32>(ioffset_n32_view, slm, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, props_align8);
 
   // 2) slm_gather(offsets, mask): offsets is simd or simd_view
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, mask_n32);
   slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32);
 
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter<float>(ioffset_n32, slm, mask_n32, props_align8);
   slm_scatter<float, 32>(ioffset_n32_view, slm, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32, slm_view, mask_n32, props_align8);
+  slm_scatter<float, 32>(ioffset_n32_view, slm_view, mask_n32, props_align8);
 
   // 4) slm_gather(...): same as (1), (2), above, but with VS > 1.
-  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
+  // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
   // 4a) check VS > 1. no 'mask' operand first.
   slm_scatter<float, 32, 2>(ioffset_n16, slm);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view);
 
   slm_scatter<float, 32, 2>(ioffset_n16, slm, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, props_align4);
 
   // 4b) check VS > 1. Pass the 'mask' operand this time.
   slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16);
 
   slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16, props_align4);
 }

From 4fa7497acf0ffad34b87ec1bfed23800655930d3 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine@intel.com>
Date: Tue, 6 Feb 2024 13:13:19 -0800
Subject: [PATCH 6/6] Address PR comments

---
 sycl/include/sycl/ext/intel/esimd/memory.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 3dafe61e3256a..7f4833017d38a 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -4184,10 +4184,10 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
 /// and were added only to support simd_view instead of simd for byte_offsets.
 /// template <typename T, int N, int VS = 1, typename OffsetObjT,
 ///           typename OffsetRegionT, typename PropertyListT = empty_props_t>
-/// void slm_scatter(simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
 ///             simd<T, N> vals, simd_mask<N / VS> mask,
 ///             PropertyListT props = {});                         // (slm-sc-3)
-/// void slm_scatter(simd_view<OffsetObjT, OffsetRegionT> byte_offsets,
+/// void slm_scatter(OffsetSimdViewT byte_offsets,
 ///             simd<T, N> vals, PropertyListT props = {});        // (slm-sc-4)
 
 /// template <typename T, int N, int VS = 1,
@@ -4307,7 +4307,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// @tparam N Number of elements to read.
 /// @tparam VS Vector size. It can also be read as the number of reads per each
 /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
-/// only on DG2 and PVC.
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param vals The vector of values to store.