Skip to content

Commit

Permalink
Add oneAPI SYCL n-body
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Nov 21, 2023
1 parent c402750 commit cd6d792
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 0 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/cuda/pitch")
add_subdirectory("examples/cuda/viewcopy")
endif()

# SYCL examples: only built when the IntelSYCL CMake package (shipped with Intel oneAPI DPC++,
# provides add_sycl_to_target) is found; otherwise they are silently skipped.
find_package(IntelSYCL)
if (IntelSYCL_FOUND)
add_subdirectory("examples/sycl/nbody")
endif()
endif()

# install
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cmake_minimum_required (VERSION 3.20.5)
project(llama-sycl-nbody)

# Use the in-tree llama target when built as part of the main repository, otherwise locate an installed llama.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} nbody.cpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
# compile nbody.cpp with the SYCL compiler/linker flags (command provided by the IntelSYCL package)
add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES nbody.cpp)
169 changes: 169 additions & 0 deletions examples/sycl/nbody/nbody.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"

#include <chrono>
#include <iostream>
#include <llama/llama.hpp>
#include <random>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

using FP = float; ///< floating-point type used for all particle data

constexpr auto problemSize = 64 * 1024; ///< total number of particles
constexpr auto steps = 5; ///< number of simulation steps to calculate
constexpr auto runUpdate = true; ///< run the update step. Useful to disable for benchmarking the move step.
constexpr auto allowRsqrt = true; ///< use the fast reciprocal square root (sycl::rsqrt) instead of 1 / sqrt
constexpr auto aosoaLanes = 8; ///< inner array length of the AoSoA mapping

constexpr auto timestep = FP{0.0001}; ///< integration time step
constexpr auto eps2 = FP{0.01}; ///< softening term added to the squared distance in pPInteraction

constexpr auto rngSeed = 42; ///< fixed RNG seed so runs are reproducible

// clang-format off
namespace tag
{
// Empty tag types naming the fields of the records below
struct Pos{};
struct Vel{};
struct X{};
struct Y{};
struct Z{};
struct Mass{};
}

// A 3-component vector of FP values
using Vec3 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>>;
// A particle: position, velocity and mass
using Particle = llama::Record<
llama::Field<tag::Pos, Vec3>,
llama::Field<tag::Vel, Vec3>,
llama::Field<tag::Mass, FP>>;
// clang-format on

// using SharedMemoryParticle = Particle;
// using SharedMemoryParticle = ParticleJ;

template<typename ParticleRefI, typename ParticleRefJ>
void pPInteraction(ParticleRefI& pi, ParticleRefJ pj)
{
auto dist = pi(tag::Pos{}) - pj(tag::Pos{});
dist *= dist;
const FP distSqr = eps2 + dist(tag::X{}) + dist(tag::Y{}) + dist(tag::Z{});
const FP distSixth = distSqr * distSqr * distSqr;
const FP invDistCube = allowRsqrt ? sycl::rsqrt(distSixth) : (1.0f / sycl::sqrt(distSixth));
const FP sts = pj(tag::Mass{}) * invDistCube * +timestep;
pi(tag::Vel{}) += dist * sts;
}

// Runs the n-body benchmark once with the given memory mapping (0 = AoS, 1 = SoA multi-blob, 2 = AoSoA).
// Always returns 0; main accumulates the return values of all runs.
template<int Mapping>
int run(sycl::queue& queue)
{
// translate the mapping index into a printable name; any other index is a programming error
auto mappingName = [](int m) -> std::string
{
if(m == 0)
return "AoS";
if(m == 1)
return "SoA MB";
if(m == 2)
return "AoSoA" + std::to_string(aosoaLanes);
std::abort();
};
std::cout << "LLAMA " << mappingName(Mapping) << "\n";

Stopwatch watch;
// construct the selected LLAMA mapping for a 1D array of problemSize particles
auto mapping = [&]
{
using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 1>;
const auto extents = ArrayExtents{problemSize};
if constexpr(Mapping == 0)
return llama::mapping::AoS<ArrayExtents, Particle>{extents};
if constexpr(Mapping == 1)
return llama::mapping::SoA<ArrayExtents, Particle, llama::mapping::Blobs::OnePerField>{extents};
if constexpr(Mapping == 2)
return llama::mapping::AoSoA<ArrayExtents, Particle, aosoaLanes>{extents};
}();

// allocate the view in SYCL shared USM so both the host (init loop) and the device kernels can access it
auto particles = llama::allocViewUninitialized(std::move(mapping), llama::bloballoc::SyclMallocShared{queue});
watch.printAndReset("alloc");

// initialize the particles on the host with a fixed seed, so every run sees the same data
std::default_random_engine engine{rngSeed};
std::normal_distribution<FP> dist(FP{0}, FP{1});
for(std::size_t i = 0; i < problemSize; ++i)
{
auto p = particles(i);
p(tag::Pos{}, tag::X{}) = dist(engine);
p(tag::Pos{}, tag::Y{}) = dist(engine);
p(tag::Pos{}, tag::Z{}) = dist(engine);
p(tag::Vel{}, tag::X{}) = dist(engine) / FP(10);
p(tag::Vel{}, tag::Y{}) = dist(engine) / FP(10);
p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10);
p(tag::Mass{}) = dist(engine) / FP(100);
}
watch.printAndReset("init");
// prefetch each blob of shared USM to the device, so the first kernel does not pay the migration cost
for(int i = 0; i < particles.mapping().blobCount; i++)
queue.prefetch(&particles.blobs()[i][0], particles.mapping().blobSize(i));
watch.printAndReset("prefetch");

const auto range = sycl::range<1>{problemSize};

double sumUpdate = 0; // NOTE(review): both sums are accumulated but never printed — consider reporting totals
double sumMove = 0;
for(std::size_t s = 0; s < steps; ++s)
{
// update step: O(n^2) pairwise interactions, accumulating into each particle's velocity
queue.submit(
[&](sycl::handler& h)
{
h.parallel_for(
range,
// shallowCopy passes only non-owning blob references into the kernel (see SyclMallocShared docs)
[particles = llama::shallowCopy(particles)](sycl::item<1> it)
{
const auto i = it[0];
llama::One<Particle> pi = particles(i); // local copy of particle i, accumulated in registers
for(std::size_t j = 0; j < problemSize; j++)
pPInteraction(pi, particles(j));
particles(i)(tag::Vel{}) = pi(tag::Vel{}); // write back only the velocity; position is unchanged here
});
});
queue.wait();
sumUpdate += watch.printAndReset("update", '\t');
// move step: integrate the positions using the updated velocities
queue.submit(
[&](sycl::handler& h)
{
h.parallel_for(
range,
[particles = llama::shallowCopy(particles)](sycl::item<1> it)
{ particles(it[0])(tag::Pos{}) += particles(it[0])(tag::Vel{}) * timestep; });
});
queue.wait();
sumMove += watch.printAndReset("move");
}

return 0;
}

auto main(int argc, char** argv) -> int
try
{
std::cout << problemSize / 1000 << "k particles "
<< "(" << problemSize * sizeof(float) * 7 / 1024 << "kiB)\n";

auto queue = sycl::queue{sycl::cpu_selector_v};
std::cout << "running on device: " << queue.get_device().get_info<sycl::info::device::name>() << "\n";
std::cout << std::fixed;

int r = 0;
r += run<0>(queue);
r += run<1>(queue);
r += run<2>(queue);
return r;
}
catch(const std::exception& e)
{
std::cerr << "Exception: " << e.what() << "\n";
}
30 changes: 30 additions & 0 deletions include/llama/BlobAllocators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#if __has_include(<alpaka/alpaka.hpp>)
# include <alpaka/alpaka.hpp>
#endif
#if __has_include(<sycl/sycl.hpp>)
# include <sycl/sycl.hpp>
#endif

namespace llama::bloballoc
{
Expand Down Expand Up @@ -163,6 +166,7 @@ namespace llama::bloballoc
#endif

#if __has_include(<alpaka/alpaka.hpp>)
/// Allocates alpaka buffers as blobs.
LLAMA_EXPORT
template<typename Size, typename Dev>
struct AlpakaBuf
Expand All @@ -176,4 +180,30 @@ namespace llama::bloballoc
}
};
#endif

#if __has_include(<sycl/sycl.hpp>)
/// Allocates shared USM memory using sycl::aligned_alloc_shared. The memory is managed by a std::unique_ptr with a
/// deleter that calls sycl::free. If you want to use a view created with this allocator in a kernel, call \ref
/// shallowCopy on the view before passing it to the kernel.
LLAMA_EXPORT
struct SyclMallocShared
{
    sycl::queue queue;

    // create the deleter in a function independent of the FieldAlignment template parameter, so all blobs share
    // the same std::unique_ptr type regardless of alignment
    static auto makeDeleter(sycl::queue q)
    {
        return [q](void* p) { sycl::free(p, q); };
    }

    template<std::size_t FieldAlignment>
    inline auto operator()(std::integral_constant<std::size_t, FieldAlignment>, std::size_t count) const
    {
        std::byte* p = sycl::aligned_alloc_shared<std::byte>(FieldAlignment, count, queue);
        if(p == nullptr)
            throw std::runtime_error{"sycl::aligned_alloc_shared failed"};
        // verify p is aligned to FieldAlignment (assumed to be a power of two). The previous expression
        // `p & (FieldAlignment - 1 != 0u)` bound `!=` before `&` and therefore only tested the lowest bit.
        if((reinterpret_cast<std::uintptr_t>(p) & (FieldAlignment - 1)) != 0u)
            throw std::runtime_error{"sycl::malloc_shared does not align sufficiently"};
        return std::unique_ptr<std::byte[], decltype(makeDeleter(queue))>(p, makeDeleter(queue));
    }
};
#endif
} // namespace llama::bloballoc

0 comments on commit cd6d792

Please sign in to comment.