Skip to content

Commit

Permalink
Add oneAPI SYCL n-body
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Nov 21, 2023
1 parent c402750 commit cd6d792
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 0 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/cuda/pitch")
add_subdirectory("examples/cuda/viewcopy")
endif()

# SYCL examples: only built when the IntelSYCL CMake package (shipped with Intel oneAPI DPC++,
# provides add_sycl_to_target) is found; otherwise they are silently skipped.
find_package(IntelSYCL)
if (IntelSYCL_FOUND)
add_subdirectory("examples/sycl/nbody")
endif()
endif()

# install
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cmake_minimum_required (VERSION 3.20.5)
project(llama-sycl-nbody)

# Use the in-tree llama target when built as part of the main repository, otherwise locate an installed llama.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} nbody.cpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
# compile nbody.cpp with the SYCL compiler/linker flags (command provided by the IntelSYCL package)
add_sycl_to_target(TARGET ${PROJECT_NAME} SOURCES nbody.cpp)
169 changes: 169 additions & 0 deletions examples/sycl/nbody/nbody.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"

#include <chrono>
#include <iostream>
#include <llama/llama.hpp>
#include <random>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

using FP = float; ///< floating-point type used for all particle data

constexpr auto problemSize = 64 * 1024; ///< total number of particles
constexpr auto steps = 5; ///< number of simulation steps to calculate
constexpr auto runUpdate = true; ///< run the update step. Useful to disable for benchmarking the move step.
constexpr auto allowRsqrt = true; ///< use the fast reciprocal square root (sycl::rsqrt) instead of 1 / sqrt
constexpr auto aosoaLanes = 8; ///< inner array length of the AoSoA mapping

constexpr auto timestep = FP{0.0001}; ///< integration time step
constexpr auto eps2 = FP{0.01}; ///< softening term added to the squared distance in pPInteraction

constexpr auto rngSeed = 42; ///< fixed RNG seed so runs are reproducible

// clang-format off
namespace tag
{
// Empty tag types naming the fields of the records below
struct Pos{};
struct Vel{};
struct X{};
struct Y{};
struct Z{};
struct Mass{};
}

// A 3-component vector of FP values
using Vec3 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>>;
// A particle: position, velocity and mass
using Particle = llama::Record<
llama::Field<tag::Pos, Vec3>,
llama::Field<tag::Vel, Vec3>,
llama::Field<tag::Mass, FP>>;
// clang-format on

// using SharedMemoryParticle = Particle;
// using SharedMemoryParticle = ParticleJ;

template<typename ParticleRefI, typename ParticleRefJ>
void pPInteraction(ParticleRefI& pi, ParticleRefJ pj)
{
auto dist = pi(tag::Pos{}) - pj(tag::Pos{});
dist *= dist;
const FP distSqr = eps2 + dist(tag::X{}) + dist(tag::Y{}) + dist(tag::Z{});
const FP distSixth = distSqr * distSqr * distSqr;
const FP invDistCube = allowRsqrt ? sycl::rsqrt(distSixth) : (1.0f / sycl::sqrt(distSixth));
const FP sts = pj(tag::Mass{}) * invDistCube * +timestep;
pi(tag::Vel{}) += dist * sts;
}

// Runs the n-body benchmark once with the given memory mapping (0 = AoS, 1 = SoA multi-blob, 2 = AoSoA).
// Always returns 0; main accumulates the return values of all runs.
template<int Mapping>
int run(sycl::queue& queue)
{
// translate the mapping index into a printable name; any other index is a programming error
auto mappingName = [](int m) -> std::string
{
if(m == 0)
return "AoS";
if(m == 1)
return "SoA MB";
if(m == 2)
return "AoSoA" + std::to_string(aosoaLanes);
std::abort();
};
std::cout << "LLAMA " << mappingName(Mapping) << "\n";

Stopwatch watch;
// construct the selected LLAMA mapping for a 1D array of problemSize particles
auto mapping = [&]
{
using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 1>;
const auto extents = ArrayExtents{problemSize};
if constexpr(Mapping == 0)
return llama::mapping::AoS<ArrayExtents, Particle>{extents};
if constexpr(Mapping == 1)
return llama::mapping::SoA<ArrayExtents, Particle, llama::mapping::Blobs::OnePerField>{extents};
if constexpr(Mapping == 2)
return llama::mapping::AoSoA<ArrayExtents, Particle, aosoaLanes>{extents};
}();

// allocate the view in SYCL shared USM so both the host (init loop) and the device kernels can access it
auto particles = llama::allocViewUninitialized(std::move(mapping), llama::bloballoc::SyclMallocShared{queue});
watch.printAndReset("alloc");

// initialize the particles on the host with a fixed seed, so every run sees the same data
std::default_random_engine engine{rngSeed};
std::normal_distribution<FP> dist(FP{0}, FP{1});
for(std::size_t i = 0; i < problemSize; ++i)
{
auto p = particles(i);
p(tag::Pos{}, tag::X{}) = dist(engine);
p(tag::Pos{}, tag::Y{}) = dist(engine);
p(tag::Pos{}, tag::Z{}) = dist(engine);
p(tag::Vel{}, tag::X{}) = dist(engine) / FP(10);
p(tag::Vel{}, tag::Y{}) = dist(engine) / FP(10);
p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10);
p(tag::Mass{}) = dist(engine) / FP(100);
}
watch.printAndReset("init");
// prefetch each blob of shared USM to the device, so the first kernel does not pay the migration cost
for(int i = 0; i < particles.mapping().blobCount; i++)
queue.prefetch(&particles.blobs()[i][0], particles.mapping().blobSize(i));
watch.printAndReset("prefetch");

const auto range = sycl::range<1>{problemSize};

double sumUpdate = 0; // NOTE(review): both sums are accumulated but never printed — consider reporting totals
double sumMove = 0;
for(std::size_t s = 0; s < steps; ++s)
{
// update step: O(n^2) pairwise interactions, accumulating into each particle's velocity
queue.submit(
[&](sycl::handler& h)
{
h.parallel_for(
range,
// shallowCopy passes only non-owning blob references into the kernel (see SyclMallocShared docs)
[particles = llama::shallowCopy(particles)](sycl::item<1> it)
{
const auto i = it[0];
llama::One<Particle> pi = particles(i); // local copy of particle i, accumulated in registers
for(std::size_t j = 0; j < problemSize; j++)
pPInteraction(pi, particles(j));
particles(i)(tag::Vel{}) = pi(tag::Vel{}); // write back only the velocity; position is unchanged here
});
});
queue.wait();
sumUpdate += watch.printAndReset("update", '\t');
// move step: integrate the positions using the updated velocities
queue.submit(
[&](sycl::handler& h)
{
h.parallel_for(
range,
[particles = llama::shallowCopy(particles)](sycl::item<1> it)
{ particles(it[0])(tag::Pos{}) += particles(it[0])(tag::Vel{}) * timestep; });
});
queue.wait();
sumMove += watch.printAndReset("move");
}

return 0;
}

auto main(int argc, char** argv) -> int
try
{
std::cout << problemSize / 1000 << "k particles "
<< "(" << problemSize * sizeof(float) * 7 / 1024 << "kiB)\n";

auto queue = sycl::queue{sycl::cpu_selector_v};
std::cout << "running on device: " << queue.get_device().get_info<sycl::info::device::name>() << "\n";
std::cout << std::fixed;

int r = 0;
r += run<0>(queue);
r += run<1>(queue);
r += run<2>(queue);
return r;
}
catch(const std::exception& e)
{
std::cerr << "Exception: " << e.what() << "\n";
}
30 changes: 30 additions & 0 deletions include/llama/BlobAllocators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#if __has_include(<alpaka/alpaka.hpp>)
# include <alpaka/alpaka.hpp>
#endif
#if __has_include(<sycl/sycl.hpp>)
# include <sycl/sycl.hpp>
#endif

namespace llama::bloballoc
{
Expand Down Expand Up @@ -163,6 +166,7 @@ namespace llama::bloballoc
#endif

#if __has_include(<alpaka/alpaka.hpp>)
/// Allocates alpaka buffers as blobs.
LLAMA_EXPORT
template<typename Size, typename Dev>
struct AlpakaBuf
Expand All @@ -176,4 +180,30 @@ namespace llama::bloballoc
}
};
#endif

#if __has_include(<sycl/sycl.hpp>)
/// Allocates shared USM memory using sycl::aligned_alloc_shared. The memory is managed by a std::unique_ptr with a
/// deleter that calls sycl::free. If you want to use a view created with this allocator in a kernel, call \ref
/// shallowCopy on the view before passing it to the kernel.
LLAMA_EXPORT
struct SyclMallocShared
{
    sycl::queue queue;

    // create the deleter in a function independent of the FieldAlignment template parameter, so all blobs share
    // the same std::unique_ptr type regardless of alignment
    static auto makeDeleter(sycl::queue q)
    {
        return [q](void* p) { sycl::free(p, q); };
    }

    template<std::size_t FieldAlignment>
    inline auto operator()(std::integral_constant<std::size_t, FieldAlignment>, std::size_t count) const
    {
        std::byte* p = sycl::aligned_alloc_shared<std::byte>(FieldAlignment, count, queue);
        if(p == nullptr)
            throw std::runtime_error{"sycl::aligned_alloc_shared failed"};
        // verify p is aligned to FieldAlignment (assumed to be a power of two). The previous expression
        // `p & (FieldAlignment - 1 != 0u)` bound `!=` before `&` and therefore only tested the lowest bit.
        if((reinterpret_cast<std::uintptr_t>(p) & (FieldAlignment - 1)) != 0u)
            throw std::runtime_error{"sycl::malloc_shared does not align sufficiently"};
        return std::unique_ptr<std::byte[], decltype(makeDeleter(queue))>(p, makeDeleter(queue));
    }
};
#endif
} // namespace llama::bloballoc

0 comments on commit cd6d792

Please sign in to comment.