Refactoring #701

Merged · 2 commits · Feb 10, 2023
56 changes: 33 additions & 23 deletions examples/nbody/nbody.cpp
@@ -1298,21 +1298,38 @@ namespace manualAoSSIMD
         xsimd::make_batch_constant<xsimd::batch<int, typename Simd::arch_type>, GenStrides>());
 
     template<typename Simd>
-    void update(Particle* particles, int threads)
+    auto gather(const typename Simd::value_type* p) -> Simd
     {
+        const auto strides = particleStrides<Simd>;
+        return Simd::gather(p, strides);
+        // Simd simd;
+        // for(auto i = 0; i < Simd::size; i++)
+        //     reinterpret_cast<typename Simd::value_type*>(&simd)[i] = p[i * (sizeof(Particle) / sizeof(FP))];
+        // return simd;
+    }
+
+    template<typename Simd>
+    void scatter(Simd simd, typename Simd::value_type* p)
+    {
-        constexpr auto lanes = Simd::size;
         const auto strides = particleStrides<Simd>;
+        simd.scatter(p, strides);
+        // for(auto i = 0; i < Simd::size; i++)
+        //     p[i * (sizeof(Particle) / sizeof(FP))] = simd.get(i);
+    }
+
+    template<typename Simd>
+    void update(Particle* particles, int threads)
+    {
 #    pragma omp parallel for schedule(static) num_threads(threads)
-        for(std::ptrdiff_t i = 0; i < problemSize; i += lanes)
+        for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
         {
             auto& pi = particles[i];
-            const Simd piposx = Simd::gather(&pi.pos.x, strides);
-            const Simd piposy = Simd::gather(&pi.pos.y, strides);
-            const Simd piposz = Simd::gather(&pi.pos.z, strides);
-            Simd pivelx = Simd::gather(&pi.vel.x, strides);
-            Simd pively = Simd::gather(&pi.vel.y, strides);
-            Simd pivelz = Simd::gather(&pi.vel.z, strides);
+            const Simd piposx = gather<Simd>(&pi.pos.x);
+            const Simd piposy = gather<Simd>(&pi.pos.y);
+            const Simd piposz = gather<Simd>(&pi.pos.z);
+            Simd pivelx = gather<Simd>(&pi.vel.x);
+            Simd pively = gather<Simd>(&pi.vel.y);
+            Simd pivelz = gather<Simd>(&pi.vel.z);
 
             for(std::size_t j = 0; j < problemSize; j++)
             {
@@ -1325,29 +1342,22 @@ namespace manualAoSSIMD
                 pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, pjposx, pjposy, pjposz, pjmass);
             }
 
-            // scatter
-            pivelx.scatter(&pi.vel.x, strides);
-            pively.scatter(&pi.vel.y, strides);
-            pivelz.scatter(&pi.vel.z, strides);
+            scatter(pivelx, &pi.vel.x);
+            scatter(pively, &pi.vel.y);
+            scatter(pivelz, &pi.vel.z);
         }
     }
 
     template<typename Simd>
     void move(Particle* particles, int threads)
     {
-        constexpr auto lanes = Simd::size;
-        const auto strides = particleStrides<Simd>;
-
 #    pragma omp parallel for schedule(static) num_threads(threads)
-        for(std::ptrdiff_t i = 0; i < problemSize; i += lanes)
+        for(std::ptrdiff_t i = 0; i < problemSize; i += Simd::size)
         {
             auto& pi = particles[i];
-            (Simd::gather(&pi.pos.x, strides) + Simd::gather(&pi.vel.x, strides) * timestep)
-                .scatter(&pi.pos.x, strides);
-            (Simd::gather(&pi.pos.y, strides) + Simd::gather(&pi.vel.y, strides) * timestep)
-                .scatter(&pi.pos.y, strides);
-            (Simd::gather(&pi.pos.z, strides) + Simd::gather(&pi.vel.z, strides) * timestep)
-                .scatter(&pi.pos.z, strides);
+            scatter(gather<Simd>(&pi.pos.x) + gather<Simd>(&pi.vel.x) * timestep, &pi.pos.x);
+            scatter(gather<Simd>(&pi.pos.y) + gather<Simd>(&pi.vel.y) * timestep, &pi.pos.y);
+            scatter(gather<Simd>(&pi.pos.z) + gather<Simd>(&pi.vel.z) * timestep, &pi.pos.z);
         }
     }
 
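The refactoring factors the repeated strided xsimd gather/scatter calls into the gather and scatter helpers above and keeps a portable per-lane fallback as comments inside them. Below is a minimal, self-contained sketch of that fallback technique, assuming xsimd is available; the simplified FP, Vec3 and Particle definitions and the gatherFallback/scatterFallback names are stand-ins for this example, not code from the repository.

#include <xsimd/xsimd.hpp>
#include <array>
#include <cstddef>
#include <vector>

using FP = float;
struct Vec3 { FP x, y, z; };
struct Particle { Vec3 pos, vel; FP mass; };

// Lane i of the batch maps to particle i; the same member of consecutive
// particles lies sizeof(Particle) / sizeof(FP) elements apart in memory.
template<typename Simd>
auto gatherFallback(const typename Simd::value_type* p) -> Simd
{
    constexpr auto stride = sizeof(Particle) / sizeof(FP);
    std::array<typename Simd::value_type, Simd::size> lanes{};
    for(std::size_t i = 0; i < Simd::size; i++)
        lanes[i] = p[i * stride]; // read the member from the i-th next particle
    return Simd::load_unaligned(lanes.data());
}

template<typename Simd>
void scatterFallback(Simd simd, typename Simd::value_type* p)
{
    constexpr auto stride = sizeof(Particle) / sizeof(FP);
    for(std::size_t i = 0; i < Simd::size; i++)
        p[i * stride] = simd.get(i); // write lane i back to the i-th next particle
}

int main()
{
    using Simd = xsimd::batch<FP>;
    std::vector<Particle> particles(Simd::size);
    for(std::size_t i = 0; i < Simd::size; i++)
        particles[i].pos.x = static_cast<FP>(i);

    const Simd x = gatherFallback<Simd>(&particles[0].pos.x); // {0, 1, 2, ...}
    scatterFallback(x + Simd(FP{1}), &particles[0].pos.x);    // pos.x becomes {1, 2, 3, ...}
}

xsimd's batch::gather and batch::scatter, as used in the diff, express the same strided access in a single call and can map to hardware gather/scatter instructions where the target architecture provides them.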
4 changes: 3 additions & 1 deletion include/llama/mapping/BitPackedFloat.hpp
@@ -181,6 +181,8 @@ namespace llama::mapping
             = std::conditional_t<mp_contains<FlatRecordDim<RecordDim>, double>::value, std::uint64_t, std::uint32_t>;
     } // namespace internal
 
+    // TODO(bgruber): I would like to allow zero mantissa bits, which would then no longer support INF. Likewise,
+    // support to skip the sign bit would also be great.
     /// Struct of array mapping using bit packing to reduce size/precision of floating-point data types. The bit layout
     /// is [1 sign bit, exponentBits bits from the exponent, mantissaBits bits from the mantissa]+ and tries to follow
     /// IEEE 754. Infinity and NAN are supported. If the packed exponent bits are not big enough to hold a number, it
@@ -189,7 +191,7 @@
     /// \tparam ExponentBits If ExponentBits is llama::Constant<N>, the compile-time N specifies the number of bits to
     /// use to store the exponent. If ExponentBits is llama::Value<T>, the number of bits is specified at runtime,
     /// passed to the constructor and stored as type T. Must not be zero.
-    /// \tparam MantissaBits Like ExponentBits but for the mantissa bits. May be zero.
+    /// \tparam MantissaBits Like ExponentBits but for the mantissa bits. Must not be zero (otherwise values turn INF).
     /// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
     /// how big the linear domain gets.
     /// \tparam TStoredIntegral Integral type used as storage of reduced precision floating-point values.
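For illustration, here is a conceptual sketch of the bit layout described in the doc comment: pack a float into [1 sign bit | exponentBits | mantissaBits] by re-biasing the exponent and truncating the mantissa. This is not the LLAMA implementation; the packFloat name and the clamping of out-of-range exponents are assumptions for this example, and subnormals and NAN payloads are ignored for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Pack an IEEE 754 float (1 sign, 8 exponent, 23 mantissa bits) into a smaller
// [1 | exponentBits | mantissaBits] layout by re-biasing and truncating.
std::uint32_t packFloat(float f, unsigned exponentBits, unsigned mantissaBits)
{
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const std::uint32_t sign = bits >> 31;
    const std::int32_t exponent = static_cast<std::int32_t>((bits >> 23) & 0xFF) - 127; // remove IEEE bias
    const std::uint32_t mantissa = bits & 0x7FFFFF;

    const std::int32_t bias = (1 << (exponentBits - 1)) - 1;
    const std::int32_t maxExp = (1 << exponentBits) - 1;
    std::int32_t packedExp = exponent + bias;
    if(packedExp < 0)
        packedExp = 0; // exponent too small for the packed format (clamping chosen for this sketch)
    if(packedExp > maxExp)
        packedExp = maxExp; // too large: saturates into the all-ones INF/NAN exponent pattern
    const std::uint32_t packedMant = mantissa >> (23 - mantissaBits); // keep only the top mantissa bits

    return (sign << (exponentBits + mantissaBits)) | (static_cast<std::uint32_t>(packedExp) << mantissaBits)
        | packedMant;
}

int main()
{
    // roughly the IEEE binary16 bit pattern of 3.14f (truncated rather than rounded)
    std::printf("%#x\n", static_cast<unsigned>(packFloat(3.14f, 5, 10)));
}

Unpacking reverses these steps; the LLAMA mapping applies the same idea per record field on top of its struct-of-arrays layout.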