Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ports prefixScan, OneToManyAssoc and HistoContainer from CUDAUtilities. #43064

Merged
merged 3 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h
#define HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h

#include <cstdint>

#include <alpaka/alpaka.hpp>

namespace cms::alpakatools {

// Two 32-bit counters stored side by side in one 64-bit word, so that both
// can be read or advanced together through a single 64-bit operation.
class AtomicPairCounter {
public:
  // The pair of counters viewed as a single 64-bit value.
  using DoubleWord = uint64_t;

  // The pair of counters viewed as two separate 32-bit values.
  struct Counters {
    uint32_t first;   // in a "One to Many" association is the number of "One"
    uint32_t second;  // in a "One to Many" association is the total number of associations
  };

  // Both counters start at zero.
  ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter() : counter_{0} {}

  // Start from an explicit (first, second) pair.
  ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(uint32_t first, uint32_t second) : counter_{pack(first, second)} {}

  // Start from an already packed 64-bit value.
  ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(DoubleWord values) : counter_{values} {}

  // Overwrite both counters with an already packed 64-bit value (plain, non-atomic store).
  ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter& operator=(DoubleWord values) {
    counter_.as_doubleword = values;
    return *this;
  }

  // Return the current value of both counters (plain, non-atomic read).
  ALPAKA_FN_ACC constexpr Counters get() const { return counter_.as_counters; }

  // Atomically add c.first and c.second to the respective counters with a
  // single 64-bit atomicAdd, and return the value the pair had before.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Counters add(const TAcc& acc, Counters c) {
    const DoubleWord increment = pack(c.first, c.second);
    Packer previous{0};
    previous.as_doubleword =
        alpaka::atomicAdd(acc, &counter_.as_doubleword, increment, alpaka::hierarchy::Blocks{});
    return previous.as_counters;
  }

  // Atomically increment first by one and add i to second, and return the previous value.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Counters inc_add(const TAcc& acc, uint32_t i) {
    return add(acc, Counters{1u, i});
  }

private:
  // Overlays the packed 64-bit word and the two 32-bit counters on the same storage.
  union Packer {
    DoubleWord as_doubleword;
    Counters as_counters;
    constexpr Packer(DoubleWord word) : as_doubleword(word) {}
    constexpr Packer(Counters counters) : as_counters(counters) {}
  };

  // Pack two uint32_t values in a DoubleWord (aka uint64_t), using the same
  // memory layout as Counters: the pair is written through the as_counters
  // member of the union and read back through as_doubleword.
  static constexpr DoubleWord pack(uint32_t first, uint32_t second) {
    return Packer{Counters{first, second}}.as_doubleword;
  }

  Packer counter_;
};

} // namespace cms::alpakatools

#endif // HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h
47 changes: 47 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@

#ifndef HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
#define HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h

#include <cstdint>

namespace cms::alpakatools {

// Storage for exactly S elements of type I, held in an array embedded in the object.
template <typename I, int S>
class FlexiStorage {
public:
  // Number of elements the storage holds: the compile-time value S.
  constexpr int capacity() const { return S; }

  // Unchecked access to the element at position index.
  constexpr I& operator[](int index) { return data()[index]; }
  constexpr const I& operator[](int index) const { return data()[index]; }

  // Pointer to the first element of the embedded array.
  constexpr I* data() { return m_v; }
  constexpr I const* data() const { return m_v; }

private:
  I m_v[S];
};

// Specialisation selected with S == -1: the storage does not own any memory,
// it only refers to an external buffer whose address and size are set at
// runtime through init(). Until init() is called both members are unset.
template <typename I>
class FlexiStorage<I, -1> {
public:
  // Attach the storage to the external buffer v, recording s as its capacity.
  constexpr void init(I* v, int s) {
    m_capacity = s;
    m_v = v;
  }

  // Capacity recorded by the last call to init().
  constexpr int capacity() const { return m_capacity; }

  // Unchecked access to the element at position index of the external buffer.
  constexpr I& operator[](int index) { return data()[index]; }
  constexpr const I& operator[](int index) const { return data()[index]; }

  // Pointer recorded by the last call to init().
  constexpr I* data() { return m_v; }
  constexpr I const* data() const { return m_v; }

private:
  I* m_v;
  int m_capacity;
};

} // namespace cms::alpakatools

#endif // HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
201 changes: 201 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
#define HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
#include "HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

namespace cms::alpakatools {

// Kernel functor: for every element of v, find the histogram it belongs to
// and call h->count() for it.
//
// offsets is read as an array of nh + 1 entries: offsets[nh] is the total
// number of elements, and element i is assigned to the histogram ih for which
// offsets[ih] <= i < offsets[ih + 1].
struct countFromVector {
  template <typename TAcc, typename Histo, typename T>
  ALPAKA_FN_ACC void operator()(const TAcc &acc,
                                Histo *__restrict__ h,
                                uint32_t nh,
                                T const *__restrict__ v,
                                uint32_t const *__restrict__ offsets) const {
    const uint32_t total = offsets[nh];
    for (uint32_t idx : elements_with_stride(acc, total)) {
      // first offset strictly greater than idx: it closes the range that contains idx
      const auto upper = alpaka_std::upper_bound(offsets, offsets + nh + 1, idx);
      ALPAKA_ASSERT_OFFLOAD((*upper) > 0);
      const int32_t histo = upper - offsets - 1;
      ALPAKA_ASSERT_OFFLOAD(histo >= 0);
      ALPAKA_ASSERT_OFFLOAD(histo < int(nh));
      h->count(acc, v[idx], histo);
    }
  }
};

// Kernel functor: for every element of v, find the histogram it belongs to
// and call h->fill() with the element value and its index in v.
//
// offsets is read as an array of nh + 1 entries: offsets[nh] is the total
// number of elements, and element i is assigned to the histogram ih for which
// offsets[ih] <= i < offsets[ih + 1].
struct fillFromVector {
  template <typename TAcc, typename Histo, typename T>
  ALPAKA_FN_ACC void operator()(const TAcc &acc,
                                Histo *__restrict__ h,
                                uint32_t nh,
                                T const *__restrict__ v,
                                uint32_t const *__restrict__ offsets) const {
    const uint32_t total = offsets[nh];
    for (uint32_t idx : elements_with_stride(acc, total)) {
      // first offset strictly greater than idx: it closes the range that contains idx
      const auto upper = alpaka_std::upper_bound(offsets, offsets + nh + 1, idx);
      ALPAKA_ASSERT_OFFLOAD((*upper) > 0);
      const int32_t histo = upper - offsets - 1;
      ALPAKA_ASSERT_OFFLOAD(histo >= 0);
      ALPAKA_ASSERT_OFFLOAD(histo < int(nh));
      h->fill(acc, v[idx], idx, histo);
    }
  }
};

// Fill the nh histograms held in *h from the values in v, submitting all the
// work to queue in this order:
//   1. Histo::launchZero on h
//   2. the countFromVector kernel
//   3. Histo::launchFinalize on h
//   4. the fillFromVector kernel
//
// offsets is forwarded to the kernels, which read it as nh + 1 entries.
// totSize and nthreads only determine the work division: both kernels use
// divide_up_by(totSize, nthreads) blocks, with nthreads as the second
// argument of make_workdiv (threads per block or elements per thread).
template <typename TAcc, typename Histo, typename T, typename TQueue>
ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h,
                                         uint32_t nh,
                                         T const *__restrict__ v,
                                         uint32_t const *__restrict__ offsets,
                                         uint32_t totSize,
                                         uint32_t nthreads,
                                         TQueue &queue) {
  Histo::template launchZero<TAcc>(h, queue);

  // the same work division is shared by the counting and the filling kernels
  const auto workDiv = make_workdiv<TAcc>(divide_up_by(totSize, nthreads), nthreads);

  alpaka::exec<TAcc>(queue, workDiv, countFromVector(), h, nh, v, offsets);

  Histo::template launchFinalize<TAcc>(h, queue);

  alpaka::exec<TAcc>(queue, workDiv, fillFromVector(), h, nh, v, offsets);
}

// Overload taking also a Histo::View: identical to the pointer-only version,
// except that Histo::launchZero is called on the view hv instead of on h.
// All the work is submitted to queue in this order:
//   1. Histo::launchZero on hv
//   2. the countFromVector kernel on h
//   3. Histo::launchFinalize on h
//   4. the fillFromVector kernel on h
//
// offsets is forwarded to the kernels, which read it as nh + 1 entries.
// totSize and nthreads only determine the work division: both kernels use
// divide_up_by(totSize, nthreads) blocks, with nthreads as the second
// argument of make_workdiv (threads per block or elements per thread).
template <typename TAcc, typename Histo, typename T, typename TQueue>
ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h,
                                         typename Histo::View hv,
                                         uint32_t nh,
                                         T const *__restrict__ v,
                                         uint32_t const *__restrict__ offsets,
                                         uint32_t totSize,
                                         uint32_t nthreads,
                                         TQueue &queue) {
  Histo::template launchZero<TAcc>(hv, queue);

  // the same work division is shared by the counting and the filling kernels
  const auto workDiv = make_workdiv<TAcc>(divide_up_by(totSize, nthreads), nthreads);

  alpaka::exec<TAcc>(queue, workDiv, countFromVector(), h, nh, v, offsets);

  Histo::template launchFinalize<TAcc>(h, queue);

  alpaka::exec<TAcc>(queue, workDiv, fillFromVector(), h, nh, v, offsets);
}

// Iterate over the n bins to the left and to the right of the one containing
// "value", clamping the range to [0, nbins() - 1], and call func on every
// entry stored in those bins.
template <typename Hist, typename V, typename Func>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) {
  const int centre = Hist::bin(value);
  const int last = std::min(int(Hist::nbins() - 1), centre + n);
  const int first = std::max(0, centre - n);
  ALPAKA_ASSERT_OFFLOAD(last >= first);
  auto entry = hist.begin(first);
  while (entry < hist.end(last)) {
    func(*entry);
    ++entry;
  }
}

// Iterate over the bins from the one containing wmin to the one containing
// wmax (both included), and call func on every entry stored in those bins.
template <typename Hist, typename V, typename Func>
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
  auto first = Hist::bin(wmin);
  auto last = Hist::bin(wmax);
  ALPAKA_ASSERT_OFFLOAD(last >= first);
  auto entry = hist.begin(first);
  while (entry < hist.end(last)) {
    func(*entry);
    ++entry;
  }
}

// A set of NHISTS histograms with NBINS bins each, stored as a single
// OneToManyAssocRandomAccess with NHISTS * NBINS + 1 offsets.
// Values of type T are assigned to a bin by bin(); what is stored in the
// container is an object of type I associated to each value.
template <typename T,                  // the type of the discretized input values
          uint32_t NBINS,              // number of bins
          int32_t SIZE,                // max number of element. If -1 is initialized at runtime using external storage
          uint32_t S = sizeof(T) * 8,  // number of significant bits in T
          typename I = uint32_t,  // type stored in the container (usually an index in a vector of the input values)
          uint32_t NHISTS = 1     // number of histos stored
          >
class HistoContainer : public OneToManyAssocRandomAccess<I, NHISTS * NBINS + 1, SIZE> {
public:
  using Base = OneToManyAssocRandomAccess<I, NHISTS * NBINS + 1, SIZE>;
  using View = typename Base::View;
  using Counter = typename Base::Counter;
  using index_type = typename Base::index_type;
  using UT = std::make_unsigned_t<T>;

  // Integer base-2 logarithm of v, rounded down; returns 0 for v == 0.
  static constexpr uint32_t ilog2(uint32_t v) {
    constexpr uint32_t masks[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
    constexpr uint32_t shifts[] = {1, 2, 4, 8, 16};

    uint32_t log = 0;
    // binary search for the highest set bit, from the widest mask to the narrowest
    for (int step = 4; step >= 0; --step) {
      if ((v & masks[step]) != 0) {
        v >>= shifts[step];
        log |= shifts[step];
      }
    }
    return log;
  }

  // number of significant bits in T
  static constexpr uint32_t sizeT() { return S; }
  // number of histograms
  static constexpr int32_t nhists() { return NHISTS; }
  // number of bins of each histogram
  static constexpr uint32_t nbins() { return NBINS; }
  // total number of offsets: one per bin of every histogram, plus one
  static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
  // number of bits used for the bin index: ilog2(NBINS - 1) + 1
  static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }

  // index of the first bin of histogram nh
  static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

  // Bin of the value t: the nbits() most significant of its sizeT() significant bits.
  static constexpr UT bin(T t) {
    constexpr uint32_t kShift = sizeT() - nbits();
    constexpr uint32_t kMask = (1 << nbits()) - 1;
    return (t >> kShift) & kMask;
  }

  // Atomically increment the counter of the bin of t, in the first histogram.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) {
    const uint32_t slot = bin(t);
    ALPAKA_ASSERT_OFFLOAD(slot < nbins());
    Base::atomicIncrement(acc, this->off[slot]);
  }

  // Store j in the bin of t, in the first histogram: the offset of the bin is
  // atomically decremented, and j is written in the position just below the
  // value returned by the decrement.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) {
    const uint32_t slot = bin(t);
    ALPAKA_ASSERT_OFFLOAD(slot < nbins());
    auto position = Base::atomicDecrement(acc, this->off[slot]);
    ALPAKA_ASSERT_OFFLOAD(position > 0);
    this->content[position - 1] = j;
  }

  // Atomically increment the counter of the bin of t, in histogram nh.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) {
    const uint32_t local = bin(t);
    ALPAKA_ASSERT_OFFLOAD(local < nbins());
    const uint32_t slot = local + histOff(nh);
    ALPAKA_ASSERT_OFFLOAD(slot < totbins());
    Base::atomicIncrement(acc, this->off[slot]);
  }

  // Store j in the bin of t, in histogram nh: the offset of the bin is
  // atomically decremented, and j is written in the position just below the
  // value returned by the decrement.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) {
    const uint32_t local = bin(t);
    ALPAKA_ASSERT_OFFLOAD(local < nbins());
    const uint32_t slot = local + histOff(nh);
    ALPAKA_ASSERT_OFFLOAD(slot < totbins());
    auto position = Base::atomicDecrement(acc, this->off[slot]);
    ALPAKA_ASSERT_OFFLOAD(position > 0);
    this->content[position - 1] = j;
  }
};
} // namespace cms::alpakatools
#endif // HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
Loading