From 4af44b358cd4d7b01b4cef37b034c9b28535a294 Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Thu, 19 Oct 2023 11:37:05 +0200 Subject: [PATCH 1/3] Ports prefixScan, OneToManyAssoc and HistoContainer from CUDAUtilities. OneToManyAssoc is also separated into various function depending on use case. Co-authored-by: Andrea Bocci --- .../interface/AtomicPairCounter.h | 68 +++ .../AlpakaInterface/interface/FlexiStorage.h | 47 ++ .../interface/HistoContainer.h | 201 +++++++++ .../interface/OneToManyAssoc.h | 274 +++++++++++ .../AlpakaInterface/interface/SimpleVector.h | 142 ++++++ .../AlpakaInterface/interface/VecArray.h | 114 +++++ .../interface/alpakastdAlgorithm.h | 36 ++ .../AlpakaInterface/interface/prefixScan.h | 226 +++++++++ .../AlpakaInterface/interface/radixSort.h | 427 ++++++++++++++++++ .../AlpakaInterface/test/BuildFile.xml | 57 +++ .../test/alpaka/testAtomicPairCounter.dev.cc | 106 +++++ .../test/alpaka/testHistoContainer.dev.cc | 255 +++++++++++ .../test/alpaka/testOneHistoContainer.dev.cc | 187 ++++++++ .../test/alpaka/testOneRadixSort.dev.cc | 237 ++++++++++ .../test/alpaka/testOneToManyAssoc.dev.cc | 327 ++++++++++++++ .../test/alpaka/testPrefixScan.dev.cc | 219 +++++++++ .../test/alpaka/testRadixSort.dev.cc | 285 ++++++++++++ .../test/alpaka/testSimpleVector.dev.cc | 101 +++++ .../test/alpaka/testWorkDivision.dev.cc | 147 ++++++ .../CUDAUtilities/interface/radixSort.h | 4 +- .../CUDAUtilities/test/oneRadixSort_t.cu | 2 +- 21 files changed, 3459 insertions(+), 3 deletions(-) create mode 100644 HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/VecArray.h create mode 
100644 HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/prefixScan.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/radixSort.h create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..5c6d1d5719623 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h @@ -0,0 +1,68 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h +#define HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h + +#include + +#include + +namespace cms::alpakatools { + + class AtomicPairCounter { + public: + using DoubleWord = uint64_t; + + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter() : counter_{0} {} + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(uint32_t first, uint32_t second) : counter_{pack(first, second)} {} + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(DoubleWord values) : counter_{values} {} + + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter& operator=(DoubleWord values) { + counter_.as_doubleword = 
values; + return *this; + } + + struct Counters { + uint32_t first; // in a "One to Many" association is the number of "One" + uint32_t second; // in a "One to Many" association is the total number of associations + }; + + ALPAKA_FN_ACC constexpr Counters get() const { return counter_.as_counters; } + + // atomically add as_counters, and return the previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Counters add(const TAcc& acc, Counters c) { + Packer value{pack(c.first, c.second)}; + Packer ret{0}; + ret.as_doubleword = + alpaka::atomicAdd(acc, &counter_.as_doubleword, value.as_doubleword, alpaka::hierarchy::Blocks{}); + return ret.as_counters; + } + + // atomically increment first and add i to second, and return the previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters constexpr inc_add(const TAcc& acc, uint32_t i) { + return add(acc, {1u, i}); + } + + private: + union Packer { + DoubleWord as_doubleword; + Counters as_counters; + constexpr Packer(DoubleWord _as_doubleword) : as_doubleword(_as_doubleword) { ; }; + constexpr Packer(Counters _as_counters) : as_counters(_as_counters) { ; }; + }; + + // pack two uint32_t values in a DoubleWord (aka uint64_t) + // this is needed because in c++17 a union can only be aggregate-initialised to its first type + // it can be probably removed with c++20, and replace with a designated initialiser + static constexpr DoubleWord pack(uint32_t first, uint32_t second) { + Packer ret{0}; + ret.as_counters = {first, second}; + return ret.as_doubleword; + } + + Packer counter_; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h b/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h new file mode 100644 index 0000000000000..2bb3aa5f1e15c --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h @@ -0,0 +1,47 @@ + +#ifndef 
#ifndef HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
#define HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h

#include <cstdint>

namespace cms::alpakatools {

  // Fixed-capacity storage: a buffer of S elements of type I embedded in the
  // object itself.  Capacity is known at compile time.
  template <typename I, int S>
  class FlexiStorage {
  public:
    constexpr int capacity() const { return S; }

    constexpr I& operator[](int i) { return m_v[i]; }
    constexpr const I& operator[](int i) const { return m_v[i]; }

    constexpr I* data() { return m_v; }
    constexpr I const* data() const { return m_v; }

  private:
    I m_v[S];
  };

  // Runtime-capacity specialisation: the storage is external and attached via
  // init(); the caller keeps ownership of the buffer.
  template <typename I>
  class FlexiStorage<I, -1> {
  public:
    constexpr void init(I* v, int s) {
      m_v = v;
      m_capacity = s;
    }

    constexpr int capacity() const { return m_capacity; }

    constexpr I& operator[](int i) { return m_v[i]; }
    constexpr const I& operator[](int i) const { return m_v[i]; }

    constexpr I* data() { return m_v; }
    constexpr I const* data() const { return m_v; }

  private:
    I* m_v;
    int m_capacity;
  };

}  // namespace cms::alpakatools

#endif  // HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
countFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for (uint32_t i : elements_with_stride(acc, nt)) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->count(acc, v[i], ih); + } + } + }; + + struct fillFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for (uint32_t i : elements_with_stride(acc, nt)) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->fill(acc, v[i], i, ih); + } + } + }; + + template + ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + uint32_t nthreads, + TQueue &queue) { + Histo::template launchZero(h, queue); + + const auto threadsPerBlockOrElementsPerThread = nthreads; + const auto blocksPerGrid = divide_up_by(totSize, nthreads); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::exec(queue, workDiv, countFromVector(), h, nh, v, offsets); + Histo::template launchFinalize(h, queue); + + alpaka::exec(queue, workDiv, fillFromVector(), h, nh, v, offsets); + } + + template + ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h, + typename Histo::View hv, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + uint32_t nthreads, + TQueue &queue) { + 
Histo::template launchZero(hv, queue); + + const auto threadsPerBlockOrElementsPerThread = nthreads; + const auto blocksPerGrid = divide_up_by(totSize, nthreads); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::exec(queue, workDiv, countFromVector(), h, nh, v, offsets); + Histo::template launchFinalize(h, queue); + + alpaka::exec(queue, workDiv, fillFromVector(), h, nh, v, offsets); + } + + // iteratate over N bins left and right of the one containing "v" + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer : public OneToManyAssocRandomAccess { + public: + using Base = OneToManyAssocRandomAccess; + using View = typename Base::View; + using Counter = typename Base::Counter; + using index_type = typename Base::index_type; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr int32_t nhists() { return NHISTS; } + static constexpr uint32_t nbins() { return NBINS; } + static 
constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + Base::atomicIncrement(acc, this->off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = Base::atomicDecrement(acc, this->off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + this->content[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + Base::atomicIncrement(acc, this->off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + auto w = Base::atomicDecrement(acc, this->off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + this->content[w - 1] = j; + } + }; +} // namespace cms::alpakatools +#endif // HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h new file mode 100644 index 0000000000000..e7834d39a8910 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h @@ -0,0 +1,274 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h +#define HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h + +#include +#include 
+#include +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +namespace cms::alpakatools { + + template + class OneToManyAssocBase { + public: + using Counter = uint32_t; + + using CountersOnly = OneToManyAssocBase; + + using index_type = I; + + struct View { + OneToManyAssocBase *assoc = nullptr; + Counter *offStorage = nullptr; + index_type *contentStorage = nullptr; + int32_t offSize = -1; + int32_t contentSize = -1; + }; + + static constexpr int32_t ctNOnes() { return ONES; } + constexpr auto totOnes() const { return off.capacity(); } + constexpr auto nOnes() const { return totOnes() - 1; } + static constexpr int32_t ctCapacity() { return SIZE; } + constexpr auto capacity() const { return content.capacity(); } + + ALPAKA_FN_HOST_ACC void initStorage(View view) { + ALPAKA_ASSERT_OFFLOAD(view.assoc == this); + if constexpr (ctCapacity() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.contentStorage); + ALPAKA_ASSERT_OFFLOAD(view.contentSize > 0); + content.init(view.contentStorage, view.contentSize); + } + if constexpr (ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + off.init(view.offStorage, view.offSize); + } + } + + ALPAKA_FN_HOST_ACC void zero() { + for (int32_t i = 0; i < totOnes(); ++i) { + off[i] = 0; + } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const TAcc &acc, CountersOnly const &co) { + for (uint32_t i = 0; static_cast(i) < totOnes(); ++i) { + alpaka::atomicAdd(acc, off.data() + i, co.off[i], alpaka::hierarchy::Blocks{}); + } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicIncrement(const TAcc &acc, Counter &x) { + return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + 
ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicDecrement(const TAcc &acc, Counter &x) { + return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, I b) { + ALPAKA_ASSERT_OFFLOAD(b < static_cast(nOnes())); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, I b, index_type j) { + ALPAKA_ASSERT_OFFLOAD(b < static_cast(nOnes())); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + content[w - 1] = j; + } + + // this MUST BE DONE in a single block (or in two kernels!) + struct zeroAndInit { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, View view) const { + auto h = view.assoc; + ALPAKA_ASSERT_OFFLOAD((1 == alpaka::getWorkDiv(acc)[0])); + ALPAKA_ASSERT_OFFLOAD((0 == alpaka::getIdx(acc)[0])); + + auto first = alpaka::getIdx(acc)[0]; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + alpaka::syncBlockThreads(acc); + // TODO use for_each_element_in_grid_strided (or similar) + for (int i = first, nt = h->totOnes(); i < nt; + i += alpaka::getWorkDiv(acc)[0]) { + h->off[i] = 0; + } + } + }; + + template + ALPAKA_FN_INLINE static void launchZero(OneToManyAssocBase *h, TQueue &queue) { + View view = {h, nullptr, nullptr, -1, -1}; + launchZero(view, queue); + } + + template + ALPAKA_FN_INLINE static void launchZero(View view, TQueue &queue) { + if constexpr (ctCapacity() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.contentStorage); + ALPAKA_ASSERT_OFFLOAD(view.contentSize > 0); + } + if constexpr (ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + } + if constexpr (!requires_single_thread_per_block_v) { + auto nthreads = 1024; + auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels); + auto workDiv = cms::alpakatools::make_workdiv(nblocks, nthreads); + alpaka::exec(queue, workDiv, 
zeroAndInit{}, view); + } else { + auto h = view.assoc; + ALPAKA_ASSERT_OFFLOAD(h); + h->initStorage(view); + h->zero(); + h->psws = 0; + } + } + + constexpr auto size() const { return uint32_t(off[totOnes() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return content.data(); } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; } + constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; } + + FlexiStorage off; + FlexiStorage content; + int32_t psws; // prefix-scan working space + }; + + template + class OneToManyAssocSequential : public OneToManyAssocBase { + public: + using index_type = typename OneToManyAssocBase::index_type; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int32_t + bulkFill(const TAcc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.inc_add(acc, n); + if (int(c.first) >= this->nOnes()) + return -int32_t(c.first); + this->off[c.first] = c.second; + for (uint32_t j = 0; j < n; ++j) + this->content[c.second + j] = v[j]; + return c.first; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalize(AtomicPairCounter const &apc) { + this->off[apc.get().first] = apc.get().second; + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(TAcc &acc, AtomicPairCounter const &apc) { + int f = apc.get().first; + auto s = apc.get().second; + if (f >= this->nOnes()) { // overflow! 
+ this->off[this->nOnes()] = uint32_t(this->off[this->nOnes() - 1]); + return; + } + auto first = f + alpaka::getIdx(acc)[0]; + for (int i = first; i < this->totOnes(); i += alpaka::getWorkDiv(acc)[0]) { + this->off[i] = s; + } + } + + struct finalizeBulk { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + AtomicPairCounter const *apc, + OneToManyAssocSequential *__restrict__ assoc) const { + assoc->bulkFinalizeFill(acc, *apc); + } + }; + }; + + template + class OneToManyAssocRandomAccess : public OneToManyAssocBase { + public: + using Counter = typename OneToManyAssocBase::Counter; + using View = typename OneToManyAssocBase::View; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize(TAcc &acc, Counter *ws = nullptr) { + ALPAKA_ASSERT_OFFLOAD(this->off[this->totOnes() - 1] == 0); + blockPrefixScan(acc, this->off.data(), this->totOnes(), ws); + ALPAKA_ASSERT_OFFLOAD(this->off[this->totOnes() - 1] == this->off[this->totOnes() - 2]); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize() { + // Single thread finalize. + for (uint32_t i = 1; static_cast(i) < this->totOnes(); ++i) + this->off[i] += this->off[i - 1]; + } + + template + ALPAKA_FN_INLINE static void launchFinalize(OneToManyAssocRandomAccess *h, TQueue &queue) { + View view = {h, nullptr, nullptr, -1, -1}; + launchFinalize(view, queue); + } + + template + ALPAKA_FN_INLINE static void launchFinalize(View view, TQueue &queue) { + // View stores a base pointer, we need to upcast back... 
+ auto h = static_cast(view.assoc); + ALPAKA_ASSERT_OFFLOAD(h); + if constexpr (!requires_single_thread_per_block_v) { + Counter *poff = (Counter *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, off)); + auto nOnes = OneToManyAssocRandomAccess::ctNOnes(); + if constexpr (OneToManyAssocRandomAccess::ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + nOnes = view.offSize; + poff = view.offStorage; + } + ALPAKA_ASSERT_OFFLOAD(nOnes > 0); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, psws)); + auto nthreads = 1024; + auto nblocks = (nOnes + nthreads - 1) / nthreads; + auto workDiv = cms::alpakatools::make_workdiv(nblocks, nthreads); + alpaka::exec(queue, + workDiv, + multiBlockPrefixScan(), + poff, + poff, + nOnes, + nblocks, + ppsws, + alpaka::getWarpSizes(alpaka::getDev(queue))[0]); + } else { + h->finalize(); + } + } + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h new file mode 100644 index 0000000000000..f4937da8626f8 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h @@ -0,0 +1,142 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h +#define HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 +// alpaka integration in cmssw: Adriano Di Florio, 2022 + +#include +#include +#include + +namespace cms::alpakatools { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + 
m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; } + + ALPAKA_FN_ACC inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + // thread safe version of resize + template + ALPAKA_FN_ACC int extend(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int shrink(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize >= size) { + return previousSize - size; + } else { + 
alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_SimpleVector_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h new file mode 100644 index 0000000000000..c4025dd42d4a6 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h @@ -0,0 +1,114 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_VecArray_h +#define HeterogeneousCore_AlpakaInterface_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +namespace cms::alpakatools { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + 
m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T cont& back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + inline constexpr T &back() { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return 
m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_VecArray_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h new file mode 100644 index 0000000000000..5c768ab4a77ba --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h @@ -0,0 +1,36 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h +#define HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h + +#include +#include +#include + +#include + +// reimplementation of std algorithms able to compile with Alpaka, +// mostly by declaring them constexpr (until C++20, which will make it +// constexpr by default. 
TODO: drop when moving to C++20) + +namespace alpaka_std { + + template > + ALPAKA_FN_HOST_ACC constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (!comp(value, *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + +} // namespace alpaka_std + +#endif // HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h new file mode 100644 index 0000000000000..79c0e1f6de995 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h @@ -0,0 +1,226 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_prefixScan_h +#define HeterogeneousCore_AlpakaInterface_interface_prefixScan_h + +#include + +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +namespace cms::alpakatools { + template >> + constexpr bool isPowerOf2(T v) { + // returns true iif v has only one bit set. + while (v) { + if (v & 1) + return !(v >> 1); + else + v >>= 1; + } + return false; + } + + template >> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(const TAcc& acc, int32_t laneId, T const* ci, T* co, uint32_t i) { + // ci and co may be the same + auto x = ci[i]; + CMS_UNROLL_LOOP + for (int32_t offset = 1; offset < alpaka::warp::getSize(acc); offset <<= 1) { + // Force the exact type for integer types otherwise the compiler will find the template resolution ambiguous. 
+ using dataType = std::conditional_t, T, std::int32_t>; + T y = alpaka::warp::shfl(acc, static_cast(x), laneId - offset); + if (laneId >= offset) + x += y; + } + co[i] = x; + } + + template >> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(const TAcc& acc, int32_t laneId, T* c, uint32_t i) { + warpPrefixScan(acc, laneId, c, c, i); + } + + // limited to warpSize² elements + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan( + const TAcc& acc, T const* ci, T* co, int32_t size, T* ws = nullptr) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + int32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + int32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; + ALPAKA_ASSERT_OFFLOAD(isPowerOf2(warpSize)); + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(acc, laneId, ci, co, i); + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = co[i]; + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(acc, laneId, ws, blockThreadIdx); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + int32_t warpId = i / warpSize; + co[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); + } else { + co[0] = ci[0]; + for (int32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; + } + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc, + T* __restrict__ c, + int32_t size, + T* __restrict__ ws = nullptr) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + int32_t const 
blockDimension(alpaka::getWorkDiv(acc)[0u]); + int32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(acc, laneId, c, i); + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = c[i]; + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(acc, laneId, ws, blockThreadIdx); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + auto warpId = i / warpSize; + c[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); + } else { + for (int32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; + } + } + + // in principle not limited.... + template + struct multiBlockPrefixScan { + template + ALPAKA_FN_ACC void operator()( + const TAcc& acc, T const* ci, T* co, uint32_t size, int32_t numBlocks, int32_t* pc, std::size_t warpSize) const { + // Get shared variable. The workspace is needed only for multi-threaded accelerators. 
+ T* ws = nullptr; + if constexpr (!requires_single_thread_per_block_v) { + ws = alpaka::getDynSharedMem(acc); + } + ALPAKA_ASSERT_OFFLOAD(warpSize == static_cast(alpaka::warp::getSize(acc))); + [[maybe_unused]] const auto elementsPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto elementsPerBlock = alpaka::getWorkDiv(acc)[0u]; + const auto threadsPerBlock = alpaka::getWorkDiv(acc)[0u]; + const auto blocksPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto blockIdx = alpaka::getIdx(acc)[0u]; + const auto threadIdx = alpaka::getIdx(acc)[0u]; + ALPAKA_ASSERT_OFFLOAD(elementsPerGrid >= size); + // first each block does a scan + [[maybe_unused]] int off = elementsPerBlock * blockIdx; + if (size - off > 0) { + blockPrefixScan(acc, ci + off, co + off, std::min(elementsPerBlock, size - off), ws); + } + + // count blocks that finished + auto& isLastBlockDone = alpaka::declareSharedVar(acc); + //__shared__ bool isLastBlockDone; + if (0 == threadIdx) { + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); + auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter + isLastBlockDone = (value == (int(blocksPerGrid) - 1)); + } + + alpaka::syncBlockThreads(acc); + + if (!isLastBlockDone) + return; + + ALPAKA_ASSERT_OFFLOAD(int(blocksPerGrid) == *pc); + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block except the last, which receives 0. + T* psum = nullptr; + if constexpr (!requires_single_thread_per_block_v) { + psum = ws + warpSize; + } else { + psum = alpaka::getDynSharedMem(acc); + } + for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) { + auto j = elementsPerBlock * i + elementsPerBlock - 1; + psum[i] = (j < size) ? co[j] : T(0); + } + alpaka::syncBlockThreads(acc); + blockPrefixScan(acc, psum, psum, blocksPerGrid, ws); + + // now it would have been handy to have the other blocks around... 
+ // Simplify the computation by having one version where threads per block = block size + // and a second for the one thread per block accelerator. + if constexpr (!requires_single_thread_per_block_v) { + // Here threadsPerBlock == elementsPerBlock + for (uint32_t i = threadIdx + threadsPerBlock, k = 0; i < size; i += threadsPerBlock, ++k) { + co[i] += psum[k]; + } + } else { + // We are single threaded here, adding partial sums starting with the 2nd block. + for (uint32_t i = elementsPerBlock; i < size; i++) { + co[i] += psum[i / elementsPerBlock - 1]; + } + } + } + }; +} // namespace cms::alpakatools + +// declare the amount of block shared memory used by the multiBlockPrefixScan kernel +namespace alpaka::trait { + // Variable size shared mem + template + struct BlockSharedMemDynSizeBytes, TAcc> { + template + ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes( + cms::alpakatools::multiBlockPrefixScan const& /* kernel */, + TVec const& /* blockThreadExtent */, + TVec const& /* threadElemExtent */, + T const* /* ci */, + T const* /* co */, + int32_t /* size */, + int32_t numBlocks, + int32_t const* /* pc */, + // This trait function does not receive the accelerator object to look up the warp size + std::size_t warpSize) { + // We need workspace (T[warpsize]) + partial sums (T[numblocks]). 
+ if constexpr (cms::alpakatools::requires_single_thread_per_block_v) { + return sizeof(T) * numBlocks; + } else { + return sizeof(T) * (warpSize + numBlocks); + } + } + }; + +} // namespace alpaka::trait + +#endif // HeterogeneousCore_AlpakaInterface_interface_prefixScan_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h new file mode 100644 index 0000000000000..893132905f934 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -0,0 +1,427 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h +#define HeterogeneousCore_AlpakaInterface_interface_radixSort_h + +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +namespace cms::alpakatools { + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for(auto idx: elements_with_stride(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + } + + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... 
+ + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for(auto idx: elements_with_stride(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + } + + + // Radix sort implements a bytewise lexicographic order on the input data. + // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first + // and respecting the existing order when binning the values, we achieve the lexicographic ordering. + // The number of bytes actually considered is a parameter template parameter. + // The post processing reorder + // function fixes the order when bitwise ordering is not the order for the underlying type (only based on + // most significant bit for signed types, integer or floating point). + // The floating point numbers are reinterpret_cast into integers in the calling wrapper + // This algorithm requires to run in a single block + template // The post processing reorder function. 
+ ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl( + const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + // we expect a power of 2 here + assert(warpSize && (0 == (warpSize & (warpSize - 1)))); + const std::size_t warpMask = warpSize - 1; + + // Define the bin size (d=8 => 1 byte bin). + constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits; + // Make sure the slices are data aligned + static_assert (0 == dataBits % binBits); + // Make sure the NS parameter makes sense + static_assert (NS > 0 && NS <= sizeof(T)); + constexpr int binsNumber = 1 << binBits; + constexpr int binsMask = binsNumber - 1; + // Prefix scan iterations. NS is counted in full bytes and not slices. + constexpr int initialSortingPass = int(sizeof(T)) - NS; + + // Count/index for the prefix scan + // TODO: rename + auto& c = alpaka::declareSharedVar(acc); + // Temporary storage for prefix scan. Only really needed for first-of-warp keeping + // Then used for thread to bin mapping TODO: change type to byte and remap to + auto& ct = alpaka::declareSharedVar(acc); + // Bin to thread index mapping (used to store the highest thread index within a bin number + // batch of threads. + // TODO: currently initialized to an invalid value, but could also be initialized to the + // lowest possible value (change to bytes?) + auto& cu = alpaka::declareSharedVar(acc); + // TODO we could also have an explicit caching of the current index for each thread. + + // TODO: do those have to be shared? + auto& ibs = alpaka::declareSharedVar(acc); + auto& currentSortingPass = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(size > 0); + // TODO: is this a hard requirement? 
+ ALPAKA_ASSERT_OFFLOAD(blockDimension >= binsNumber); + + currentSortingPass = initialSortingPass; + + auto j = ind; + auto k = ind2; + + // Initializer index order to trivial increment. + for (auto idx: elements_with_stride(acc, size)) { j[idx] = idx; } + alpaka::syncBlockThreads(acc); + + // Iterate on the slices of the data. + while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { + for (auto idx: elements_with_stride(acc, binsNumber)) { c[idx] = 0; } + alpaka::syncBlockThreads(acc); + const auto sortingPassShift = binBits * currentSortingPass; + + // fill bins (count elements in each bin) + for (auto idx: elements_with_stride(acc, size)) { + auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; + alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); + } + alpaka::syncBlockThreads(acc); + + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d ", currentSortingPass - 1); + size_t total = 0; + for(int i=0; i<(int)binsNumber; i++) { + printf("count[%d]=%d ", i, c[i] ); + total += c[i]; + } + printf("total=%zu\n", total); + assert(total == size); + } + // prefix scan "optimized"???... 
+ // TODO: we might be able to reuse the warpPrefixScan function + // Warp level prefix scan + for (auto idx: elements_with_stride(acc, binsNumber)) { + auto x = c[idx]; + auto laneId = idx & warpMask; + + for (int offset = 1; offset < warpSize; offset <<= 1) { + auto y = alpaka::warp::shfl(acc, x, laneId - offset); + if (laneId >= (uint32_t)offset) + x += y; + } + ct[idx] = x; + } + alpaka::syncBlockThreads(acc); + + // Block level completion of prefix scan (add last sum of each preceding warp) + for (auto idx: elements_with_stride(acc, binsNumber)) { + auto ss = (idx / warpSize) * warpSize - 1; + c[idx] = ct[idx]; + for (int i = ss; i > 0; i -= warpSize) + c[idx] += ct[i]; + } + // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin + + /* + //prefix scan for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast: we will fill the new index array downward, from offset c[bin], with one thread per + // bin, working on one set of bin size elements at a time. + // This will reorder the indices by the currently considered slice, otherwise preserving the previous order. 
+ ibs = size - 1; + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d (post prefix scan) ", currentSortingPass - 1); + for(int i=0; i<(int)binsNumber; i++) { + printf("offset[%d]=%d ", i, c[i] ); + } + printf("\n"); + } + + // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations + while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { + // Init + for (auto idx: elements_with_stride(acc, binsNumber)) { + cu[idx] = -1; + ct[idx] = -1; + } + alpaka::syncBlockThreads(acc); + + // Find the highest index for all the threads dealing with a given bin (in cu[]) + // Also record the bin for each thread (in ct[]) + for (auto idx: elements_with_stride(acc, binsNumber)) { + int i = ibs - idx; + int32_t bin = -1; + if (i >= 0) { + bin = (a[j[i]] >> sortingPassShift) & binsMask; + ct[idx] = bin; + alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{}); + } + } + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d (max index) ", currentSortingPass - 1); + for(int i=0; i<(int)binsNumber; i++) { + printf("max_i[%d]=%d ", i, cu[i] ); + } + printf("\n"); + } + + + // FIXME: we can slash a memory access. + for (auto idx: elements_with_stride(acc, binsNumber)) { + int i = ibs - idx; + // Are we still in inside the data? + if (i >= 0) { + int32_t bin = ct[idx]; + // Are we the thread with the highest index (from previous pass)? + if (cu[bin] == i) { + // With the highest index, we are actually the lowest thread number. We will + // work "on behalf of" the higher thread numbers (including ourselves) + // No way around scanning and testing for bin in ct[otherThread] number to find the other threads + for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) { + if (ct[peerThreadIdx] == bin) { + k[--c[bin]] = j[ibs - peerThreadIdx]; + } + } + } + } + /* + int32_t bin = (i >= 0 ? 
((a[j[i]] >> sortingPassShift) & binsMask) : -1); + if (i >= 0 && i == cu[bin]) // ensure to keep them in order: only one thread per bin is active, rest is idle. + // + for (int ii = idx; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - idx; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; // i = ibs - idx, oi = ii - idx => i - oi = ibs - ii; + } + */ + } + alpaka::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + ibs -= binsNumber; + // https://github.com/cms-patatrack/pixeltrack-standalone/pull/210 + // TODO: is this really needed? + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + } + alpaka::syncBlockThreads(acc); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) + ++currentSortingPass; + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && size == 257) { + printf("Pass=%d ", currentSortingPass - 1); + for(int i=0; i<(int)size; i++) { + printf("k[%d]=%d ", i, k[i] ); + } + printf("\n"); + } + } + + if ((dataBits != 8) && (0 == (NS & 1))) + ALPAKA_ASSERT_OFFLOAD(j == ind); // dataBits/binBits is even so ind is correct (the result is in the right location) + + // TODO this copy is (doubly?) redundant with the reorder + if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) + for (auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }; + + alpaka::syncBlockThreads(acc); + + // now move negative first... (if signed) + // TODO: the ind2 => ind copy should have beed deferred. 
We should pass (j != ind) as an extra parameter + reorder(acc, a, ind, ind2, size); + } else { + //static_assert(false); + } + } + + template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, a, ind, ind2, size, dummyReorder); + } + + template ::value && std::is_signed::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, a, ind, ind2, size, reorderSigned); + } + + template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); + using I = int; + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); + } + + template , T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + static_assert(requires_single_thread_per_block_v, "CPU sort (not a radixSort) called wtth wrong accelerator"); + // Initialize the index array + for (std::size_t i = 0; i < size; i++) ind[i] = i; + printf("std::sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); + for (uint32_t i=0; i<10 && i + ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
radixSortMulti(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { + // TODO: check + // Sort multiple blocks of data in v[] separated by in chunks located at offsets[] + // extern __shared__ uint16_t ws[]; + uint16_t * ws = alpaka::getDynSharedMem(acc); + + const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); + auto a = v + offsets[blockIdx]; + auto ind = index + offsets[blockIdx]; + auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx]; + auto size = offsets[blockIdx + 1] - offsets[blockIdx]; + assert(offsets[blockIdx + 1] >= offsets[blockIdx]); + if (0 == alpaka::getIdx(acc)[0]) { + printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", + blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]); + } + if (size > 0) + radixSort(acc, a, ind, ind2, size); + } + + template + struct radixSortMultiWrapper { +/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical +#if defined(__CUDACC__) || defined(__HIPCC__) + //__global__ void __launch_bounds__(256, 4) +#endif +*/ + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + radixSortMulti(acc, v, index, offsets, workspace); + } + }; + + template + struct radixSortMultiWrapper2 { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + radixSortMulti(acc, v, index, offsets, workspace); + } + }; +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_radixSort_h diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 2d204819d740b..463bf5dae4461 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -23,3 +23,60 @@ + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc new file mode 100644 index 0000000000000..f4073997f5c97 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -0,0 +1,106 @@ +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair) "]"; + +struct update { + template + ALPAKA_FN_ACC void operator()( + const TAcc &acc, AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->inc_add(acc, m); + assert(c.first < n); + ind[c.first] = c.second; + for (uint32_t j = c.second; j < c.second + m; ++j) + cont[j] = i; + } + } +}; + +struct finalize { + template + ALPAKA_FN_ACC void operator()( + const TAcc &acc, AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { + assert(dc->get().first == n); + ind[n] = dc->get().second; + } +}; + +TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) { + SECTION("AtomicPairCounter") { + auto const &devices = cms::alpakatools::devices(); + REQUIRE(!devices.empty()); + // run the test on each device + for (auto const &device : devices) { + std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + auto c_d = make_device_buffer(queue); + 
alpaka::memset(queue, c_d, 0); + + std::cout << "- size " << sizeof(AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + auto n_d = make_device_buffer(queue, N); + auto m_d = make_device_buffer(queue, M); + + constexpr uint32_t NUM_VALUES = 10000; + + // Update + const auto blocksPerGrid = 2000u; + const auto threadsPerBlockOrElementsPerThread = 512u; + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::enqueue( + queue, alpaka::createTaskKernel(workDiv, update(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES)); + + // Finalize + const auto blocksPerGridFinalize = 1u; + const auto threadsPerBlockOrElementsPerThreadFinalize = 1u; + const auto workDivFinalize = + make_workdiv(blocksPerGridFinalize, threadsPerBlockOrElementsPerThreadFinalize); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDivFinalize, finalize(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES)); + + auto c_h = make_host_buffer(queue); + auto n_h = make_host_buffer(queue, N); + auto m_h = make_host_buffer(queue, M); + + // copy the results from the device to the host + alpaka::memcpy(queue, c_h, c_d); + alpaka::memcpy(queue, n_h, n_d); + alpaka::memcpy(queue, m_h, m_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + REQUIRE(c_h.data()->get().first == NUM_VALUES); + REQUIRE(n_h[NUM_VALUES] == c_h.data()->get().second); + REQUIRE(n_h[0] == 0); + + for (size_t i = 0; i < NUM_VALUES; ++i) { + auto ib = n_h.data()[i]; + auto ie = n_h.data()[i + 1]; + auto k = m_h.data()[ib++]; + REQUIRE(k < NUM_VALUES); + + for (; ib < ie; ++ib) + REQUIRE(m_h.data()[ib] == k); + } + } + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc new file mode 100644 index 0000000000000..57b80cc9cf275 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc 
@@ -0,0 +1,255 @@ +#include +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer) "]"; + +template +void checkContents(Hist* h, + unsigned int N, + VERIFY& verify, + INCR& incr, + T window, + uint32_t nParts, + const T* v, + const uint32_t* offsets) { + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h->size(ii)) + continue; + auto k = *h->begin(ii); + if (j % 2) + k = *(h->begin(ii) + (h->end(ii) - h->begin(ii)) / 2); +#ifndef NDEBUG + [[maybe_unused]] auto bk = h->bin(v[k]); +#endif + ALPAKA_ASSERT_OFFLOAD(bk == i); + ALPAKA_ASSERT_OFFLOAD(k < offsets[j + 1]); + auto kl = h->bin(v[k] - window); + auto kh = h->bin(v[k] + window); + ALPAKA_ASSERT_OFFLOAD(kl != i); + ALPAKA_ASSERT_OFFLOAD(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h->size(kk + off); + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' 
<< i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + ALPAKA_ASSERT_OFFLOAD(!l); + } + } + int status; + auto* demangled = abi::__cxa_demangle(typeid(Hist).name(), NULL, NULL, &status); + status || printf("Check contents OK with %s\n", demangled); + std::free(demangled); +} + +template +int go(const DevHost& host, const Device& device, Queue& queue) { + std::mt19937 eng(2708); + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr unsigned int N = 12000; + auto v = make_host_buffer(queue, N); + auto v_d = make_device_buffer(queue, N); + alpaka::memcpy(queue, v_d, v); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + + using Hist = HistoContainer; + using HistR = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist{}.capacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + std::cout << "HistoContainer Runtime sized " << (int)(offsetof(HistR, off)) << ' ' << HistR::nbins() << ' ' + << HistR::totbins() << ' ' << HistR{}.capacity() << ' ' << offsetof(HistR, content) - offsetof(HistR, off) + << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / HistR::nbins() << std::endl; + + // Offsets for each histogram. 
+ auto offsets = make_host_buffer(queue, nParts + 1); + auto offsets_d = make_device_buffer(queue, nParts + 1); + + // Compile sized histogram (self contained) + auto h = make_host_buffer(queue); + auto h_d = make_device_buffer(queue); + + // Run time sized histogram + auto hr = make_host_buffer(queue); + // Data storage for histogram content (host) + auto hd = make_host_buffer(queue, N); + auto hr_d = make_device_buffer(queue); + // Data storage for histogram content (device) + auto hd_d = make_device_buffer(queue, N); + + // We iterate the test 5 times. + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + ALPAKA_ASSERT_OFFLOAD(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + alpaka::memcpy(queue, offsets_d, offsets); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + // for(unsigned int i=0;i" << v[i] << std::endl; + // } + alpaka::memcpy(queue, v_d, v); + + alpaka::memset(queue, h_d, 0); + alpaka::memset(queue, hr_d, 0); + alpaka::memset(queue, hd_d, 0); + + alpaka::wait(queue); + + std::cout << "Calling fillManyFromVector - " << h->size() << std::endl; + fillManyFromVector(h_d.data(), nParts, v_d.data(), offsets_d.data(), offsets[10], 256, queue); + + std::cout << "Calling fillManyFromVector(runtime sized) - " << h->size() << std::endl; + typename HistR::View hrv_d; + hrv_d.assoc = hr_d.data(); + hrv_d.offSize = -1; + hrv_d.offStorage = nullptr; + hrv_d.contentSize = N; + hrv_d.contentStorage = hd_d.data(); + fillManyFromVector(hr_d.data(), hrv_d, nParts, v_d.data(), offsets_d.data(), offsets[10], 256, queue); + + alpaka::memcpy(queue, h, h_d); + // For the runtime sized version: + // We need the histogram for non external data (here, the offsets) + // .. and external data storage (here, the contents) + // .. and plug the data storage address into the histo container + alpaka::memcpy(queue, hr, hr_d); + alpaka::memcpy(queue, hd, hd_d); + + // std::cout << "Calling fillManyFromVector - " << h->size() << std::endl; + alpaka::wait(queue); + + // We cannot update the contents address of the histo container before the copy from device happened + typename HistR::View hrv; + hrv.assoc = hr.data(); + hrv.offSize = -1; + hrv.offStorage = nullptr; + hrv.contentSize = N; + hrv.contentStorage = hd.data(); + hr->initStorage(hrv); + + std::cout << "Copied results" << std::endl; + // for(int i =0;i<=10;i++) + // { + // std::cout << offsets[i] <<" - "<< h->size() << std::endl; + // } + + ALPAKA_ASSERT_OFFLOAD(0 == h->off[0]); + ALPAKA_ASSERT_OFFLOAD(offsets[10] == h->size()); + ALPAKA_ASSERT_OFFLOAD(0 == hr->off[0]); + ALPAKA_ASSERT_OFFLOAD(offsets[10] == hr->size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + ALPAKA_ASSERT_OFFLOAD(t1 < N); + ALPAKA_ASSERT_OFFLOAD(t2 < N); + if (T(v[t1] - 
v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + checkContents(h.data(), N, verify, incr, window, nParts, v.data(), offsets.data()); + checkContents(hr.data(), N, verify, incr, window, nParts, v.data(), offsets.data()); + } + + return 0; +} + +TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer), s_tag) { + SECTION("HistoContainerKernel") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + auto const& host = cms::alpakatools::host(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return; + } + // run the test on each device + for (auto const& device : devices) { + std::cout << "Test Histo Container on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + REQUIRE(go(host, device, queue) == 0); + REQUIRE(go(host, device, queue) == 0); + } + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc new file mode 100644 index 0000000000000..5e08c05725888 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -0,0 +1,187 @@ +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct mykernel { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, 
T const* __restrict__ v, uint32_t N) const { + ALPAKA_ASSERT_OFFLOAD(v); + ALPAKA_ASSERT_OFFLOAD(N == 12000); + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) { + printf("start kernel for %d data\n", N); + } + + using Hist = HistoContainer; + + auto& hist = alpaka::declareSharedVar(acc); + auto& ws = alpaka::declareSharedVar(acc); + + // set off zero + for (auto j : elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + // set bins zero + for (auto j : elements_with_stride(acc, Hist::totbins())) { + hist.content[j] = 0; + } + alpaka::syncBlockThreads(acc); + + // count + for (auto j : elements_with_stride(acc, N)) { + hist.count(acc, v[j]); + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(0 == hist.size()); + alpaka::syncBlockThreads(acc); + + // finalize + hist.finalize(acc, ws); + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(N == hist.size()); + + // verify + for ([[maybe_unused]] auto j : elements_with_stride(acc, Hist::nbins())) { + ALPAKA_ASSERT_OFFLOAD(hist.off[j] <= hist.off[j + 1]); + } + alpaka::syncBlockThreads(acc); + + for (auto j : elements_with_stride(acc, 32)) { + ws[j] = 0; // used by prefix scan... 
+ } + alpaka::syncBlockThreads(acc); + + // fill + for (auto j : elements_with_stride(acc, N)) { + hist.fill(acc, v[j], j); + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(0 == hist.off[0]); + ALPAKA_ASSERT_OFFLOAD(N == hist.size()); + + // bin +#ifndef NDEBUG + for (auto j : elements_with_stride(acc, hist.size() - 1)) { + auto p = hist.begin() + j; + ALPAKA_ASSERT_OFFLOAD((*p) < N); + [[maybe_unused]] auto k1 = Hist::bin(v[*p]); + [[maybe_unused]] auto k2 = Hist::bin(v[*(p + 1)]); + ALPAKA_ASSERT_OFFLOAD(k2 >= k1); + } +#endif + + // forEachInWindow + for (auto i : elements_with_stride(acc, hist.size())) { + auto p = hist.begin() + i; + auto j = *p; +#ifndef NDEBUG + auto b0 = Hist::bin(v[j]); +#endif + int tot = 0; + auto ftest = [&](unsigned int k) { + ALPAKA_ASSERT_OFFLOAD(k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); +#ifndef NDEBUG + [[maybe_unused]] int rtot = hist.size(b0); + ALPAKA_ASSERT_OFFLOAD(tot == rtot); +#endif + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + ALPAKA_ASSERT_OFFLOAD(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); +#ifndef NDEBUG + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + ALPAKA_ASSERT_OFFLOAD(tot == rtot); +#endif + } + } +}; + +template +void go(const DevHost& host, const Device& device, Queue& queue) { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + constexpr unsigned int N = 12000; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist{}.capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + auto v = make_host_buffer(queue, N); + auto v_d = make_device_buffer(queue, N); + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + alpaka::memcpy(queue, v_d, v); + + const auto threadsPerBlockOrElementsPerThread = 256u; + const auto blocksPerGrid = 1u; + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, mykernel(), v_d.data(), N)); + } + alpaka::wait(queue); +} + +int main() { + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + auto const& host = cms::alpakatools::host(); + + // run the test on each device + for (auto const& device : devices) { + 
std::cout << "Test One Histo Container on " << alpaka::getName(device) << '\n'; + + auto queue = Queue(device); + + go(host, device, queue); + go(host, device, queue); + go(host, device, queue); + } + + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc new file mode 100644 index 0000000000000..c781f83ff1225 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -0,0 +1,237 @@ +// #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +#include +#include +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +using FLOAT = double; + +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +ALPAKA_FN_HOST_ACC T truncate(T const& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + return c.t; +} + +namespace { + struct testKernel { + template + ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int 
elements, bool doPrint) const { + //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the track index + //size_t gridSize = blockDim.x * gridDim.x; + bool threadZero = !alpaka::getIdx(acc)[0u]; + + // radix sort works in a single block (and the assert macro does not like the comma of the template parameters). + const auto blocksPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto blocksIdx = alpaka::getIdx(acc)[0u]; + assert(1 == blocksPerGrid); + assert(0 == blocksIdx); + assert(elements <= 2048); + + auto &order = alpaka::declareSharedVar(acc); + auto &sws = alpaka::declareSharedVar(acc); + auto &z = alpaka::declareSharedVar(acc); + auto &iz = alpaka::declareSharedVar(acc); +// __shared__ uint16_t order[2048]; +// __shared__ uint16_t sws[2048]; +// __shared__ float z[2048]; +// __shared__ int iz[2048]; + for (auto itrack: elements_with_stride(acc, elements)) { + z[itrack] = gpu_input[itrack]; + iz[itrack] = 10000 * gpu_input[itrack]; + // order[itrack] = itrack; + } + alpaka::syncBlockThreads(acc); + radixSort(acc, z, order, sws, elements); + alpaka::syncBlockThreads(acc); + + //verify + for (auto itrack: elements_with_stride(acc, elements - 1)) { + auto ntrack = order[itrack]; + auto mtrack = order[itrack + 1]; + assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); + } + + alpaka::syncBlockThreads(acc); + + if (doPrint) + if (threadZero) { + for (int itrackO = 0; itrackO < elements; itrackO++) { + int itrack = order[itrackO]; + printf( + "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to " + "radixSort %f\n", + elements, + itrackO, + itrack, + gpu_input[itrack], + z[itrack]); + gpu_product[itrackO] = itrack; + } + } + + alpaka::syncBlockThreads(acc); + radixSort(acc, iz, order, sws, elements); + alpaka::syncBlockThreads(acc); + + for (auto itrack: elements_with_stride(acc, elements - 1)) { + auto ntrack = order[itrack]; + auto mtrack = order[itrack + 1]; + assert(iz[ntrack] 
<= iz[mtrack]); + } + + if (doPrint) + if (threadZero) { + for (int itrackO = 0; itrackO < elements; itrackO++) { + int itrack = order[itrackO]; + printf( + "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to " + "radixSort %d\n", + elements, + itrackO, + itrack, + gpu_input[itrack], + iz[itrack]); + gpu_product[itrackO] = itrack; + } + } + } + }; + + void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { + auto blockSize = 512; // somewhat arbitrary + auto gridSize = 1; // round up to cover the sample size + const auto workdiv = make_workdiv(gridSize, blockSize); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); + alpaka::wait(queue); + } +} // namespace + +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); +// FLOAT* gpu_input; +// int* gpu_product; + + int nmax = 4 * 260; + auto gpu_input_h = cms::alpakatools::make_host_buffer(queue, nmax); + auto i = gpu_input_h.data(); + for (auto v: { + 30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, + 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, + 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, + 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, + 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 
2.9625515938, 2.9641008377, 2.9646151066, + 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, + 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, + 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, + 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, + 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, + 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, + 3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, + 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, + 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, + 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, + 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, + 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, + 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, + 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, + 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, + 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, + 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, + 4.6910686493, 4.6925001144, 
4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, + 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, + 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, + 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, + 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, + 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, + 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427, + 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, + 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, + 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, + 4.7723139628, 4.774812736, 4.8577151299, 4.890037536}) { + *(i++) = v; + } + auto input = gpu_input_h.data(); + for (int i = 0; i < 260; i++) { + input[i + 260] = -input[i]; + input[i + 2 * 260] = input[i] + 10; + input[i + 3 * 260] = -input[i] - 10; + } + auto gpu_input_d = cms::alpakatools::make_device_buffer(queue, nmax); + //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax)); +// cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); + auto gpu_product_d = cms::alpakatools::make_device_buffer(queue, nmax); + // copy the input data to the GPU + alpaka::memcpy(queue, gpu_input_d, gpu_input_h); + //cudaCheck(cudaMemcpy(gpu_input, input, sizeof(FLOAT) * nmax, cudaMemcpyHostToDevice)); + + for (int k = 2; k <= nmax; k++) { + std::random_shuffle(input, input + k); + printf("Test with %d items\n", k); + // sort on the GPU + testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), 
k, false); + alpaka::wait(queue); + } + } + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc new file mode 100644 index 0000000000000..d1de1f1c17cca --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc @@ -0,0 +1,327 @@ +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +using AssocRandomAccess = OneToManyAssocRandomAccess; +using AssocSequential = OneToManyAssocSequential; +using SmallAssoc = OneToManyAssocSequential; +using Multiplicity = OneToManyAssocRandomAccess; +using TK = std::array; + +namespace { + template + ALPAKA_FN_HOST_ACC typename std::make_signed::type toSigned(T v) { + return static_cast::type>(v); + } +} // namespace + +struct countMultiLocal { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + auto& local = alpaka::declareSharedVar(acc); + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0); + if (oncePerSharedMemoryAccess) { + local.zero(); + } + alpaka::syncBlockThreads(acc); + local.count(acc, 2 + i % 4); + alpaka::syncBlockThreads(acc); + if (oncePerSharedMemoryAccess) { + assoc->add(acc, local); + } + } + } +}; + +struct countMulti { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + 
TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + assoc->count(acc, 2 + i % 4); + } + } +}; + +struct verifyMulti { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { + for ([[maybe_unused]] auto i : elements_with_stride(acc, Multiplicity{}.totOnes())) { + ALPAKA_ASSERT_OFFLOAD(m1->off[i] == m2->off[i]); + } + } +}; + +struct count { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + AssocRandomAccess* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, 4 * n)) { + auto k = i / 4; + auto j = i - 4 * k; + ALPAKA_ASSERT_OFFLOAD(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->count(acc, tk[k][j]); + } + } + } +}; + +struct fill { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + AssocRandomAccess* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, 4 * n)) { + auto k = i / 4; + auto j = i - 4 * k; + ALPAKA_ASSERT_OFFLOAD(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->fill(acc, tk[k][j], k); + } + } + } +}; + +struct verify { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc* __restrict__ assoc) const { + ALPAKA_ASSERT_OFFLOAD(assoc->size() < Assoc{}.capacity()); + } +}; + +struct fillBulk { + template + ALPAKA_FN_ACC void operator()( + const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + for (auto k : elements_with_stride(acc, n)) { + auto m = tk[k][3] < MaxElem ? 
4 : 3; + assoc->bulkFill(acc, *apc, &tk[k][0], m); + } + } +}; + +struct verifyBulk { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const { + if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) { + printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes()); + } + ALPAKA_ASSERT_OFFLOAD(toSigned(assoc->size()) < Assoc::ctCapacity()); + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); + + std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes() + << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl; + std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes() + << " Capacity=" << AssocSequential{}.capacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes() + << " Capacity=" << SmallAssoc{}.capacity() << std::endl; + + std::mt19937 eng; + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + auto tr = make_host_buffer[]>(queue, N); + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; 
++j) + tr[j][i] = n; + ++n; + } + ++z; + } + ALPAKA_ASSERT_OFFLOAD(n <= MaxElem); + ALPAKA_ASSERT_OFFLOAD(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + + auto v_d = make_device_buffer[]>(queue, N); + alpaka::memcpy(queue, v_d, tr); + + auto ara_d = make_device_buffer(queue); + alpaka::memset(queue, ara_d, 0); + + const auto threadsPerBlockOrElementsPerThread = 256u; + const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread); + const auto workDiv4N = make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); + + AssocRandomAccess::template launchZero(ara_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, count(), v_d.data(), ara_d.data(), N)); + + AssocRandomAccess::template launchFinalize(ara_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data())); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, fill(), v_d.data(), ara_d.data(), N)); + + auto ara_h = make_host_buffer(queue); + alpaka::memcpy(queue, ara_h, ara_d); + alpaka::wait(queue); + + std::cout << ara_h->size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = ara_h->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + ALPAKA_ASSERT_OFFLOAD(0 == ara_h->size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) 
+ auto dc_d = make_device_buffer(queue); + alpaka::memset(queue, dc_d, 0); + + const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + auto as_d = make_device_buffer(queue); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N)); + + alpaka::enqueue( + queue, alpaka::createTaskKernel(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data())); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data())); + + auto as_h = make_host_buffer(queue); + alpaka::memcpy(queue, as_h, as_d); + + auto dc_h = make_host_buffer(queue); + alpaka::memcpy(queue, dc_h, dc_d); + alpaka::wait(queue); + + alpaka::memset(queue, dc_d, 0); + auto sa_d = make_device_buffer(queue); + alpaka::memset(queue, sa_d, 0); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N)); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data())); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data())); + + std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl; + + std::cout << as_h->size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = as_h->size(i); + if (!(x == 4 || x == 3)) { + std::cout << "i=" << i << " x=" << x << std::endl; + } + ALPAKA_ASSERT_OFFLOAD(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + ALPAKA_ASSERT_OFFLOAD(0 == as_h->size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters + auto m1_d = make_device_buffer(queue); + alpaka::memset(queue, m1_d, 0); + auto m2_d = 
make_device_buffer(queue); + alpaka::memset(queue, m2_d, 0); + + Multiplicity::template launchZero(m1_d.data(), queue); + Multiplicity::template launchZero(m2_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N)); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N)); + + const auto blocksPerGridTotBins = 1u; + const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes(); + const auto workDivTotBins = make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data())); + + Multiplicity::launchFinalize(m1_d.data(), queue); + Multiplicity::launchFinalize(m2_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data())); + + alpaka::wait(queue); + + return 0; + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc new file mode 100644 index 0000000000000..bffee8f1f533d --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +// static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestPrefixScan) "]"; + +template +struct format_traits { +public: + static const constexpr char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n"; +}; + +template <> +struct 
format_traits { +public: + static const constexpr char* failed_msg = "failed(float size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n"; +}; + +template +struct testPrefixScan { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const { + auto& ws = alpaka::declareSharedVar(acc); + auto& c = alpaka::declareSharedVar(acc); + auto& co = alpaka::declareSharedVar(acc); + + for (auto i : elements_with_stride(acc, size)) { + c[i] = 1; + }; + + alpaka::syncBlockThreads(acc); + + blockPrefixScan(acc, c, co, size, ws); + blockPrefixScan(acc, c, size, ws); + + ALPAKA_ASSERT_OFFLOAD(1 == c[0]); + ALPAKA_ASSERT_OFFLOAD(1 == co[0]); + + // TODO: not needed? Not in multi kernel version, not in CUDA version + alpaka::syncBlockThreads(acc); + + for (auto i : elements_with_stride(acc, size)) { + if (0 == i) + continue; + if constexpr (!std::is_floating_point_v) { + if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i]))) + printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]); + } else { + if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i]))) + printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]); + } + ALPAKA_ASSERT_OFFLOAD(c[i] == c[i - 1] + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == i + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == co[i]); + } + } +}; + +/* + * NB: GPU-only, so do not care about elements here. 
+ */ +template +struct testWarpPrefixScan { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const { + if constexpr (!requires_single_thread_per_block_v) { + ALPAKA_ASSERT_OFFLOAD(size <= 32); + auto& c = alpaka::declareSharedVar(acc); + auto& co = alpaka::declareSharedVar(acc); + + uint32_t const blockDimension = alpaka::getWorkDiv(acc)[0u]; + uint32_t const blockThreadIdx = alpaka::getIdx(acc)[0u]; + auto i = blockThreadIdx; + c[i] = 1; + alpaka::syncBlockThreads(acc); + auto laneId = blockThreadIdx & 0x1f; + + warpPrefixScan(acc, laneId, c, co, i); + warpPrefixScan(acc, laneId, c, i); + + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(1 == c[0]); + ALPAKA_ASSERT_OFFLOAD(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDimension, c[i], c[i - 1]); + ALPAKA_ASSERT_OFFLOAD(c[i] == c[i - 1] + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == static_cast(i + 1)); + ALPAKA_ASSERT_OFFLOAD(c[i] == co[i]); + } + } else { + // We should never be called outside of the GPU.
+ ALPAKA_ASSERT_OFFLOAD(false); + } + } +}; + +struct init { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t* v, uint32_t val, uint32_t n) const { + for (auto index : elements_with_stride(acc, n)) { + v[index] = val; + + if (index == 0) + printf("init\n"); + } + } +}; + +struct verify { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t const* v, uint32_t n) const { + for (auto index : elements_with_stride(acc, n)) { + ALPAKA_ASSERT_OFFLOAD(v[index] == index + 1); + + if (index == 0) + printf("verify\n"); + } + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + // auto const& host = cms::alpakatools::host(); + + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + const auto warpSize = alpaka::getWarpSizes(device)[0]; + // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY) + if constexpr (!requires_single_thread_per_block_v) { + std::cout << "warp level" << std::endl; + + const auto threadsPerBlockOrElementsPerThread = 32; + const auto blocksPerGrid = 1; + const auto workDivWarp = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 16)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 5)); + } + + // PORTABLE BLOCK PREFIXSCAN + std::cout << "block level" << std::endl; + + // Running kernel with 1 block, and bs threads per block or elements per thread. + // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version. 
+ for (int bs = 32; bs <= 1024; bs += 32) { + const auto blocksPerGrid2 = 1; + const auto workDivSingleBlock = make_workdiv(blocksPerGrid2, bs); + + std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs + << std::endl; + + // Problem size + for (int j = 1; j <= 1024; ++j) { + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + } + } + + // PORTABLE MULTI-BLOCK PREFIXSCAN + uint32_t num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + std::cout << "multiblock" << std::endl; + num_items *= 10; + + auto input_d = make_device_buffer(queue, num_items); + auto output1_d = make_device_buffer(queue, num_items); + auto blockCounter_d = make_device_buffer(queue); + + const auto nThreadsInit = 256; // NB: 1024 would be better + const auto nBlocksInit = divide_up_by(num_items, nThreadsInit); + const auto workDivMultiBlockInit = make_workdiv(nBlocksInit, nThreadsInit); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDivMultiBlockInit, init(), input_d.data(), 1, num_items)); + alpaka::memset(queue, blockCounter_d, 0); + + const auto nThreads = 1024; + const auto nBlocks = divide_up_by(num_items, nThreads); + const auto workDivMultiBlock = make_workdiv(nBlocks, nThreads); + + std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDivMultiBlock, + multiBlockPrefixScan(), + input_d.data(), + output1_d.data(), + num_items, + nBlocks, + blockCounter_d.data(), + warpSize)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d.data(), num_items)); + + alpaka::wait(queue); // input_d and output1_d end of scope + } // ksize + } + + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc new file mode 100644 index 0000000000000..d84761e630588 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -0,0 +1,285 @@ +#include +#include +#include +using namespace std::chrono_literals; +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct RS { + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union 
{ + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + +template +void go(Queue & queue, bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + std::chrono::high_resolution_clock::duration delta = 0ns; + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + auto v_h = cms::alpakatools::make_host_buffer(queue, N); + //uint16_t ind_h[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v_h[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v_h[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v_h[j] = (j % imax); + if (j % 2 && i % 2) + v_h[j] = -v_h[j]; + } + } + + auto offsets_h = cms::alpakatools::make_host_buffer(queue, blocks + 1); + offsets_h[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets_h[j] = offsets_h[j - 1] + blockSize - 3 * j; + assert(offsets_h[j] <= N); + } + + if (i == 1) { // special cases... 
+ offsets_h[0] = 0; + offsets_h[1] = 0; + offsets_h[2] = 19; + offsets_h[3] = 32 + offsets_h[2]; + offsets_h[4] = 123 + offsets_h[3]; + offsets_h[5] = 256 + offsets_h[4]; + offsets_h[6] = 311 + offsets_h[5]; + offsets_h[7] = 2111 + offsets_h[6]; + offsets_h[8] = 256 * 11 + offsets_h[7]; + offsets_h[9] = 44 + offsets_h[8]; + offsets_h[10] = 3297 + offsets_h[9]; + } + + std::random_shuffle(v_h.data(), v_h.data() + N); + + auto v_d = cms::alpakatools::make_device_buffer(queue, N); + auto ind_d = cms::alpakatools::make_device_buffer(queue, N); + auto ind_h = cms::alpakatools::make_host_buffer(queue, N); + auto ws_d = cms::alpakatools::make_device_buffer(queue, N); + auto off_d = cms::alpakatools::make_device_buffer(queue, blocks + 1); + + alpaka::memcpy(queue, v_d, v_h); + alpaka::memcpy(queue, off_d, offsets_h); +// cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +// cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "launch for " << offsets_h[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 
256 : 256; + + auto start = std::chrono::high_resolution_clock::now(); + // TODO: manage runtime sized shared memory + [[maybe_unused]] constexpr int MaxSize = 256 * 32; + auto workdiv = make_workdiv(blocks, ntXBl); + if (useShared) + // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier + // + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper{}, v_d.data(), ind_d.data(), off_d.data(), nullptr)); +// cms::cuda::launch( +// radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else +// cms::cuda::launch( +// radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); + + if (i == 0) + std::cout << "done for " << offsets_h[blocks] << std::endl; + + alpaka::memcpy(queue, ind_h, ind_d); + alpaka::wait(queue); + //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += std::chrono::high_resolution_clock::now() - start; + + if (i == 0) + std::cout << "done for " << offsets_h[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl; + std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl; + std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' ' + << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets_h[ib + 1] > offsets_h[ib]) + inds.insert(ind_h[offsets_h[ib]]); + for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { + if (inds.count(ind_h[j])) { + printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", + i, ib, j, ind_h[j]); + std::vector counts; + counts.resize(offsets_h[ib + 1] - 
offsets_h[ib], 0); + for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) { + counts[ind_h[j2]]++; + } + for (size_t j2 = 0; j2 < counts.size(); j2++) { + if (counts[j2]!=1) + printf("counts[%ld]=%d ", j2, counts[j2]); + } + printf("\n"); + } + assert(0 == inds.count(ind_h[j])); + inds.insert(ind_h[j]); + auto a = v_h.data() + offsets_h[ib]; + auto k1 = a[ind_h[j]]; + auto k2 = a[ind_h[j - 1]]; + truncate(k1); + truncate(k2); + if (k1 < k2) { + std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 + << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 + << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; + assert(false); + } + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib]) + << std::endl; + // + assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib])); + } + } // 50 times + std::cout << "Kernel computation took " << std::chrono::duration_cast(delta).count() / 50. 
+ << " ms per pass" << std::endl; +} + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + Queue queue(device); + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + // go(v_h); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + // go(v_h); + } + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc new file mode 100644 index 0000000000000..c29b571c6d356 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc @@ -0,0 +1,101 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +struct vector_pushback { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, 
SimpleVector* foo) const { + for (auto index : elements_with_stride(acc)) + foo->push_back(acc, index); + } +}; + +struct vector_reset { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { + foo->reset(); + } +}; + +struct vector_emplace_back { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { + for (auto index : elements_with_stride(acc)) + foo->emplace_back(acc, index); + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); + auto maxN = 10000; + auto vec_h = make_host_buffer>(queue); + auto vec_d = make_device_buffer>(queue); + auto data_h = make_host_buffer(queue, maxN); + auto data_d = make_device_buffer(queue, maxN); + + [[maybe_unused]] auto v = make_SimpleVector(maxN, data_d.data()); + + // Prepare the vec object on the host + auto tmp_vec_h = make_host_buffer>(queue); + make_SimpleVector(tmp_vec_h.data(), maxN, data_d.data()); + assert(tmp_vec_h->size() == 0); + assert(tmp_vec_h->capacity() == static_cast(maxN)); + + // ... and copy the object to the device. + alpaka::memcpy(queue, vec_d, tmp_vec_h); + alpaka::wait(queue); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + const auto workDiv = make_workdiv(numBlocks, numThreadsPerBlock); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_pushback(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + printf("vec_h->size()=%d, numBlocks * numThreadsPerBlock=%d, maxN=%d\n", + vec_h->size(), + numBlocks * numThreadsPerBlock, + maxN); + assert(vec_h->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_reset(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + + assert(vec_h->size() == 0); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_emplace_back(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + + assert(vec_h->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + + alpaka::memcpy(queue, data_h, data_d); + } + std::cout << "TEST PASSED" << std::endl; + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc new file mode 100644 index 0000000000000..f4f4719192c12 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -0,0 +1,147 @@ +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +// Kernel running a loop over threads/elements +// One function with multiple flavors +enum class RangeType { + Default, ExtentLimited, ExtentLimitedWithShift +}; + +enum class LoopScope { + Block, Grid +}; + +template +bool constexpr firstInLoopRange(TAcc const& acc) { + if constexpr (loopScope == LoopScope::Block) + return !alpaka::getIdx(acc)[0u]; + if constexpr (loopScope == LoopScope::Grid) + return !alpaka::getIdx(acc)[0u]; + assert(false); +} + +template +size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { + if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + return shift < size ? 
size - shift: 0; + else if constexpr (rangeType == RangeType::ExtentLimited) + return size; + else /* rangeType == RangeType::Default */ + if constexpr (loopScope == LoopScope::Block) + return alpaka::getWorkDiv(acc)[0u]; + else + return alpaka::getWorkDiv(acc)[0u]; +} + +template +size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) { + if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + return shift < size ? size - shift: 0; + else if constexpr (rangeType == RangeType::ExtentLimited) + return size; + else /* rangeType == RangeType::Default */ + if constexpr (loopScope == LoopScope::Block) + return workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; + else + return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; +} + +template +struct testWordDivisionDefaultRange { + template + ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const { + size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); + // Init the counter for block range. Grid range does so my mean of memset. + if constexpr (loopScope == LoopScope::Block) { + if (firstInLoopRange(acc)) + counter = 0; + alpaka::syncBlockThreads(acc); + } + // The loop we are testing + if constexpr (rangeType == RangeType::Default) + for ([[maybe_unused]] auto idx: elements_with_stride(acc)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + else if constexpr (rangeType == RangeType::ExtentLimited) + for ([[maybe_unused]] auto idx: elements_with_stride(acc, size)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + for ([[maybe_unused]]auto idx: elements_with_stride(acc, size, shift)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + alpaka::syncBlockThreads(acc); + // Check the result. 
Grid range will check by memcpy-ing the result. + if constexpr (loopScope == LoopScope::Block) { + if (firstInLoopRange(acc)) { + auto expected = expectedCount(acc, size, shift); + assert(counter == expected); + } + } + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + // Get global memory + Queue queue(device); + auto counter_d = cms::alpakatools::make_device_buffer(queue); + auto counter_h = cms::alpakatools::make_host_buffer(queue); + alpaka::memset(queue, counter_d, 0); + ssize_t BlockSize = 512; + size_t GridSize = 4; + for (size_t blocks = 1; blocks < GridSize * 3; blocks++) + for (auto sizeFuzz: std::initializer_list{ - BlockSize / 2, -1, 0, 1, BlockSize/2 }) + for (auto shift: std::initializer_list{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) { + // Grid level iteration: we need to initialize/check at the grid level + // Default range + alpaka::memset(queue, counter_d, 0); + auto workdiv = make_workdiv(BlockSize, GridSize); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + auto expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // ExtentLimited range + alpaka::memset(queue, counter_d, 0); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + expected = 
expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // ExtentLimitedWithShift range + alpaka::memset(queue, counter_d, 0); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // Block level auto tests + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + } + alpaka::wait(queue); + } +} \ No newline at end of file diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index ee8048431ab6d..416e647c349a4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x == 0) ++p; __syncthreads(); - } + } if ((w != 8) && (0 == (NS & 1))) assert(j == ind); // w/d is even so ind is correct @@ -261,7 +261,7 @@ namespace cms { template __global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu 
b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu index 8b6ffd70fd4e6..6b3ee2e4afa9c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu @@ -74,7 +74,7 @@ namespace { __syncthreads(); radixSort(z, order, sws, elements); __syncthreads(); - + //verify for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) { auto ntrack = order[itrack]; From 9af4710b6185c1cea3bea71fbd17daba6d786b69 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 15 Jan 2024 14:35:23 +0100 Subject: [PATCH 2/3] Moves from elements_with_stride to independent_group_elements --- .../AlpakaInterface/interface/radixSort.h | 32 ++++++++++++------- .../test/alpaka/testRadixSort.dev.cc | 4 +-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h index 893132905f934..bcf1da16a5533 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -1,9 +1,13 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h #define HeterogeneousCore_AlpakaInterface_interface_radixSort_h +#include #include +#include #include + #include + #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" namespace cms::alpakatools { @@ -125,17 +129,17 @@ namespace cms::alpakatools { auto k = ind2; // Initializer index order to trivial increment. - for (auto idx: elements_with_stride(acc, size)) { j[idx] = idx; } + for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; } alpaka::syncBlockThreads(acc); // Iterate on the slices of the data. 
while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { - for (auto idx: elements_with_stride(acc, binsNumber)) { c[idx] = 0; } + for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; } alpaka::syncBlockThreads(acc); const auto sortingPassShift = binBits * currentSortingPass; // fill bins (count elements in each bin) - for (auto idx: elements_with_stride(acc, size)) { + for (auto idx: independent_group_elements(acc, size)) { auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); } @@ -154,7 +158,7 @@ namespace cms::alpakatools { // prefix scan "optimized"???... // TODO: we might be able to reuse the warpPrefixScan function // Warp level prefix scan - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { auto x = c[idx]; auto laneId = idx & warpMask; @@ -168,7 +172,7 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // Block level completion of prefix scan (add last sum of each preceding warp) - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { auto ss = (idx / warpSize) * warpSize - 1; c[idx] = ct[idx]; for (int i = ss; i > 0; i -= warpSize) @@ -198,7 +202,7 @@ namespace cms::alpakatools { // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { // Init - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { cu[idx] = -1; ct[idx] = -1; } @@ -206,7 +210,7 @@ namespace cms::alpakatools { // Find the highest index for all the threads dealing with a given bin (in cu[]) // Also record the bin for each thread (in ct[]) - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { int i = ibs - idx; int32_t bin 
= -1; if (i >= 0) { @@ -226,7 +230,7 @@ namespace cms::alpakatools { // FIXME: we can slash a memory access. - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { int i = ibs - idx; // Are we still in inside the data? if (i >= 0) { @@ -302,7 +306,7 @@ namespace cms::alpakatools { // TODO this copy is (doubly?) redundant with the reorder if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) - for (auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }; + for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; }; alpaka::syncBlockThreads(acc); @@ -360,8 +364,9 @@ namespace cms::alpakatools { const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { static_assert(requires_single_thread_per_block_v, "CPU sort (not a radixSort) called wtth wrong accelerator"); // Initialize the index array - for (std::size_t i = 0; i < size; i++) ind[i] = i; - printf("std::sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); + std::iota(ind, ind + size, 0); + /* + printf("std::stable_sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); for (uint32_t i=0; i<10 && i diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc index d84761e630588..df41c649b91cb 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -193,7 +193,7 @@ void go(Queue & queue, bool useShared) { if (offsets_h[ib + 1] > offsets_h[ib]) inds.insert(ind_h[offsets_h[ib]]); for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { - if (inds.count(ind_h[j])) { + if (inds.count(ind_h[j]) != 0) { printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]); std::vector counts; @@ -206,6 +206,7 @@ void go(Queue & 
queue, bool useShared) { printf("counts[%ld]=%d ", j2, counts[j2]); } printf("\n"); + printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j])); } assert(0 == inds.count(ind_h[j])); inds.insert(ind_h[j]); @@ -218,7 +219,6 @@ void go(Queue & queue, bool useShared) { std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; - assert(false); } } if (!inds.empty()) { From e03dde38247546aa50d232ab9722ef1eb3961db5 Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Mon, 15 Jan 2024 20:11:47 +0100 Subject: [PATCH 3/3] Final fixes to unit tests of port to Alpaka Reinstates error messages instead of silent call to `REQUIRE(!devices.empty());` Adds maybe_unused attribute for variable used in assert. --- .../AlpakaInterface/interface/VecArray.h | 2 +- .../AlpakaInterface/interface/radixSort.h | 206 ++++++++++-------- .../AlpakaInterface/interface/workdivision.h | 6 + .../AlpakaInterface/test/BuildFile.xml | 1 - .../test/alpaka/testAtomicPairCounter.dev.cc | 7 +- .../test/alpaka/testOneHistoContainer.dev.cc | 2 +- .../test/alpaka/testOneRadixSort.dev.cc | 110 +++++----- .../test/alpaka/testRadixSort.dev.cc | 64 +++--- .../test/alpaka/testWorkDivision.dev.cc | 114 +++++++--- .../CUDAUtilities/interface/radixSort.h | 4 +- .../CUDAUtilities/test/oneRadixSort_t.cu | 2 +- .../CUDAUtilities/test/radixSort_t.cu | 9 +- 12 files changed, 308 insertions(+), 219 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h index c4025dd42d4a6..231170212a37b 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h +++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h @@ -42,7 +42,7 @@ namespace cms::alpakatools { } } - inline constexpr T cont& back() const { + inline constexpr T const &back()
const { if (m_size > 0) { return m_data[m_size - 1]; } else diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h index bcf1da16a5533..0f94ad200efd9 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -9,6 +9,7 @@ #include #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace cms::alpakatools { @@ -26,20 +27,27 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { - if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + for (auto idx : independent_group_elements(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) { firstNeg = idx + 1; + } } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg, size)) { + ind2[idx - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } template @@ -52,51 +60,56 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { + for (auto idx : independent_group_elements(acc, size - 1)) { if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) firstNeg = idx + 1; } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; } + for (auto idx : 
independent_group_elements(acc, firstNeg, size)) { + ind2[size - idx - 1] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } - // Radix sort implements a bytewise lexicographic order on the input data. // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first // and respecting the existing order when binning the values, we achieve the lexicographic ordering. // The number of bytes actually considered is a parameter template parameter. // The post processing reorder - // function fixes the order when bitwise ordering is not the order for the underlying type (only based on + // function fixes the order when bitwise ordering is not the order for the underlying type (only based on // most significant bit for signed types, integer or floating point). // The floating point numbers are reinterpret_cast into integers in the calling wrapper // This algorithm requires to run in a single block template // The post processing reorder function. + typename T, // shall be integer, signed or not does not matter here + int NS, // number of significant bytes to use in sorting. + typename RF> // The post processing reorder function. 
ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl( const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { if constexpr (!requires_single_thread_per_block_v) { const auto warpSize = alpaka::warp::getSize(acc); const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); // we expect a power of 2 here assert(warpSize && (0 == (warpSize & (warpSize - 1)))); - const std::size_t warpMask = warpSize - 1; + const std::size_t warpMask = warpSize - 1; // Define the bin size (d=8 => 1 byte bin). constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits; // Make sure the slices are data aligned - static_assert (0 == dataBits % binBits); + static_assert(0 == dataBits % binBits); // Make sure the NS parameter makes sense - static_assert (NS > 0 && NS <= sizeof(T)); + static_assert(NS > 0 && NS <= sizeof(T)); constexpr int binsNumber = 1 << binBits; constexpr int binsMask = binsNumber - 1; // Prefix scan iterations. NS is counted in full bytes and not slices. @@ -106,7 +119,7 @@ namespace cms::alpakatools { // TODO: rename auto& c = alpaka::declareSharedVar(acc); // Temporary storage for prefix scan. Only really needed for first-of-warp keeping - // Then used for thread to bin mapping TODO: change type to byte and remap to + // Then used for thread to bin mapping TODO: change type to byte and remap to auto& ct = alpaka::declareSharedVar(acc); // Bin to thread index mapping (used to store the highest thread index within a bin number // batch of threads. @@ -114,7 +127,7 @@ namespace cms::alpakatools { // lowest possible value (change to bytes?) auto& cu = alpaka::declareSharedVar(acc); // TODO we could also have an explicit caching of the current index for each thread. - + // TODO: do those have to be shared? 
auto& ibs = alpaka::declareSharedVar(acc); auto& currentSortingPass = alpaka::declareSharedVar(acc); @@ -129,36 +142,40 @@ namespace cms::alpakatools { auto k = ind2; // Initializer index order to trivial increment. - for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; } + for (auto idx : independent_group_elements(acc, size)) { + j[idx] = idx; + } alpaka::syncBlockThreads(acc); // Iterate on the slices of the data. while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { - for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; } + for (auto idx : independent_group_elements(acc, binsNumber)) { + c[idx] = 0; + } alpaka::syncBlockThreads(acc); const auto sortingPassShift = binBits * currentSortingPass; // fill bins (count elements in each bin) - for (auto idx: independent_group_elements(acc, size)) { + for (auto idx : independent_group_elements(acc, size)) { auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); - } + } alpaka::syncBlockThreads(acc); - - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d ", currentSortingPass - 1); + + if (!threadIdxLocal && 1 == alpaka::getIdx(acc)[0]) { + // printf("Pass=%d, Block=%d, ", currentSortingPass - 1, alpaka::getIdx(acc)[0]); size_t total = 0; - for(int i=0; i<(int)binsNumber; i++) { - printf("count[%d]=%d ", i, c[i] ); + for (int i = 0; i < (int)binsNumber; i++) { + // printf("count[%d]=%d ", i, c[i] ); total += c[i]; } - printf("total=%zu\n", total); + // printf("total=%zu\n", total); assert(total == size); } // prefix scan "optimized"???... 
// TODO: we might be able to reuse the warpPrefixScan function // Warp level prefix scan - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto x = c[idx]; auto laneId = idx & warpMask; @@ -170,16 +187,16 @@ namespace cms::alpakatools { ct[idx] = x; } alpaka::syncBlockThreads(acc); - + // Block level completion of prefix scan (add last sum of each preceding warp) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto ss = (idx / warpSize) * warpSize - 1; c[idx] = ct[idx]; for (int i = ss; i > 0; i -= warpSize) c[idx] += ct[i]; } // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin - + /* //prefix scan for the nulls (for documentation) if (threadIdxLocal==0) @@ -191,18 +208,11 @@ namespace cms::alpakatools { // This will reorder the indices by the currently considered slice, otherwise preserving the previous order. 
ibs = size - 1; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (post prefix scan) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("offset[%d]=%d ", i, c[i] ); - } - printf("\n"); - } // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { // Init - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { cu[idx] = -1; ct[idx] = -1; } @@ -210,7 +220,7 @@ namespace cms::alpakatools { // Find the highest index for all the threads dealing with a given bin (in cu[]) // Also record the bin for each thread (in ct[]) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; int32_t bin = -1; if (i >= 0) { @@ -220,24 +230,16 @@ namespace cms::alpakatools { } } alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (max index) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("max_i[%d]=%d ", i, cu[i] ); - } - printf("\n"); - } - // FIXME: we can slash a memory access. - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; // Are we still in inside the data? if (i >= 0) { int32_t bin = ct[idx]; // Are we the thread with the highest index (from previous pass)? if (cu[bin] == i) { - // With the highest index, we are actually the lowest thread number. We will + // With the highest index, we are actually the lowest thread number. 
We will // work "on behalf of" the higher thread numbers (including ourselves) // No way around scanning and testing for bin in ct[otherThread] number to find the other threads for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) { @@ -292,21 +294,17 @@ namespace cms::alpakatools { if (threadIdxLocal == 0) ++currentSortingPass; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && size == 257) { - printf("Pass=%d ", currentSortingPass - 1); - for(int i=0; i<(int)size; i++) { - printf("k[%d]=%d ", i, k[i] ); - } - printf("\n"); - } } if ((dataBits != 8) && (0 == (NS & 1))) - ALPAKA_ASSERT_OFFLOAD(j == ind); // dataBits/binBits is even so ind is correct (the result is in the right location) + ALPAKA_ASSERT_OFFLOAD(j == + ind); // dataBits/binBits is even so ind is correct (the result is in the right location) // TODO this copy is (doubly?) redundant with the reorder if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) - for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; }; + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + }; alpaka::syncBlockThreads(acc); @@ -321,41 +319,36 @@ namespace cms::alpakatools { template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, T>::type* = + nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && std::is_signed::value && + !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE 
void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, reorderSigned); } template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); using I = int; - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); } - + template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(const TAcc& acc, - T const* v, - uint16_t* index, - uint32_t const* offsets, - uint16_t* workspace) { - // TODO: check + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti( + const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + // TODO: check // Sort multiple blocks of data in v[] separated by in chunks located at offsets[] // extern __shared__ uint16_t ws[]; - uint16_t * ws = alpaka::getDynSharedMem(acc); + uint16_t* ws = alpaka::getDynSharedMem(acc); const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); auto a = v + offsets[blockIdx]; @@ -402,23 +392,24 @@ namespace cms::alpakatools { auto ind2 = nullptr == workspace ? 
ws : workspace + offsets[blockIdx]; auto size = offsets[blockIdx + 1] - offsets[blockIdx]; assert(offsets[blockIdx + 1] >= offsets[blockIdx]); - if (0 == alpaka::getIdx(acc)[0]) { - printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", - blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]); - } if (size > 0) radixSort(acc, a, ind, ind2, size); } - + template struct radixSortMultiWrapper { -/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical + /* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical (keeping CUDA/HIP code for reference for the moment) #if defined(__CUDACC__) || defined(__HIPCC__) //__global__ void __launch_bounds__(256, 4) #endif */ template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; @@ -426,10 +417,39 @@ namespace cms::alpakatools { template struct radixSortMultiWrapper2 { template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; } // namespace cms::alpakatools +namespace alpaka::trait { + // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of + // block shared dynamic memory for the radixSortMultiWrapper kernel + template + struct BlockSharedMemDynSizeBytes, TAcc> { + // the size in bytes of the shared memory allocated for a block + ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes( + 
cms::alpakatools::radixSortMultiWrapper const& /* kernel */, + alpaka_common::Vec1D /* threads */, + alpaka_common::Vec1D /* elements */, + T const* /* v */, + uint16_t* /* index */, + uint32_t const* /* offsets */, + uint16_t* workspace, + size_t sharedMemBytes) { + if (workspace != nullptr) + return 0; + /* The shared memory workspace is 'blockspace * 2' in CUDA *but that's a value coincidence... TODO: check */ + //printf ("in BlockSharedMemDynSizeBytes, TAcc>: shared mem size = %d\n", (int)sharedMemBytes); + return sharedMemBytes; + } + }; +} // namespace alpaka::trait + #endif // HeterogeneousCore_AlpakaInterface_interface_radixSort_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 4c0aa9fe5f2b9..e02f4e92f813e 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -701,6 +701,12 @@ namespace cms::alpakatools { stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + thread_{alpaka::getIdx(acc)[0u] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + class const_iterator; using iterator = const_iterator; diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 463bf5dae4461..0198e36f9166f 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -58,7 +58,6 @@ - diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc index f4073997f5c97..1687feb8c1bab 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc +++ 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -42,7 +42,12 @@ struct finalize { TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) { SECTION("AtomicPairCounter") { auto const &devices = cms::alpakatools::devices(); - REQUIRE(!devices.empty()); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + REQUIRE(not devices.empty()); + } + // run the test on each device for (auto const &device : devices) { std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n'; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc index 5e08c05725888..4ce11cc7facdd 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -93,7 +93,7 @@ struct mykernel { #ifndef NDEBUG auto b0 = Hist::bin(v[j]); #endif - int tot = 0; + [[maybe_unused]] int tot = 0; auto ftest = [&](unsigned int k) { ALPAKA_ASSERT_OFFLOAD(k < N); ++tot; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc index c781f83ff1225..21f3477a3dd92 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -64,7 +64,8 @@ ALPAKA_FN_HOST_ACC T truncate(T const& t) { namespace { struct testKernel { template - ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { + ALPAKA_FN_ACC void operator()( + const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // 
This is going to be the track index //size_t gridSize = blockDim.x * gridDim.x; bool threadZero = !alpaka::getIdx(acc)[0u]; @@ -76,15 +77,15 @@ namespace { assert(0 == blocksIdx); assert(elements <= 2048); - auto &order = alpaka::declareSharedVar(acc); - auto &sws = alpaka::declareSharedVar(acc); - auto &z = alpaka::declareSharedVar(acc); - auto &iz = alpaka::declareSharedVar(acc); -// __shared__ uint16_t order[2048]; -// __shared__ uint16_t sws[2048]; -// __shared__ float z[2048]; -// __shared__ int iz[2048]; - for (auto itrack: elements_with_stride(acc, elements)) { + auto& order = alpaka::declareSharedVar(acc); + auto& sws = alpaka::declareSharedVar(acc); + auto& z = alpaka::declareSharedVar(acc); + auto& iz = alpaka::declareSharedVar(acc); + // __shared__ uint16_t order[2048]; + // __shared__ uint16_t sws[2048]; + // __shared__ float z[2048]; + // __shared__ int iz[2048]; + for (auto itrack : elements_with_stride(acc, elements)) { z[itrack] = gpu_input[itrack]; iz[itrack] = 10000 * gpu_input[itrack]; // order[itrack] = itrack; @@ -94,7 +95,7 @@ namespace { alpaka::syncBlockThreads(acc); //verify - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); @@ -122,7 +123,7 @@ namespace { radixSort(acc, iz, order, sws, elements); alpaka::syncBlockThreads(acc); - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(iz[ntrack] <= iz[mtrack]); @@ -146,11 +147,12 @@ namespace { } }; - void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { + void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { auto blockSize = 512; // somewhat arbitrary auto gridSize = 1; // 
round up to cover the sample size const auto workdiv = make_workdiv(gridSize, blockSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); alpaka::wait(queue); } } // namespace @@ -165,50 +167,54 @@ int main() { << ", the test will be skipped.\n"; return 0; } - + // run the test on each device for (auto const& device : devices) { Queue queue(device); -// FLOAT* gpu_input; -// int* gpu_product; + // FLOAT* gpu_input; + // int* gpu_product; int nmax = 4 * 260; auto gpu_input_h = cms::alpakatools::make_host_buffer(queue, nmax); auto i = gpu_input_h.data(); - for (auto v: { - 30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, - 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, - 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, - 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, - 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, - 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, - 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, - 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, - 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, - 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, - 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, - 
3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, - 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, - 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, - 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, - 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, - 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, - 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, - 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, - 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, - 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, - 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, - 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, - 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, - 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, - 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, - 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, - 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, - 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 
4.7143669128, 4.7145690918, 4.7148418427, - 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, - 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, - 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, - 4.7723139628, 4.774812736, 4.8577151299, 4.890037536}) { + for (auto v : {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, + 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, + 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, + 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, + 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026, + 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411, + 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456, + 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, + 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, + 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, + 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, + 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, + 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759, + 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729, + 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928, + 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 
3.1001036167, 3.1173279285, + 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, + 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, + 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, + 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, + 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346, + 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297, + 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066, + 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, + 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, + 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, + 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, + 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, + 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083, + 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332, + 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947, + 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, + 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, + 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, + 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, + 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 
4.742254734, + 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299, + 4.890037536}) { *(i++) = v; } auto input = gpu_input_h.data(); @@ -219,7 +225,7 @@ int main() { } auto gpu_input_d = cms::alpakatools::make_device_buffer(queue, nmax); //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax)); -// cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); + // cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); auto gpu_product_d = cms::alpakatools::make_device_buffer(queue, nmax); // copy the input data to the GPU alpaka::memcpy(queue, gpu_input_d, gpu_input_h); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc index df41c649b91cb..9caf2af20eb9c 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -81,7 +81,7 @@ void truncate(T& t) { } template -void go(Queue & queue, bool useShared) { +void go(Queue& queue, bool useShared) { std::mt19937 eng; //std::mt19937 eng2; auto rgen = RS::ud(); @@ -145,46 +145,48 @@ void go(Queue & queue, bool useShared) { alpaka::memcpy(queue, v_d, v_h); alpaka::memcpy(queue, off_d, offsets_h); -// cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); -// cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "launch for " << offsets_h[blocks] << std::endl; - auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; + auto ntXBl = 1 == i % 4 ? 256 : 256; auto start = std::chrono::high_resolution_clock::now(); - // TODO: manage runtime sized shared memory - [[maybe_unused]] constexpr int MaxSize = 256 * 32; + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). 
+ constexpr int MaxSize = 256 * 32; auto workdiv = make_workdiv(blocks, ntXBl); if (useShared) // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier - // - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper{}, v_d.data(), ind_d.data(), off_d.data(), nullptr)); -// cms::cuda::launch( -// radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + // + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper{}, + v_d.data(), + ind_d.data(), + off_d.data(), + nullptr, + MaxSize * sizeof(uint16_t))); else -// cms::cuda::launch( -// radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workdiv, radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "launch done for " << offsets_h[blocks] << std::endl; alpaka::memcpy(queue, ind_h, ind_d); alpaka::wait(queue); - //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += std::chrono::high_resolution_clock::now() - start; - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl; if (32 == i) { std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl; - std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl; + std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) + << std::endl; std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << 
LL(v_h[ind_h[blockSize / 2]]) << ' ' << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl; } @@ -194,17 +196,16 @@ void go(Queue & queue, bool useShared) { inds.insert(ind_h[offsets_h[ib]]); for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { if (inds.count(ind_h[j]) != 0) { - printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", - i, ib, j, ind_h[j]); + printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]); std::vector counts; counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0); for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) { counts[ind_h[j2]]++; - } + } for (size_t j2 = 0; j2 < counts.size(); j2++) { - if (counts[j2]!=1) + if (counts[j2] != 1) printf("counts[%ld]=%d ", j2, counts[j2]); - } + } printf("\n"); printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j])); } @@ -216,9 +217,12 @@ void go(Queue & queue, bool useShared) { truncate(k1); truncate(k2); if (k1 < k2) { - std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 - << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 - << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; + std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " + << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j] + << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1 + << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl; + //sleep(2); + assert(false); } } if (!inds.empty()) { @@ -228,7 +232,7 @@ void go(Queue & queue, bool useShared) { if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib])) std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib]) << std::endl; - // + // assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib])); } } // 50 times diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc index f4f4719192c12..ce85ad42cb0f4 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -10,14 +10,14 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; // Kernel running a loop over threads/elements // One function with multiple flavors -enum class RangeType { - Default, ExtentLimited, ExtentLimitedWithShift -}; - -enum class LoopScope { - Block, Grid -}; +// The type of elements_with_stride +enum class RangeType { Default, ExtentLimited, ExtentLimitedWithShift }; + +// The concurrency scope between threads +enum class LoopScope { Block, Grid }; + +// Utility for one time initializations template bool constexpr firstInLoopRange(TAcc const& acc) { if constexpr (loopScope == LoopScope::Block) @@ -30,7 +30,7 @@ bool constexpr firstInLoopRange(TAcc const& acc) { template size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -41,9 +41,9 @@ size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { } template -size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) { +size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? 
size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -53,12 +53,13 @@ size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shi return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; } -template +template struct testWordDivisionDefaultRange { template - ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const { - size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); - // Init the counter for block range. Grid range does so my mean of memset. + ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t shift, size_t* globalCounter) const { + size_t& counter = + (loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); + // Init the counter for block range. Grid range does so my mean of memset. if constexpr (loopScope == LoopScope::Block) { if (firstInLoopRange(acc)) counter = 0; @@ -66,13 +67,13 @@ struct testWordDivisionDefaultRange { } // The loop we are testing if constexpr (rangeType == RangeType::Default) - for ([[maybe_unused]] auto idx: elements_with_stride(acc)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimited) - for ([[maybe_unused]] auto idx: elements_with_stride(acc, size)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - for ([[maybe_unused]]auto idx: elements_with_stride(acc, size, shift)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, shift, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); // Check the result. 
Grid range will check by memcpy-ing the result. @@ -103,44 +104,87 @@ int main() { ssize_t BlockSize = 512; size_t GridSize = 4; for (size_t blocks = 1; blocks < GridSize * 3; blocks++) - for (auto sizeFuzz: std::initializer_list{ - BlockSize / 2, -1, 0, 1, BlockSize/2 }) - for (auto shift: std::initializer_list{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) { - // Grid level iteration: we need to initialize/check at the grid level + for (auto sizeFuzz : + std::initializer_list{-10 * BlockSize / 13, -BlockSize / 2, -1, 0, 1, BlockSize / 2}) + for (auto shift : std::initializer_list{0, + 1, + BlockSize / 2, + BlockSize - 1, + BlockSize, + BlockSize + 1, + BlockSize + BlockSize / 2, + 2 * BlockSize - 1, + 2 * BlockSize, + 2 * BlockSize + 1}) { + // Grid level iteration: we need to initialize/check at the grid level // Default range alpaka::memset(queue, counter_d, 0); auto workdiv = make_workdiv(BlockSize, GridSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - auto expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + auto expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimited range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, 
counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimitedWithShift range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = expectedCount( + workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // Block level auto tests - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); } alpaka::wait(queue); } diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 
416e647c349a4..ee8048431ab6d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x == 0) ++p; __syncthreads(); - } + } if ((w != 8) && (0 == (NS & 1))) assert(j == ind); // w/d is even so ind is correct @@ -261,7 +261,7 @@ namespace cms { template __global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu index 6b3ee2e4afa9c..8b6ffd70fd4e6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu @@ -74,7 +74,7 @@ namespace { __syncthreads(); radixSort(z, order, sws, elements); __syncthreads(); - + //verify for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) { auto ntrack = order[itrack]; diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 209ce97347e25..a15bed0ae8a1f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -151,10 +151,15 @@ void go(bool useShared) { auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; delta -= (std::chrono::high_resolution_clock::now() - start); + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). 
constexpr int MaxSize = 256 * 32; if (useShared) - cms::cuda::launch( - radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + cms::cuda::launch(radixSortMultiWrapper, + {blocks, ntXBl, MaxSize * 2 /* sizeof(uint16_t) */}, + v_d.get(), + ind_d.get(), + off_d.get(), + nullptr); else cms::cuda::launch( radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());