From 4af44b358cd4d7b01b4cef37b034c9b28535a294 Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Thu, 19 Oct 2023 11:37:05 +0200 Subject: [PATCH 1/3] Ports prefixScan, OneToManyAssoc and HistoContainer from CUDAUtilities. OneToManyAssoc is also separated into various function depending on use case. Co-authored-by: Andrea Bocci --- .../interface/AtomicPairCounter.h | 68 +++ .../AlpakaInterface/interface/FlexiStorage.h | 47 ++ .../interface/HistoContainer.h | 201 +++++++++ .../interface/OneToManyAssoc.h | 274 +++++++++++ .../AlpakaInterface/interface/SimpleVector.h | 142 ++++++ .../AlpakaInterface/interface/VecArray.h | 114 +++++ .../interface/alpakastdAlgorithm.h | 36 ++ .../AlpakaInterface/interface/prefixScan.h | 226 +++++++++ .../AlpakaInterface/interface/radixSort.h | 427 ++++++++++++++++++ .../AlpakaInterface/test/BuildFile.xml | 57 +++ .../test/alpaka/testAtomicPairCounter.dev.cc | 106 +++++ .../test/alpaka/testHistoContainer.dev.cc | 255 +++++++++++ .../test/alpaka/testOneHistoContainer.dev.cc | 187 ++++++++ .../test/alpaka/testOneRadixSort.dev.cc | 237 ++++++++++ .../test/alpaka/testOneToManyAssoc.dev.cc | 327 ++++++++++++++ .../test/alpaka/testPrefixScan.dev.cc | 219 +++++++++ .../test/alpaka/testRadixSort.dev.cc | 285 ++++++++++++ .../test/alpaka/testSimpleVector.dev.cc | 101 +++++ .../test/alpaka/testWorkDivision.dev.cc | 147 ++++++ .../CUDAUtilities/interface/radixSort.h | 4 +- .../CUDAUtilities/test/oneRadixSort_t.cu | 2 +- 21 files changed, 3459 insertions(+), 3 deletions(-) create mode 100644 HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/VecArray.h create mode 
100644 HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/prefixScan.h create mode 100644 HeterogeneousCore/AlpakaInterface/interface/radixSort.h create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..5c6d1d5719623 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h @@ -0,0 +1,68 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h +#define HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h + +#include + +#include + +namespace cms::alpakatools { + + class AtomicPairCounter { + public: + using DoubleWord = uint64_t; + + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter() : counter_{0} {} + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(uint32_t first, uint32_t second) : counter_{pack(first, second)} {} + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter(DoubleWord values) : counter_{values} {} + + ALPAKA_FN_HOST_ACC constexpr AtomicPairCounter& operator=(DoubleWord values) { + counter_.as_doubleword = 
values; + return *this; + } + + struct Counters { + uint32_t first; // in a "One to Many" association is the number of "One" + uint32_t second; // in a "One to Many" association is the total number of associations + }; + + ALPAKA_FN_ACC constexpr Counters get() const { return counter_.as_counters; } + + // atomically add as_counters, and return the previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Counters add(const TAcc& acc, Counters c) { + Packer value{pack(c.first, c.second)}; + Packer ret{0}; + ret.as_doubleword = + alpaka::atomicAdd(acc, &counter_.as_doubleword, value.as_doubleword, alpaka::hierarchy::Blocks{}); + return ret.as_counters; + } + + // atomically increment first and add i to second, and return the previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters constexpr inc_add(const TAcc& acc, uint32_t i) { + return add(acc, {1u, i}); + } + + private: + union Packer { + DoubleWord as_doubleword; + Counters as_counters; + constexpr Packer(DoubleWord _as_doubleword) : as_doubleword(_as_doubleword) { ; }; + constexpr Packer(Counters _as_counters) : as_counters(_as_counters) { ; }; + }; + + // pack two uint32_t values in a DoubleWord (aka uint64_t) + // this is needed because in c++17 a union can only be aggregate-initialised to its first type + // it can be probably removed with c++20, and replace with a designated initialiser + static constexpr DoubleWord pack(uint32_t first, uint32_t second) { + Packer ret{0}; + ret.as_counters = {first, second}; + return ret.as_doubleword; + } + + Packer counter_; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h b/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h new file mode 100644 index 0000000000000..2bb3aa5f1e15c --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h @@ -0,0 +1,47 @@ + +#ifndef 
#ifndef HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
#define HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h

#include <cstdint>

namespace cms::alpakatools {

  // Fixed-capacity storage: a buffer of S elements of type I embedded in the
  // object itself.  Capacity is known at compile time.
  template <typename I, int S>
  class FlexiStorage {
  public:
    constexpr int capacity() const { return S; }

    constexpr I& operator[](int i) { return m_v[i]; }
    constexpr const I& operator[](int i) const { return m_v[i]; }

    constexpr I* data() { return m_v; }
    constexpr I const* data() const { return m_v; }

  private:
    I m_v[S];
  };

  // Runtime-capacity specialisation: the storage is external and attached via
  // init(); the caller keeps ownership of the buffer.
  template <typename I>
  class FlexiStorage<I, -1> {
  public:
    constexpr void init(I* v, int s) {
      m_v = v;
      m_capacity = s;
    }

    constexpr int capacity() const { return m_capacity; }

    constexpr I& operator[](int i) { return m_v[i]; }
    constexpr const I& operator[](int i) const { return m_v[i]; }

    constexpr I* data() { return m_v; }
    constexpr I const* data() const { return m_v; }

  private:
    I* m_v;
    int m_capacity;
  };

}  // namespace cms::alpakatools

#endif  // HeterogeneousCore_AlpakaInterface_interface_FlexiStorage_h
countFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for (uint32_t i : elements_with_stride(acc, nt)) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->count(acc, v[i], ih); + } + } + }; + + struct fillFromVector { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + for (uint32_t i : elements_with_stride(acc, nt)) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->fill(acc, v[i], i, ih); + } + } + }; + + template + ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + uint32_t nthreads, + TQueue &queue) { + Histo::template launchZero(h, queue); + + const auto threadsPerBlockOrElementsPerThread = nthreads; + const auto blocksPerGrid = divide_up_by(totSize, nthreads); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::exec(queue, workDiv, countFromVector(), h, nh, v, offsets); + Histo::template launchFinalize(h, queue); + + alpaka::exec(queue, workDiv, fillFromVector(), h, nh, v, offsets); + } + + template + ALPAKA_FN_INLINE void fillManyFromVector(Histo *__restrict__ h, + typename Histo::View hv, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + uint32_t nthreads, + TQueue &queue) { + 
Histo::template launchZero(hv, queue); + + const auto threadsPerBlockOrElementsPerThread = nthreads; + const auto blocksPerGrid = divide_up_by(totSize, nthreads); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::exec(queue, workDiv, countFromVector(), h, nh, v, offsets); + Histo::template launchFinalize(h, queue); + + alpaka::exec(queue, workDiv, fillFromVector(), h, nh, v, offsets); + } + + // iteratate over N bins left and right of the one containing "v" + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer : public OneToManyAssocRandomAccess { + public: + using Base = OneToManyAssocRandomAccess; + using View = typename Base::View; + using Counter = typename Base::Counter; + using index_type = typename Base::index_type; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } + static constexpr int32_t nhists() { return NHISTS; } + static constexpr uint32_t nbins() { return NBINS; } + static 
constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + Base::atomicIncrement(acc, this->off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = Base::atomicDecrement(acc, this->off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + this->content[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + Base::atomicIncrement(acc, this->off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + auto w = Base::atomicDecrement(acc, this->off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + this->content[w - 1] = j; + } + }; +} // namespace cms::alpakatools +#endif // HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h new file mode 100644 index 0000000000000..e7834d39a8910 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h @@ -0,0 +1,274 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h +#define HeterogeneousCore_AlpakaInterface_interface_OneToManyAssoc_h + +#include +#include 
+#include +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/AlpakaInterface/interface/FlexiStorage.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +namespace cms::alpakatools { + + template + class OneToManyAssocBase { + public: + using Counter = uint32_t; + + using CountersOnly = OneToManyAssocBase; + + using index_type = I; + + struct View { + OneToManyAssocBase *assoc = nullptr; + Counter *offStorage = nullptr; + index_type *contentStorage = nullptr; + int32_t offSize = -1; + int32_t contentSize = -1; + }; + + static constexpr int32_t ctNOnes() { return ONES; } + constexpr auto totOnes() const { return off.capacity(); } + constexpr auto nOnes() const { return totOnes() - 1; } + static constexpr int32_t ctCapacity() { return SIZE; } + constexpr auto capacity() const { return content.capacity(); } + + ALPAKA_FN_HOST_ACC void initStorage(View view) { + ALPAKA_ASSERT_OFFLOAD(view.assoc == this); + if constexpr (ctCapacity() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.contentStorage); + ALPAKA_ASSERT_OFFLOAD(view.contentSize > 0); + content.init(view.contentStorage, view.contentSize); + } + if constexpr (ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + off.init(view.offStorage, view.offSize); + } + } + + ALPAKA_FN_HOST_ACC void zero() { + for (int32_t i = 0; i < totOnes(); ++i) { + off[i] = 0; + } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const TAcc &acc, CountersOnly const &co) { + for (uint32_t i = 0; static_cast(i) < totOnes(); ++i) { + alpaka::atomicAdd(acc, off.data() + i, co.off[i], alpaka::hierarchy::Blocks{}); + } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicIncrement(const TAcc &acc, Counter &x) { + return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + 
ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicDecrement(const TAcc &acc, Counter &x) { + return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, I b) { + ALPAKA_ASSERT_OFFLOAD(b < static_cast(nOnes())); + atomicIncrement(acc, off[b]); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, I b, index_type j) { + ALPAKA_ASSERT_OFFLOAD(b < static_cast(nOnes())); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + content[w - 1] = j; + } + + // this MUST BE DONE in a single block (or in two kernels!) + struct zeroAndInit { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, View view) const { + auto h = view.assoc; + ALPAKA_ASSERT_OFFLOAD((1 == alpaka::getWorkDiv(acc)[0])); + ALPAKA_ASSERT_OFFLOAD((0 == alpaka::getIdx(acc)[0])); + + auto first = alpaka::getIdx(acc)[0]; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + alpaka::syncBlockThreads(acc); + // TODO use for_each_element_in_grid_strided (or similar) + for (int i = first, nt = h->totOnes(); i < nt; + i += alpaka::getWorkDiv(acc)[0]) { + h->off[i] = 0; + } + } + }; + + template + ALPAKA_FN_INLINE static void launchZero(OneToManyAssocBase *h, TQueue &queue) { + View view = {h, nullptr, nullptr, -1, -1}; + launchZero(view, queue); + } + + template + ALPAKA_FN_INLINE static void launchZero(View view, TQueue &queue) { + if constexpr (ctCapacity() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.contentStorage); + ALPAKA_ASSERT_OFFLOAD(view.contentSize > 0); + } + if constexpr (ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + } + if constexpr (!requires_single_thread_per_block_v) { + auto nthreads = 1024; + auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels); + auto workDiv = cms::alpakatools::make_workdiv(nblocks, nthreads); + alpaka::exec(queue, workDiv, 
zeroAndInit{}, view); + } else { + auto h = view.assoc; + ALPAKA_ASSERT_OFFLOAD(h); + h->initStorage(view); + h->zero(); + h->psws = 0; + } + } + + constexpr auto size() const { return uint32_t(off[totOnes() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return content.data(); } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; } + constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; } + + FlexiStorage off; + FlexiStorage content; + int32_t psws; // prefix-scan working space + }; + + template + class OneToManyAssocSequential : public OneToManyAssocBase { + public: + using index_type = typename OneToManyAssocBase::index_type; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int32_t + bulkFill(const TAcc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.inc_add(acc, n); + if (int(c.first) >= this->nOnes()) + return -int32_t(c.first); + this->off[c.first] = c.second; + for (uint32_t j = 0; j < n; ++j) + this->content[c.second + j] = v[j]; + return c.first; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalize(AtomicPairCounter const &apc) { + this->off[apc.get().first] = apc.get().second; + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(TAcc &acc, AtomicPairCounter const &apc) { + int f = apc.get().first; + auto s = apc.get().second; + if (f >= this->nOnes()) { // overflow! 
+ this->off[this->nOnes()] = uint32_t(this->off[this->nOnes() - 1]); + return; + } + auto first = f + alpaka::getIdx(acc)[0]; + for (int i = first; i < this->totOnes(); i += alpaka::getWorkDiv(acc)[0]) { + this->off[i] = s; + } + } + + struct finalizeBulk { + template + ALPAKA_FN_ACC void operator()(const TAcc &acc, + AtomicPairCounter const *apc, + OneToManyAssocSequential *__restrict__ assoc) const { + assoc->bulkFinalizeFill(acc, *apc); + } + }; + }; + + template + class OneToManyAssocRandomAccess : public OneToManyAssocBase { + public: + using Counter = typename OneToManyAssocBase::Counter; + using View = typename OneToManyAssocBase::View; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize(TAcc &acc, Counter *ws = nullptr) { + ALPAKA_ASSERT_OFFLOAD(this->off[this->totOnes() - 1] == 0); + blockPrefixScan(acc, this->off.data(), this->totOnes(), ws); + ALPAKA_ASSERT_OFFLOAD(this->off[this->totOnes() - 1] == this->off[this->totOnes() - 2]); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void finalize() { + // Single thread finalize. + for (uint32_t i = 1; static_cast(i) < this->totOnes(); ++i) + this->off[i] += this->off[i - 1]; + } + + template + ALPAKA_FN_INLINE static void launchFinalize(OneToManyAssocRandomAccess *h, TQueue &queue) { + View view = {h, nullptr, nullptr, -1, -1}; + launchFinalize(view, queue); + } + + template + ALPAKA_FN_INLINE static void launchFinalize(View view, TQueue &queue) { + // View stores a base pointer, we need to upcast back... 
+ auto h = static_cast(view.assoc); + ALPAKA_ASSERT_OFFLOAD(h); + if constexpr (!requires_single_thread_per_block_v) { + Counter *poff = (Counter *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, off)); + auto nOnes = OneToManyAssocRandomAccess::ctNOnes(); + if constexpr (OneToManyAssocRandomAccess::ctNOnes() < 0) { + ALPAKA_ASSERT_OFFLOAD(view.offStorage); + ALPAKA_ASSERT_OFFLOAD(view.offSize > 0); + nOnes = view.offSize; + poff = view.offStorage; + } + ALPAKA_ASSERT_OFFLOAD(nOnes > 0); + int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(OneToManyAssocRandomAccess, psws)); + auto nthreads = 1024; + auto nblocks = (nOnes + nthreads - 1) / nthreads; + auto workDiv = cms::alpakatools::make_workdiv(nblocks, nthreads); + alpaka::exec(queue, + workDiv, + multiBlockPrefixScan(), + poff, + poff, + nOnes, + nblocks, + ppsws, + alpaka::getWarpSizes(alpaka::getDev(queue))[0]); + } else { + h->finalize(); + } + } + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h new file mode 100644 index 0000000000000..f4937da8626f8 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h @@ -0,0 +1,142 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h +#define HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 +// alpaka integration in cmssw: Adriano Di Florio, 2022 + +#include +#include +#include + +namespace cms::alpakatools { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + 
m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; } + + ALPAKA_FN_ACC inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + // thread safe version of resize + template + ALPAKA_FN_ACC int extend(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int shrink(const TAcc &acc, int size = 1) { + auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize >= size) { + return previousSize - size; + } else { + 
alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace cms::alpakatools + +#endif // AlpakaCore_SimpleVector_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h new file mode 100644 index 0000000000000..c4025dd42d4a6 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h @@ -0,0 +1,114 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_VecArray_h +#define HeterogeneousCore_AlpakaInterface_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include + +#include + +namespace cms::alpakatools { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + 
m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T cont& back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + inline constexpr T &back() { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a kernel + template + ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + template + ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return 
m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_VecArray_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h new file mode 100644 index 0000000000000..5c768ab4a77ba --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h @@ -0,0 +1,36 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h +#define HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h + +#include +#include +#include + +#include + +// reimplementation of std algorithms able to compile with Alpaka, +// mostly by declaring them constexpr (until C++20, which will make it +// constexpr by default. 
TODO: drop when moving to C++20) + +namespace alpaka_std { + + template > + ALPAKA_FN_HOST_ACC constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) { + auto count = last - first; + + while (count > 0) { + auto it = first; + auto step = count / 2; + it += step; + if (!comp(value, *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + +} // namespace alpaka_std + +#endif // HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h new file mode 100644 index 0000000000000..79c0e1f6de995 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h @@ -0,0 +1,226 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_prefixScan_h +#define HeterogeneousCore_AlpakaInterface_interface_prefixScan_h + +#include + +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +namespace cms::alpakatools { + template >> + constexpr bool isPowerOf2(T v) { + // returns true iif v has only one bit set. + while (v) { + if (v & 1) + return !(v >> 1); + else + v >>= 1; + } + return false; + } + + template >> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(const TAcc& acc, int32_t laneId, T const* ci, T* co, uint32_t i) { + // ci and co may be the same + auto x = ci[i]; + CMS_UNROLL_LOOP + for (int32_t offset = 1; offset < alpaka::warp::getSize(acc); offset <<= 1) { + // Force the exact type for integer types otherwise the compiler will find the template resolution ambiguous. 
+ using dataType = std::conditional_t, T, std::int32_t>; + T y = alpaka::warp::shfl(acc, static_cast(x), laneId - offset); + if (laneId >= offset) + x += y; + } + co[i] = x; + } + + template >> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(const TAcc& acc, int32_t laneId, T* c, uint32_t i) { + warpPrefixScan(acc, laneId, c, c, i); + } + + // limited to warpSize² elements + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan( + const TAcc& acc, T const* ci, T* co, int32_t size, T* ws = nullptr) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + int32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); + int32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; + ALPAKA_ASSERT_OFFLOAD(isPowerOf2(warpSize)); + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(acc, laneId, ci, co, i); + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = co[i]; + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(acc, laneId, ws, blockThreadIdx); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + int32_t warpId = i / warpSize; + co[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); + } else { + co[0] = ci[0]; + for (int32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; + } + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc, + T* __restrict__ c, + int32_t size, + T* __restrict__ ws = nullptr) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + int32_t const 
blockDimension(alpaka::getWorkDiv(acc)[0u]); + int32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); + auto first = blockThreadIdx; + auto laneId = blockThreadIdx & (warpSize - 1); + + for (auto i = first; i < size; i += blockDimension) { + warpPrefixScan(acc, laneId, c, i); + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) + ws[warpId] = c[i]; + } + alpaka::syncBlockThreads(acc); + if (size <= warpSize) + return; + if (blockThreadIdx < warpSize) { + warpPrefixScan(acc, laneId, ws, blockThreadIdx); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + warpSize; i < size; i += blockDimension) { + auto warpId = i / warpSize; + c[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); + } else { + for (int32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; + } + } + + // in principle not limited.... + template + struct multiBlockPrefixScan { + template + ALPAKA_FN_ACC void operator()( + const TAcc& acc, T const* ci, T* co, uint32_t size, int32_t numBlocks, int32_t* pc, std::size_t warpSize) const { + // Get shared variable. The workspace is needed only for multi-threaded accelerators. 
+ T* ws = nullptr; + if constexpr (!requires_single_thread_per_block_v) { + ws = alpaka::getDynSharedMem(acc); + } + ALPAKA_ASSERT_OFFLOAD(warpSize == static_cast(alpaka::warp::getSize(acc))); + [[maybe_unused]] const auto elementsPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto elementsPerBlock = alpaka::getWorkDiv(acc)[0u]; + const auto threadsPerBlock = alpaka::getWorkDiv(acc)[0u]; + const auto blocksPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto blockIdx = alpaka::getIdx(acc)[0u]; + const auto threadIdx = alpaka::getIdx(acc)[0u]; + ALPAKA_ASSERT_OFFLOAD(elementsPerGrid >= size); + // first each block does a scan + [[maybe_unused]] int off = elementsPerBlock * blockIdx; + if (size - off > 0) { + blockPrefixScan(acc, ci + off, co + off, std::min(elementsPerBlock, size - off), ws); + } + + // count blocks that finished + auto& isLastBlockDone = alpaka::declareSharedVar(acc); + //__shared__ bool isLastBlockDone; + if (0 == threadIdx) { + alpaka::mem_fence(acc, alpaka::memory_scope::Device{}); + auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter + isLastBlockDone = (value == (int(blocksPerGrid) - 1)); + } + + alpaka::syncBlockThreads(acc); + + if (!isLastBlockDone) + return; + + ALPAKA_ASSERT_OFFLOAD(int(blocksPerGrid) == *pc); + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block except the last, which receives 0. + T* psum = nullptr; + if constexpr (!requires_single_thread_per_block_v) { + psum = ws + warpSize; + } else { + psum = alpaka::getDynSharedMem(acc); + } + for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) { + auto j = elementsPerBlock * i + elementsPerBlock - 1; + psum[i] = (j < size) ? co[j] : T(0); + } + alpaka::syncBlockThreads(acc); + blockPrefixScan(acc, psum, psum, blocksPerGrid, ws); + + // now it would have been handy to have the other blocks around... 
+ // Simplify the computation by having one version where threads per block = block size + // and a second for the one thread per block accelerator. + if constexpr (!requires_single_thread_per_block_v) { + // Here threadsPerBlock == elementsPerBlock + for (uint32_t i = threadIdx + threadsPerBlock, k = 0; i < size; i += threadsPerBlock, ++k) { + co[i] += psum[k]; + } + } else { + // We are single threaded here, adding partial sums starting with the 2nd block. + for (uint32_t i = elementsPerBlock; i < size; i++) { + co[i] += psum[i / elementsPerBlock - 1]; + } + } + } + }; +} // namespace cms::alpakatools + +// declare the amount of block shared memory used by the multiBlockPrefixScan kernel +namespace alpaka::trait { + // Variable size shared mem + template + struct BlockSharedMemDynSizeBytes, TAcc> { + template + ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes( + cms::alpakatools::multiBlockPrefixScan const& /* kernel */, + TVec const& /* blockThreadExtent */, + TVec const& /* threadElemExtent */, + T const* /* ci */, + T const* /* co */, + int32_t /* size */, + int32_t numBlocks, + int32_t const* /* pc */, + // This trait function does not receive the accelerator object to look up the warp size + std::size_t warpSize) { + // We need workspace (T[warpsize]) + partial sums (T[numblocks]). 
+ if constexpr (cms::alpakatools::requires_single_thread_per_block_v) { + return sizeof(T) * numBlocks; + } else { + return sizeof(T) * (warpSize + numBlocks); + } + } + }; + +} // namespace alpaka::trait + +#endif // HeterogeneousCore_AlpakaInterface_interface_prefixScan_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h new file mode 100644 index 0000000000000..893132905f934 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -0,0 +1,427 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h +#define HeterogeneousCore_AlpakaInterface_interface_radixSort_h + +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +namespace cms::alpakatools { + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for(auto idx: elements_with_stride(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + } + + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... 
+ + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 0 : size; + alpaka::syncBlockThreads(acc); + + // find first negative + for(auto idx: elements_with_stride(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + firstNeg = idx + 1; + } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + alpaka::syncBlockThreads(acc); + + for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + } + + + // Radix sort implements a bytewise lexicographic order on the input data. + // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first + // and respecting the existing order when binning the values, we achieve the lexicographic ordering. + // The number of bytes actually considered is a parameter template parameter. + // The post processing reorder + // function fixes the order when bitwise ordering is not the order for the underlying type (only based on + // most significant bit for signed types, integer or floating point). + // The floating point numbers are reinterpret_cast into integers in the calling wrapper + // This algorithm requires to run in a single block + template // The post processing reorder function. 
+ ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl( + const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + if constexpr (!requires_single_thread_per_block_v) { + const auto warpSize = alpaka::warp::getSize(acc); + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + // we expect a power of 2 here + assert(warpSize && (0 == (warpSize & (warpSize - 1)))); + const std::size_t warpMask = warpSize - 1; + + // Define the bin size (d=8 => 1 byte bin). + constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits; + // Make sure the slices are data aligned + static_assert (0 == dataBits % binBits); + // Make sure the NS parameter makes sense + static_assert (NS > 0 && NS <= sizeof(T)); + constexpr int binsNumber = 1 << binBits; + constexpr int binsMask = binsNumber - 1; + // Prefix scan iterations. NS is counted in full bytes and not slices. + constexpr int initialSortingPass = int(sizeof(T)) - NS; + + // Count/index for the prefix scan + // TODO: rename + auto& c = alpaka::declareSharedVar(acc); + // Temporary storage for prefix scan. Only really needed for first-of-warp keeping + // Then used for thread to bin mapping TODO: change type to byte and remap to + auto& ct = alpaka::declareSharedVar(acc); + // Bin to thread index mapping (used to store the highest thread index within a bin number + // batch of threads. + // TODO: currently initialized to an invalid value, but could also be initialized to the + // lowest possible value (change to bytes?) + auto& cu = alpaka::declareSharedVar(acc); + // TODO we could also have an explicit caching of the current index for each thread. + + // TODO: do those have to be shared? + auto& ibs = alpaka::declareSharedVar(acc); + auto& currentSortingPass = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(size > 0); + // TODO: is this a hard requirement? 
+ ALPAKA_ASSERT_OFFLOAD(blockDimension >= binsNumber); + + currentSortingPass = initialSortingPass; + + auto j = ind; + auto k = ind2; + + // Initializer index order to trivial increment. + for (auto idx: elements_with_stride(acc, size)) { j[idx] = idx; } + alpaka::syncBlockThreads(acc); + + // Iterate on the slices of the data. + while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { + for (auto idx: elements_with_stride(acc, binsNumber)) { c[idx] = 0; } + alpaka::syncBlockThreads(acc); + const auto sortingPassShift = binBits * currentSortingPass; + + // fill bins (count elements in each bin) + for (auto idx: elements_with_stride(acc, size)) { + auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; + alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); + } + alpaka::syncBlockThreads(acc); + + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d ", currentSortingPass - 1); + size_t total = 0; + for(int i=0; i<(int)binsNumber; i++) { + printf("count[%d]=%d ", i, c[i] ); + total += c[i]; + } + printf("total=%zu\n", total); + assert(total == size); + } + // prefix scan "optimized"???... 
+ // TODO: we might be able to reuse the warpPrefixScan function + // Warp level prefix scan + for (auto idx: elements_with_stride(acc, binsNumber)) { + auto x = c[idx]; + auto laneId = idx & warpMask; + + for (int offset = 1; offset < warpSize; offset <<= 1) { + auto y = alpaka::warp::shfl(acc, x, laneId - offset); + if (laneId >= (uint32_t)offset) + x += y; + } + ct[idx] = x; + } + alpaka::syncBlockThreads(acc); + + // Block level completion of prefix scan (add last sum of each preceding warp) + for (auto idx: elements_with_stride(acc, binsNumber)) { + auto ss = (idx / warpSize) * warpSize - 1; + c[idx] = ct[idx]; + for (int i = ss; i > 0; i -= warpSize) + c[idx] += ct[i]; + } + // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin + + /* + //prefix scan for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast: we will fill the new index array downward, from offset c[bin], with one thread per + // bin, working on one set of bin size elements at a time. + // This will reorder the indices by the currently considered slice, otherwise preserving the previous order. 
+ ibs = size - 1; + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d (post prefix scan) ", currentSortingPass - 1); + for(int i=0; i<(int)binsNumber; i++) { + printf("offset[%d]=%d ", i, c[i] ); + } + printf("\n"); + } + + // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations + while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { + // Init + for (auto idx: elements_with_stride(acc, binsNumber)) { + cu[idx] = -1; + ct[idx] = -1; + } + alpaka::syncBlockThreads(acc); + + // Find the highest index for all the threads dealing with a given bin (in cu[]) + // Also record the bin for each thread (in ct[]) + for (auto idx: elements_with_stride(acc, binsNumber)) { + int i = ibs - idx; + int32_t bin = -1; + if (i >= 0) { + bin = (a[j[i]] >> sortingPassShift) & binsMask; + ct[idx] = bin; + alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{}); + } + } + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { + printf("Pass=%d (max index) ", currentSortingPass - 1); + for(int i=0; i<(int)binsNumber; i++) { + printf("max_i[%d]=%d ", i, cu[i] ); + } + printf("\n"); + } + + + // FIXME: we can slash a memory access. + for (auto idx: elements_with_stride(acc, binsNumber)) { + int i = ibs - idx; + // Are we still in inside the data? + if (i >= 0) { + int32_t bin = ct[idx]; + // Are we the thread with the highest index (from previous pass)? + if (cu[bin] == i) { + // With the highest index, we are actually the lowest thread number. We will + // work "on behalf of" the higher thread numbers (including ourselves) + // No way around scanning and testing for bin in ct[otherThread] number to find the other threads + for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) { + if (ct[peerThreadIdx] == bin) { + k[--c[bin]] = j[ibs - peerThreadIdx]; + } + } + } + } + /* + int32_t bin = (i >= 0 ? 
((a[j[i]] >> sortingPassShift) & binsMask) : -1); + if (i >= 0 && i == cu[bin]) // ensure to keep them in order: only one thread per bin is active, rest is idle. + // + for (int ii = idx; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - idx; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; // i = ibs - idx, oi = ii - idx => i - oi = ibs - ii; + } + */ + } + alpaka::syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + ibs -= binsNumber; + // https://github.com/cms-patatrack/pixeltrack-standalone/pull/210 + // TODO: is this really needed? + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + } + alpaka::syncBlockThreads(acc); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdxLocal==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) + ++currentSortingPass; + alpaka::syncBlockThreads(acc); + if (!threadIdxLocal && size == 257) { + printf("Pass=%d ", currentSortingPass - 1); + for(int i=0; i<(int)size; i++) { + printf("k[%d]=%d ", i, k[i] ); + } + printf("\n"); + } + } + + if ((dataBits != 8) && (0 == (NS & 1))) + ALPAKA_ASSERT_OFFLOAD(j == ind); // dataBits/binBits is even so ind is correct (the result is in the right location) + + // TODO this copy is (doubly?) redundant with the reorder + if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) + for (auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }; + + alpaka::syncBlockThreads(acc); + + // now move negative first... (if signed) + // TODO: the ind2 => ind copy should have beed deferred. 
We should pass (j != ind) as an extra parameter + reorder(acc, a, ind, ind2, size); + } else { + //static_assert(false); + } + } + + template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, a, ind, ind2, size, dummyReorder); + } + + template ::value && std::is_signed::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, a, ind, ind2, size, reorderSigned); + } + + template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); + using I = int; + if (!alpaka::getIdx(acc)[0]) { + printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); + } + radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); + } + + template , T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( + const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + static_assert(requires_single_thread_per_block_v, "CPU sort (not a radixSort) called wtth wrong accelerator"); + // Initialize the index array + for (std::size_t i = 0; i < size; i++) ind[i] = i; + printf("std::sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); + for (uint32_t i=0; i<10 && i + ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
radixSortMulti(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { + // TODO: check + // Sort multiple blocks of data in v[] separated by in chunks located at offsets[] + // extern __shared__ uint16_t ws[]; + uint16_t * ws = alpaka::getDynSharedMem(acc); + + const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); + auto a = v + offsets[blockIdx]; + auto ind = index + offsets[blockIdx]; + auto ind2 = nullptr == workspace ? ws : workspace + offsets[blockIdx]; + auto size = offsets[blockIdx + 1] - offsets[blockIdx]; + assert(offsets[blockIdx + 1] >= offsets[blockIdx]); + if (0 == alpaka::getIdx(acc)[0]) { + printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", + blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]); + } + if (size > 0) + radixSort(acc, a, ind, ind2, size); + } + + template + struct radixSortMultiWrapper { +/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical +#if defined(__CUDACC__) || defined(__HIPCC__) + //__global__ void __launch_bounds__(256, 4) +#endif +*/ + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + radixSortMulti(acc, v, index, offsets, workspace); + } + }; + + template + struct radixSortMultiWrapper2 { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + radixSortMulti(acc, v, index, offsets, workspace); + } + }; +} // namespace cms::alpakatools + +#endif // HeterogeneousCore_AlpakaInterface_interface_radixSort_h diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 2d204819d740b..463bf5dae4461 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -23,3 +23,60 @@ + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc new file mode 100644 index 0000000000000..f4073997f5c97 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -0,0 +1,106 @@ +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair) "]"; + +struct update { + template + ALPAKA_FN_ACC void operator()( + const TAcc &acc, AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->inc_add(acc, m); + assert(c.first < n); + ind[c.first] = c.second; + for (uint32_t j = c.second; j < c.second + m; ++j) + cont[j] = i; + } + } +}; + +struct finalize { + template + ALPAKA_FN_ACC void operator()( + const TAcc &acc, AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { + assert(dc->get().first == n); + ind[n] = dc->get().second; + } +}; + +TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) { + SECTION("AtomicPairCounter") { + auto const &devices = cms::alpakatools::devices(); + REQUIRE(!devices.empty()); + // run the test on each device + for (auto const &device : devices) { + std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + auto c_d = make_device_buffer(queue); + 
alpaka::memset(queue, c_d, 0); + + std::cout << "- size " << sizeof(AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + auto n_d = make_device_buffer(queue, N); + auto m_d = make_device_buffer(queue, M); + + constexpr uint32_t NUM_VALUES = 10000; + + // Update + const auto blocksPerGrid = 2000u; + const auto threadsPerBlockOrElementsPerThread = 512u; + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::enqueue( + queue, alpaka::createTaskKernel(workDiv, update(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES)); + + // Finalize + const auto blocksPerGridFinalize = 1u; + const auto threadsPerBlockOrElementsPerThreadFinalize = 1u; + const auto workDivFinalize = + make_workdiv(blocksPerGridFinalize, threadsPerBlockOrElementsPerThreadFinalize); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workDivFinalize, finalize(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES)); + + auto c_h = make_host_buffer(queue); + auto n_h = make_host_buffer(queue, N); + auto m_h = make_host_buffer(queue, M); + + // copy the results from the device to the host + alpaka::memcpy(queue, c_h, c_d); + alpaka::memcpy(queue, n_h, n_d); + alpaka::memcpy(queue, m_h, m_d); + + // wait for all the operations to complete + alpaka::wait(queue); + + REQUIRE(c_h.data()->get().first == NUM_VALUES); + REQUIRE(n_h[NUM_VALUES] == c_h.data()->get().second); + REQUIRE(n_h[0] == 0); + + for (size_t i = 0; i < NUM_VALUES; ++i) { + auto ib = n_h.data()[i]; + auto ie = n_h.data()[i + 1]; + auto k = m_h.data()[ib++]; + REQUIRE(k < NUM_VALUES); + + for (; ib < ie; ++ib) + REQUIRE(m_h.data()[ib] == k); + } + } + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc new file mode 100644 index 0000000000000..57b80cc9cf275 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc 
@@ -0,0 +1,255 @@ +#include +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer) "]"; + +template +void checkContents(Hist* h, + unsigned int N, + VERIFY& verify, + INCR& incr, + T window, + uint32_t nParts, + const T* v, + const uint32_t* offsets) { + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h->size(ii)) + continue; + auto k = *h->begin(ii); + if (j % 2) + k = *(h->begin(ii) + (h->end(ii) - h->begin(ii)) / 2); +#ifndef NDEBUG + [[maybe_unused]] auto bk = h->bin(v[k]); +#endif + ALPAKA_ASSERT_OFFLOAD(bk == i); + ALPAKA_ASSERT_OFFLOAD(k < offsets[j + 1]); + auto kl = h->bin(v[k] - window); + auto kh = h->bin(v[k] + window); + ALPAKA_ASSERT_OFFLOAD(kl != i); + ALPAKA_ASSERT_OFFLOAD(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h->size(kk + off); + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' 
<< i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + ALPAKA_ASSERT_OFFLOAD(!l); + } + } + int status; + auto* demangled = abi::__cxa_demangle(typeid(Hist).name(), NULL, NULL, &status); + status || printf("Check contents OK with %s\n", demangled); + std::free(demangled); +} + +template +int go(const DevHost& host, const Device& device, Queue& queue) { + std::mt19937 eng(2708); + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr unsigned int N = 12000; + auto v = make_host_buffer(queue, N); + auto v_d = make_device_buffer(queue, N); + alpaka::memcpy(queue, v_d, v); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + + using Hist = HistoContainer; + using HistR = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist{}.capacity() << ' ' << offsetof(Hist, content) - offsetof(Hist, off) << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + std::cout << "HistoContainer Runtime sized " << (int)(offsetof(HistR, off)) << ' ' << HistR::nbins() << ' ' + << HistR::totbins() << ' ' << HistR{}.capacity() << ' ' << offsetof(HistR, content) - offsetof(HistR, off) + << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / HistR::nbins() << std::endl; + + // Offsets for each histogram. 
+ auto offsets = make_host_buffer(queue, nParts + 1); + auto offsets_d = make_device_buffer(queue, nParts + 1); + + // Compile sized histogram (self contained) + auto h = make_host_buffer(queue); + auto h_d = make_device_buffer(queue); + + // Run time sized histogram + auto hr = make_host_buffer(queue); + // Data storage for histogram content (host) + auto hd = make_host_buffer(queue, N); + auto hr_d = make_device_buffer(queue); + // Data storage for histogram content (device) + auto hd_d = make_device_buffer(queue, N); + + // We iterate the test 5 times. + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + ALPAKA_ASSERT_OFFLOAD(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + alpaka::memcpy(queue, offsets_d, offsets); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + // for(unsigned int i=0;i" << v[i] << std::endl; + // } + alpaka::memcpy(queue, v_d, v); + + alpaka::memset(queue, h_d, 0); + alpaka::memset(queue, hr_d, 0); + alpaka::memset(queue, hd_d, 0); + + alpaka::wait(queue); + + std::cout << "Calling fillManyFromVector - " << h->size() << std::endl; + fillManyFromVector(h_d.data(), nParts, v_d.data(), offsets_d.data(), offsets[10], 256, queue); + + std::cout << "Calling fillManyFromVector(runtime sized) - " << h->size() << std::endl; + typename HistR::View hrv_d; + hrv_d.assoc = hr_d.data(); + hrv_d.offSize = -1; + hrv_d.offStorage = nullptr; + hrv_d.contentSize = N; + hrv_d.contentStorage = hd_d.data(); + fillManyFromVector(hr_d.data(), hrv_d, nParts, v_d.data(), offsets_d.data(), offsets[10], 256, queue); + + alpaka::memcpy(queue, h, h_d); + // For the runtime sized version: + // We need the histogram for non external data (here, the offsets) + // .. and external data storage (here, the contents) + // .. and plug the data storage address into the histo container + alpaka::memcpy(queue, hr, hr_d); + alpaka::memcpy(queue, hd, hd_d); + + // std::cout << "Calling fillManyFromVector - " << h->size() << std::endl; + alpaka::wait(queue); + + // We cannot update the contents address of the histo container before the copy from device happened + typename HistR::View hrv; + hrv.assoc = hr.data(); + hrv.offSize = -1; + hrv.offStorage = nullptr; + hrv.contentSize = N; + hrv.contentStorage = hd.data(); + hr->initStorage(hrv); + + std::cout << "Copied results" << std::endl; + // for(int i =0;i<=10;i++) + // { + // std::cout << offsets[i] <<" - "<< h->size() << std::endl; + // } + + ALPAKA_ASSERT_OFFLOAD(0 == h->off[0]); + ALPAKA_ASSERT_OFFLOAD(offsets[10] == h->size()); + ALPAKA_ASSERT_OFFLOAD(0 == hr->off[0]); + ALPAKA_ASSERT_OFFLOAD(offsets[10] == hr->size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + ALPAKA_ASSERT_OFFLOAD(t1 < N); + ALPAKA_ASSERT_OFFLOAD(t2 < N); + if (T(v[t1] - 
v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + checkContents(h.data(), N, verify, incr, window, nParts, v.data(), offsets.data()); + checkContents(hr.data(), N, verify, incr, window, nParts, v.data(), offsets.data()); + } + + return 0; +} + +TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer), s_tag) { + SECTION("HistoContainerKernel") { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + auto const& host = cms::alpakatools::host(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return; + } + // run the test on each device + for (auto const& device : devices) { + std::cout << "Test Histo Container on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + + REQUIRE(go(host, device, queue) == 0); + REQUIRE(go(host, device, queue) == 0); + } + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc new file mode 100644 index 0000000000000..5e08c05725888 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -0,0 +1,187 @@ +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct mykernel { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, 
T const* __restrict__ v, uint32_t N) const { + ALPAKA_ASSERT_OFFLOAD(v); + ALPAKA_ASSERT_OFFLOAD(N == 12000); + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) { + printf("start kernel for %d data\n", N); + } + + using Hist = HistoContainer; + + auto& hist = alpaka::declareSharedVar(acc); + auto& ws = alpaka::declareSharedVar(acc); + + // set off zero + for (auto j : elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + // set bins zero + for (auto j : elements_with_stride(acc, Hist::totbins())) { + hist.content[j] = 0; + } + alpaka::syncBlockThreads(acc); + + // count + for (auto j : elements_with_stride(acc, N)) { + hist.count(acc, v[j]); + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(0 == hist.size()); + alpaka::syncBlockThreads(acc); + + // finalize + hist.finalize(acc, ws); + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(N == hist.size()); + + // verify + for ([[maybe_unused]] auto j : elements_with_stride(acc, Hist::nbins())) { + ALPAKA_ASSERT_OFFLOAD(hist.off[j] <= hist.off[j + 1]); + } + alpaka::syncBlockThreads(acc); + + for (auto j : elements_with_stride(acc, 32)) { + ws[j] = 0; // used by prefix scan... 
+ } + alpaka::syncBlockThreads(acc); + + // fill + for (auto j : elements_with_stride(acc, N)) { + hist.fill(acc, v[j], j); + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(0 == hist.off[0]); + ALPAKA_ASSERT_OFFLOAD(N == hist.size()); + + // bin +#ifndef NDEBUG + for (auto j : elements_with_stride(acc, hist.size() - 1)) { + auto p = hist.begin() + j; + ALPAKA_ASSERT_OFFLOAD((*p) < N); + [[maybe_unused]] auto k1 = Hist::bin(v[*p]); + [[maybe_unused]] auto k2 = Hist::bin(v[*(p + 1)]); + ALPAKA_ASSERT_OFFLOAD(k2 >= k1); + } +#endif + + // forEachInWindow + for (auto i : elements_with_stride(acc, hist.size())) { + auto p = hist.begin() + i; + auto j = *p; +#ifndef NDEBUG + auto b0 = Hist::bin(v[j]); +#endif + int tot = 0; + auto ftest = [&](unsigned int k) { + ALPAKA_ASSERT_OFFLOAD(k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); +#ifndef NDEBUG + [[maybe_unused]] int rtot = hist.size(b0); + ALPAKA_ASSERT_OFFLOAD(tot == rtot); +#endif + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + ALPAKA_ASSERT_OFFLOAD(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); +#ifndef NDEBUG + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + ALPAKA_ASSERT_OFFLOAD(tot == rtot); +#endif + } + } +}; + +template +void go(const DevHost& host, const Device& device, Queue& queue) { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + constexpr unsigned int N = 12000; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist{}.capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + auto v = make_host_buffer(queue, N); + auto v_d = make_device_buffer(queue, N); + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + alpaka::memcpy(queue, v_d, v); + + const auto threadsPerBlockOrElementsPerThread = 256u; + const auto blocksPerGrid = 1u; + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, mykernel(), v_d.data(), N)); + } + alpaka::wait(queue); +} + +int main() { + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + auto const& host = cms::alpakatools::host(); + + // run the test on each device + for (auto const& device : devices) { + 
std::cout << "Test One Histo Container on " << alpaka::getName(device) << '\n'; + + auto queue = Queue(device); + + go(host, device, queue); + go(host, device, queue); + go(host, device, queue); + } + + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc new file mode 100644 index 0000000000000..c781f83ff1225 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -0,0 +1,237 @@ +// #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +#include +#include +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +using FLOAT = double; + +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +ALPAKA_FN_HOST_ACC T truncate(T const& t) { + const int shift = 8 * (sizeof(T) - N); + union { + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + return c.t; +} + +namespace { + struct testKernel { + template + ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int 
elements, bool doPrint) const { + //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the track index + //size_t gridSize = blockDim.x * gridDim.x; + bool threadZero = !alpaka::getIdx(acc)[0u]; + + // radix sort works in a single block (and the assert macro does not like the comma of the template parameters). + const auto blocksPerGrid = alpaka::getWorkDiv(acc)[0u]; + const auto blocksIdx = alpaka::getIdx(acc)[0u]; + assert(1 == blocksPerGrid); + assert(0 == blocksIdx); + assert(elements <= 2048); + + auto &order = alpaka::declareSharedVar(acc); + auto &sws = alpaka::declareSharedVar(acc); + auto &z = alpaka::declareSharedVar(acc); + auto &iz = alpaka::declareSharedVar(acc); +// __shared__ uint16_t order[2048]; +// __shared__ uint16_t sws[2048]; +// __shared__ float z[2048]; +// __shared__ int iz[2048]; + for (auto itrack: elements_with_stride(acc, elements)) { + z[itrack] = gpu_input[itrack]; + iz[itrack] = 10000 * gpu_input[itrack]; + // order[itrack] = itrack; + } + alpaka::syncBlockThreads(acc); + radixSort(acc, z, order, sws, elements); + alpaka::syncBlockThreads(acc); + + //verify + for (auto itrack: elements_with_stride(acc, elements - 1)) { + auto ntrack = order[itrack]; + auto mtrack = order[itrack + 1]; + assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); + } + + alpaka::syncBlockThreads(acc); + + if (doPrint) + if (threadZero) { + for (int itrackO = 0; itrackO < elements; itrackO++) { + int itrack = order[itrackO]; + printf( + "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to " + "radixSort %f\n", + elements, + itrackO, + itrack, + gpu_input[itrack], + z[itrack]); + gpu_product[itrackO] = itrack; + } + } + + alpaka::syncBlockThreads(acc); + radixSort(acc, iz, order, sws, elements); + alpaka::syncBlockThreads(acc); + + for (auto itrack: elements_with_stride(acc, elements - 1)) { + auto ntrack = order[itrack]; + auto mtrack = order[itrack + 1]; + assert(iz[ntrack] 
<= iz[mtrack]); + } + + if (doPrint) + if (threadZero) { + for (int itrackO = 0; itrackO < elements; itrackO++) { + int itrack = order[itrackO]; + printf( + "Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to " + "radixSort %d\n", + elements, + itrackO, + itrack, + gpu_input[itrack], + iz[itrack]); + gpu_product[itrackO] = itrack; + } + } + } + }; + + void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { + auto blockSize = 512; // somewhat arbitrary + auto gridSize = 1; // round up to cover the sample size + const auto workdiv = make_workdiv(gridSize, blockSize); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); + alpaka::wait(queue); + } +} // namespace + +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); +// FLOAT* gpu_input; +// int* gpu_product; + + int nmax = 4 * 260; + auto gpu_input_h = cms::alpakatools::make_host_buffer(queue, nmax); + auto i = gpu_input_h.data(); + for (auto v: { + 30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, + 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, + 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, + 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, + 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 
2.9625515938, 2.9641008377, 2.9646151066, + 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, + 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, + 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, + 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, + 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, + 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, + 3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, + 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, + 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, + 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, + 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, + 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, + 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, + 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, + 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, + 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, + 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, + 4.6910686493, 4.6925001144, 
4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, + 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, + 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, + 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, + 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, + 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, + 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427, + 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, + 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, + 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, + 4.7723139628, 4.774812736, 4.8577151299, 4.890037536}) { + *(i++) = v; + } + auto input = gpu_input_h.data(); + for (int i = 0; i < 260; i++) { + input[i + 260] = -input[i]; + input[i + 2 * 260] = input[i] + 10; + input[i + 3 * 260] = -input[i] - 10; + } + auto gpu_input_d = cms::alpakatools::make_device_buffer(queue, nmax); + //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax)); +// cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); + auto gpu_product_d = cms::alpakatools::make_device_buffer(queue, nmax); + // copy the input data to the GPU + alpaka::memcpy(queue, gpu_input_d, gpu_input_h); + //cudaCheck(cudaMemcpy(gpu_input, input, sizeof(FLOAT) * nmax, cudaMemcpyHostToDevice)); + + for (int k = 2; k <= nmax; k++) { + std::random_shuffle(input, input + k); + printf("Test with %d items\n", k); + // sort on the GPU + testWrapper(queue, gpu_input_d.data(), gpu_product_d.data(), 
k, false); + alpaka::wait(queue); + } + } + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc new file mode 100644 index 0000000000000..d1de1f1c17cca --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc @@ -0,0 +1,327 @@ +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +using AssocRandomAccess = OneToManyAssocRandomAccess; +using AssocSequential = OneToManyAssocSequential; +using SmallAssoc = OneToManyAssocSequential; +using Multiplicity = OneToManyAssocRandomAccess; +using TK = std::array; + +namespace { + template + ALPAKA_FN_HOST_ACC typename std::make_signed::type toSigned(T v) { + return static_cast::type>(v); + } +} // namespace + +struct countMultiLocal { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + auto& local = alpaka::declareSharedVar(acc); + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0); + if (oncePerSharedMemoryAccess) { + local.zero(); + } + alpaka::syncBlockThreads(acc); + local.count(acc, 2 + i % 4); + alpaka::syncBlockThreads(acc); + if (oncePerSharedMemoryAccess) { + assoc->add(acc, local); + } + } + } +}; + +struct countMulti { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + 
TK const* __restrict__ tk, + Multiplicity* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, n)) { + assoc->count(acc, 2 + i % 4); + } + } +}; + +struct verifyMulti { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { + for ([[maybe_unused]] auto i : elements_with_stride(acc, Multiplicity{}.totOnes())) { + ALPAKA_ASSERT_OFFLOAD(m1->off[i] == m2->off[i]); + } + } +}; + +struct count { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + AssocRandomAccess* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, 4 * n)) { + auto k = i / 4; + auto j = i - 4 * k; + ALPAKA_ASSERT_OFFLOAD(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->count(acc, tk[k][j]); + } + } + } +}; + +struct fill { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + TK const* __restrict__ tk, + AssocRandomAccess* __restrict__ assoc, + uint32_t n) const { + for (auto i : elements_with_stride(acc, 4 * n)) { + auto k = i / 4; + auto j = i - 4 * k; + ALPAKA_ASSERT_OFFLOAD(j < 4); + if (k >= n) { + return; + } + if (tk[k][j] < MaxElem) { + assoc->fill(acc, tk[k][j], k); + } + } + } +}; + +struct verify { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc* __restrict__ assoc) const { + ALPAKA_ASSERT_OFFLOAD(assoc->size() < Assoc{}.capacity()); + } +}; + +struct fillBulk { + template + ALPAKA_FN_ACC void operator()( + const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { + for (auto k : elements_with_stride(acc, n)) { + auto m = tk[k][3] < MaxElem ? 
4 : 3; + assoc->bulkFill(acc, *apc, &tk[k][0], m); + } + } +}; + +struct verifyBulk { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const { + if (::toSigned(apc->get().first) >= Assoc::ctNOnes()) { + printf("Overflow %d %d\n", apc->get().first, Assoc::ctNOnes()); + } + ALPAKA_ASSERT_OFFLOAD(toSigned(assoc->size()) < Assoc::ctCapacity()); + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); + + std::cout << "OneToManyAssocRandomAccess " << sizeof(AssocRandomAccess) << " Ones=" << AssocRandomAccess{}.totOnes() + << " Capacity=" << AssocRandomAccess{}.capacity() << std::endl; + std::cout << "OneToManyAssocSequential " << sizeof(AssocSequential) << " Ones=" << AssocSequential{}.totOnes() + << " Capacity=" << AssocSequential{}.capacity() << std::endl; + std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << " Ones=" << SmallAssoc{}.totOnes() + << " Capacity=" << SmallAssoc{}.capacity() << std::endl; + + std::mt19937 eng; + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + auto tr = make_host_buffer[]>(queue, N); + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += x + 1; + imax = std::max(imax, x); + for (; j < k; 
++j) + tr[j][i] = n; + ++n; + } + ++z; + } + ALPAKA_ASSERT_OFFLOAD(n <= MaxElem); + ALPAKA_ASSERT_OFFLOAD(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + + auto v_d = make_device_buffer[]>(queue, N); + alpaka::memcpy(queue, v_d, tr); + + auto ara_d = make_device_buffer(queue); + alpaka::memset(queue, ara_d, 0); + + const auto threadsPerBlockOrElementsPerThread = 256u; + const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread); + const auto workDiv4N = make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); + + AssocRandomAccess::template launchZero(ara_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, count(), v_d.data(), ara_d.data(), N)); + + AssocRandomAccess::template launchFinalize(ara_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verify(), ara_d.data())); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, fill(), v_d.data(), ara_d.data(), N)); + + auto ara_h = make_host_buffer(queue); + alpaka::memcpy(queue, ara_h, ara_d); + alpaka::wait(queue); + + std::cout << ara_h->size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = ara_h->size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + ALPAKA_ASSERT_OFFLOAD(0 == ara_h->size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) 
+ auto dc_d = make_device_buffer(queue); + alpaka::memset(queue, dc_d, 0); + + const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread); + const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + auto as_d = make_device_buffer(queue); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), as_d.data(), N)); + + alpaka::enqueue( + queue, alpaka::createTaskKernel(workDiv, AssocSequential::finalizeBulk(), dc_d.data(), as_d.data())); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), as_d.data(), dc_d.data())); + + auto as_h = make_host_buffer(queue); + alpaka::memcpy(queue, as_h, as_d); + + auto dc_h = make_host_buffer(queue); + alpaka::memcpy(queue, dc_h, dc_d); + alpaka::wait(queue); + + alpaka::memset(queue, dc_d, 0); + auto sa_d = make_device_buffer(queue); + alpaka::memset(queue, sa_d, 0); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N)); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, SmallAssoc::finalizeBulk(), dc_d.data(), sa_d.data())); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data())); + + std::cout << "final counter value " << dc_h->get().second << ' ' << dc_h->get().first << std::endl; + + std::cout << as_h->size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = as_h->size(i); + if (!(x == 4 || x == 3)) { + std::cout << "i=" << i << " x=" << x << std::endl; + } + ALPAKA_ASSERT_OFFLOAD(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + ALPAKA_ASSERT_OFFLOAD(0 == as_h->size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters + auto m1_d = make_device_buffer(queue); + alpaka::memset(queue, m1_d, 0); + auto m2_d = 
make_device_buffer(queue); + alpaka::memset(queue, m2_d, 0); + + Multiplicity::template launchZero(m1_d.data(), queue); + Multiplicity::template launchZero(m2_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N)); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N)); + + const auto blocksPerGridTotBins = 1u; + const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::ctNOnes(); + const auto workDivTotBins = make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data())); + + Multiplicity::launchFinalize(m1_d.data(), queue); + Multiplicity::launchFinalize(m2_d.data(), queue); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data())); + + alpaka::wait(queue); + + return 0; + } +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc new file mode 100644 index 0000000000000..bffee8f1f533d --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +// static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestPrefixScan) "]"; + +template +struct format_traits { +public: + static const constexpr char* failed_msg = "failed(int) size=%d, i=%d, blockDimension=%d: c[i]=%d c[i-1]=%d\n"; +}; + +template <> +struct 
format_traits { +public: + static const constexpr char* failed_msg = "failed(float size=%d, i=%d, blockDimension=%d: c[i]=%f c[i-1]=%f\n"; +}; + +template +struct testPrefixScan { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const { + auto& ws = alpaka::declareSharedVar(acc); + auto& c = alpaka::declareSharedVar(acc); + auto& co = alpaka::declareSharedVar(acc); + + for (auto i : elements_with_stride(acc, size)) { + c[i] = 1; + }; + + alpaka::syncBlockThreads(acc); + + blockPrefixScan(acc, c, co, size, ws); + blockPrefixScan(acc, c, size, ws); + + ALPAKA_ASSERT_OFFLOAD(1 == c[0]); + ALPAKA_ASSERT_OFFLOAD(1 == co[0]); + + // TODO: not needed? Not in multi kernel version, not in CUDA version + alpaka::syncBlockThreads(acc); + + for (auto i : elements_with_stride(acc, size)) { + if (0 == i) + continue; + if constexpr (!std::is_floating_point_v) { + if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i]))) + printf("c[%d]=%d, co[%d]=%d\n", i, c[i], i, co[i]); + } else { + if (!((c[i] == c[i - 1] + 1) && (c[i] == i + 1) && (c[i] == co[i]))) + printf("c[%d]=%f, co[%d]=%f\n", i, c[i], i, co[i]); + } + ALPAKA_ASSERT_OFFLOAD(c[i] == c[i - 1] + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == i + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == co[i]); + } + } +}; + +/* + * NB: GPU-only, so do not care about elements here. 
+ */ +template +struct testWarpPrefixScan { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const { + if constexpr (!requires_single_thread_per_block_v) { + ALPAKA_ASSERT_OFFLOAD(size <= 32); + auto& c = alpaka::declareSharedVar(acc); + auto& co = alpaka::declareSharedVar(acc); + + uint32_t const blockDimension = alpaka::getWorkDiv(acc)[0u]; + uint32_t const blockThreadIdx = alpaka::getIdx(acc)[0u]; + auto i = blockThreadIdx; + c[i] = 1; + alpaka::syncBlockThreads(acc); + auto laneId = blockThreadIdx & 0x1f; + + warpPrefixScan(acc, laneId, c, co, i); + warpPrefixScan(acc, laneId, c, i); + + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(1 == c[0]); + ALPAKA_ASSERT_OFFLOAD(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDimension, c[i], c[i - 1]); + ALPAKA_ASSERT_OFFLOAD(c[i] == c[i - 1] + 1); + ALPAKA_ASSERT_OFFLOAD(c[i] == static_cast(i + 1)); + ALPAKA_ASSERT_OFFLOAD(c[i] == co[i]); + } + } else { + // We should never be called outside of the GPU.
+ ALPAKA_ASSERT_OFFLOAD(false); + } + } +}; + +struct init { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t* v, uint32_t val, uint32_t n) const { + for (auto index : elements_with_stride(acc, n)) { + v[index] = val; + + if (index == 0) + printf("init\n"); + } + } +}; + +struct verify { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t const* v, uint32_t n) const { + for (auto index : elements_with_stride(acc, n)) { + ALPAKA_ASSERT_OFFLOAD(v[index] == index + 1); + + if (index == 0) + printf("verify\n"); + } + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + // auto const& host = cms::alpakatools::host(); + + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n'; + auto queue = Queue(device); + const auto warpSize = alpaka::getWarpSizes(device)[0]; + // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY) + if constexpr (!requires_single_thread_per_block_v) { + std::cout << "warp level" << std::endl; + + const auto threadsPerBlockOrElementsPerThread = 32; + const auto blocksPerGrid = 1; + const auto workDivWarp = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 16)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 5)); + } + + // PORTABLE BLOCK PREFIXSCAN + std::cout << "block level" << std::endl; + + // Running kernel with 1 block, and bs threads per block or elements per thread. + // NB: obviously for tests only, for perf would need to use bs = 1024 in GPU version. 
+ for (int bs = 32; bs <= 1024; bs += 32) { + const auto blocksPerGrid2 = 1; + const auto workDivSingleBlock = make_workdiv(blocksPerGrid2, bs); + + std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs + << std::endl; + + // Problem size + for (int j = 1; j <= 1024; ++j) { + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan(), j)); + } + } + + // PORTABLE MULTI-BLOCK PREFIXSCAN + uint32_t num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + std::cout << "multiblock" << std::endl; + num_items *= 10; + + auto input_d = make_device_buffer(queue, num_items); + auto output1_d = make_device_buffer(queue, num_items); + auto blockCounter_d = make_device_buffer(queue); + + const auto nThreadsInit = 256; // NB: 1024 would be better + const auto nBlocksInit = divide_up_by(num_items, nThreadsInit); + const auto workDivMultiBlockInit = make_workdiv(nBlocksInit, nThreadsInit); + + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDivMultiBlockInit, init(), input_d.data(), 1, num_items)); + alpaka::memset(queue, blockCounter_d, 0); + + const auto nThreads = 1024; + const auto nBlocks = divide_up_by(num_items, nThreads); + const auto workDivMultiBlock = make_workdiv(nBlocks, nThreads); + + std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDivMultiBlock, + multiBlockPrefixScan(), + input_d.data(), + output1_d.data(), + num_items, + nBlocks, + blockCounter_d.data(), + warpSize)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d.data(), num_items)); + + alpaka::wait(queue); // input_d and output1_d end of scope + } // ksize + } + + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc new file mode 100644 index 0000000000000..d84761e630588 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -0,0 +1,285 @@ +#include +#include +#include +using namespace std::chrono_literals; +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct RS { + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +// A templated unsigned integer type with N bytes +template +struct uintN; + +template <> +struct uintN<8> { + using type = uint8_t; +}; + +template <> +struct uintN<16> { + using type = uint16_t; +}; + +template <> +struct uintN<32> { + using type = uint32_t; +}; + +template <> +struct uintN<64> { + using type = uint64_t; +}; + +template +using uintN_t = typename uintN::type; + +// A templated unsigned integer type with the same size as T +template +using uintT_t = uintN_t; + +// Keep only the `N` most significant bytes of `t`, and set the others to zero +template > +void truncate(T& t) { + const int shift = 8 * (sizeof(T) - N); + union 
{ + T t; + uintT_t u; + } c; + c.t = t; + c.u = c.u >> shift << shift; + t = c.t; +} + +template +void go(Queue & queue, bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + std::chrono::high_resolution_clock::duration delta = 0ns; + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + auto v_h = cms::alpakatools::make_host_buffer(queue, N); + //uint16_t ind_h[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v_h[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v_h[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v_h[j] = (j % imax); + if (j % 2 && i % 2) + v_h[j] = -v_h[j]; + } + } + + auto offsets_h = cms::alpakatools::make_host_buffer(queue, blocks + 1); + offsets_h[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets_h[j] = offsets_h[j - 1] + blockSize - 3 * j; + assert(offsets_h[j] <= N); + } + + if (i == 1) { // special cases... 
+ offsets_h[0] = 0; + offsets_h[1] = 0; + offsets_h[2] = 19; + offsets_h[3] = 32 + offsets_h[2]; + offsets_h[4] = 123 + offsets_h[3]; + offsets_h[5] = 256 + offsets_h[4]; + offsets_h[6] = 311 + offsets_h[5]; + offsets_h[7] = 2111 + offsets_h[6]; + offsets_h[8] = 256 * 11 + offsets_h[7]; + offsets_h[9] = 44 + offsets_h[8]; + offsets_h[10] = 3297 + offsets_h[9]; + } + + std::random_shuffle(v_h.data(), v_h.data() + N); + + auto v_d = cms::alpakatools::make_device_buffer(queue, N); + auto ind_d = cms::alpakatools::make_device_buffer(queue, N); + auto ind_h = cms::alpakatools::make_host_buffer(queue, N); + auto ws_d = cms::alpakatools::make_device_buffer(queue, N); + auto off_d = cms::alpakatools::make_device_buffer(queue, blocks + 1); + + alpaka::memcpy(queue, v_d, v_h); + alpaka::memcpy(queue, off_d, offsets_h); +// cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +// cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "launch for " << offsets_h[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 
256 : 256; + + auto start = std::chrono::high_resolution_clock::now(); + // TODO: manage runtime sized shared memory + [[maybe_unused]] constexpr int MaxSize = 256 * 32; + auto workdiv = make_workdiv(blocks, ntXBl); + if (useShared) + // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier + // + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper{}, v_d.data(), ind_d.data(), off_d.data(), nullptr)); +// cms::cuda::launch( +// radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else +// cms::cuda::launch( +// radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); + + if (i == 0) + std::cout << "done for " << offsets_h[blocks] << std::endl; + + alpaka::memcpy(queue, ind_h, ind_d); + alpaka::wait(queue); + //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += std::chrono::high_resolution_clock::now() - start; + + if (i == 0) + std::cout << "done for " << offsets_h[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl; + std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl; + std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << LL(v_h[ind_h[blockSize / 2]]) << ' ' + << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets_h[ib + 1] > offsets_h[ib]) + inds.insert(ind_h[offsets_h[ib]]); + for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { + if (inds.count(ind_h[j])) { + printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", + i, ib, j, ind_h[j]); + std::vector counts; + counts.resize(offsets_h[ib + 1] - 
offsets_h[ib], 0); + for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) { + counts[ind_h[j2]]++; + } + for (size_t j2 = 0; j2 < counts.size(); j2++) { + if (counts[j2]!=1) + printf("counts[%ld]=%d ", j2, counts[j2]); + } + printf("\n"); + } + assert(0 == inds.count(ind_h[j])); + inds.insert(ind_h[j]); + auto a = v_h.data() + offsets_h[ib]; + auto k1 = a[ind_h[j]]; + auto k2 = a[ind_h[j - 1]]; + truncate(k1); + truncate(k2); + if (k1 < k2) { + std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 + << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 + << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; + assert(false); + } + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib]) + << std::endl; + // + assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib])); + } + } // 50 times + std::cout << "Kernel computation took " << std::chrono::duration_cast(delta).count() / 50. 
+ << " ms per pass" << std::endl; +} + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + Queue queue(device); + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + // go(v_h); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + + go(queue, useShared); + go(queue, useShared); + go(queue, useShared); + // go(v_h); + } + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc new file mode 100644 index 0000000000000..c29b571c6d356 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testSimpleVector.dev.cc @@ -0,0 +1,101 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +struct vector_pushback { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, 
SimpleVector* foo) const { + for (auto index : elements_with_stride(acc)) + foo->push_back(acc, index); + } +}; + +struct vector_reset { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { + foo->reset(); + } +}; + +struct vector_emplace_back { + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, SimpleVector* foo) const { + for (auto index : elements_with_stride(acc)) + foo->emplace_back(acc, index); + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + // run the test on each device + for (auto const& device : devices) { + Queue queue(device); + auto maxN = 10000; + auto vec_h = make_host_buffer>(queue); + auto vec_d = make_device_buffer>(queue); + auto data_h = make_host_buffer(queue, maxN); + auto data_d = make_device_buffer(queue, maxN); + + [[maybe_unused]] auto v = make_SimpleVector(maxN, data_d.data()); + + // Prepare the vec object on the host + auto tmp_vec_h = make_host_buffer>(queue); + make_SimpleVector(tmp_vec_h.data(), maxN, data_d.data()); + assert(tmp_vec_h->size() == 0); + assert(tmp_vec_h->capacity() == static_cast(maxN)); + + // ... and copy the object to the device. + alpaka::memcpy(queue, vec_d, tmp_vec_h); + alpaka::wait(queue); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + const auto workDiv = make_workdiv(numBlocks, numThreadsPerBlock); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_pushback(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + printf("vec_h->size()=%d, numBlocks * numThreadsPerBlock=%d, maxN=%d\n", + vec_h->size(), + numBlocks * numThreadsPerBlock, + maxN); + assert(vec_h->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_reset(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + + assert(vec_h->size() == 0); + + alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, vector_emplace_back(), vec_d.data())); + alpaka::wait(queue); + + alpaka::memcpy(queue, vec_h, vec_d); + alpaka::wait(queue); + + assert(vec_h->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + + alpaka::memcpy(queue, data_h, data_d); + } + std::cout << "TEST PASSED" << std::endl; + return 0; +} diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc new file mode 100644 index 0000000000000..f4f4719192c12 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -0,0 +1,147 @@ +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace cms::alpakatools; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +// Kernel running a loop over threads/elements +// One function with multiple flavors +enum class RangeType { + Default, ExtentLimited, ExtentLimitedWithShift +}; + +enum class LoopScope { + Block, Grid +}; + +template +bool constexpr firstInLoopRange(TAcc const& acc) { + if constexpr (loopScope == LoopScope::Block) + return !alpaka::getIdx(acc)[0u]; + if constexpr (loopScope == LoopScope::Grid) + return !alpaka::getIdx(acc)[0u]; + assert(false); +} + +template +size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { + if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + return shift < size ? 
size - shift: 0; + else if constexpr (rangeType == RangeType::ExtentLimited) + return size; + else /* rangeType == RangeType::Default */ + if constexpr (loopScope == LoopScope::Block) + return alpaka::getWorkDiv(acc)[0u]; + else + return alpaka::getWorkDiv(acc)[0u]; +} + +template +size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) { + if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + return shift < size ? size - shift: 0; + else if constexpr (rangeType == RangeType::ExtentLimited) + return size; + else /* rangeType == RangeType::Default */ + if constexpr (loopScope == LoopScope::Block) + return workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; + else + return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; +} + +template +struct testWordDivisionDefaultRange { + template + ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const { + size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); + // Init the counter for block range. Grid range does so my mean of memset. + if constexpr (loopScope == LoopScope::Block) { + if (firstInLoopRange(acc)) + counter = 0; + alpaka::syncBlockThreads(acc); + } + // The loop we are testing + if constexpr (rangeType == RangeType::Default) + for ([[maybe_unused]] auto idx: elements_with_stride(acc)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + else if constexpr (rangeType == RangeType::ExtentLimited) + for ([[maybe_unused]] auto idx: elements_with_stride(acc, size)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) + for ([[maybe_unused]]auto idx: elements_with_stride(acc, size, shift)) + alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); + alpaka::syncBlockThreads(acc); + // Check the result. 
Grid range will check by memcpy-ing the result. + if constexpr (loopScope == LoopScope::Block) { + if (firstInLoopRange(acc)) { + auto expected = expectedCount(acc, size, shift); + assert(counter == expected); + } + } + } +}; + +int main() { + // get the list of devices on the current platform + auto const& devices = cms::alpakatools::devices(); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + return 0; + } + + for (auto const& device : devices) { + // Get global memory + Queue queue(device); + auto counter_d = cms::alpakatools::make_device_buffer(queue); + auto counter_h = cms::alpakatools::make_host_buffer(queue); + alpaka::memset(queue, counter_d, 0); + ssize_t BlockSize = 512; + size_t GridSize = 4; + for (size_t blocks = 1; blocks < GridSize * 3; blocks++) + for (auto sizeFuzz: std::initializer_list{ - BlockSize / 2, -1, 0, 1, BlockSize/2 }) + for (auto shift: std::initializer_list{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) { + // Grid level iteration: we need to initialize/check at the grid level + // Default range + alpaka::memset(queue, counter_d, 0); + auto workdiv = make_workdiv(BlockSize, GridSize); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + auto expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // ExtentLimited range + alpaka::memset(queue, counter_d, 0); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + expected = 
expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // ExtentLimitedWithShift range + alpaka::memset(queue, counter_d, 0); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::memcpy(queue, counter_h, counter_d); + alpaka::wait(queue); + expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + assert(*counter_h.data() == expected); + + // Block level auto tests + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + } + alpaka::wait(queue); + } +} \ No newline at end of file diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index ee8048431ab6d..416e647c349a4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x == 0) ++p; __syncthreads(); - } + } if ((w != 8) && (0 == (NS & 1))) assert(j == ind); // w/d is even so ind is correct @@ -261,7 +261,7 @@ namespace cms { template __global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu 
b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu index 8b6ffd70fd4e6..6b3ee2e4afa9c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu @@ -74,7 +74,7 @@ namespace { __syncthreads(); radixSort(z, order, sws, elements); __syncthreads(); - + //verify for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) { auto ntrack = order[itrack]; From 9af4710b6185c1cea3bea71fbd17daba6d786b69 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 15 Jan 2024 14:35:23 +0100 Subject: [PATCH 2/3] Moves from elements_with_stride to independent_group_elements --- .../AlpakaInterface/interface/radixSort.h | 32 ++++++++++++------- .../test/alpaka/testRadixSort.dev.cc | 4 +-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h index 893132905f934..bcf1da16a5533 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -1,9 +1,13 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h #define HeterogeneousCore_AlpakaInterface_interface_radixSort_h +#include #include +#include #include + #include + #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" namespace cms::alpakatools { @@ -125,17 +129,17 @@ namespace cms::alpakatools { auto k = ind2; // Initializer index order to trivial increment. - for (auto idx: elements_with_stride(acc, size)) { j[idx] = idx; } + for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; } alpaka::syncBlockThreads(acc); // Iterate on the slices of the data. 
while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { - for (auto idx: elements_with_stride(acc, binsNumber)) { c[idx] = 0; } + for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; } alpaka::syncBlockThreads(acc); const auto sortingPassShift = binBits * currentSortingPass; // fill bins (count elements in each bin) - for (auto idx: elements_with_stride(acc, size)) { + for (auto idx: independent_group_elements(acc, size)) { auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); } @@ -154,7 +158,7 @@ namespace cms::alpakatools { // prefix scan "optimized"???... // TODO: we might be able to reuse the warpPrefixScan function // Warp level prefix scan - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { auto x = c[idx]; auto laneId = idx & warpMask; @@ -168,7 +172,7 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // Block level completion of prefix scan (add last sum of each preceding warp) - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { auto ss = (idx / warpSize) * warpSize - 1; c[idx] = ct[idx]; for (int i = ss; i > 0; i -= warpSize) @@ -198,7 +202,7 @@ namespace cms::alpakatools { // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { // Init - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { cu[idx] = -1; ct[idx] = -1; } @@ -206,7 +210,7 @@ namespace cms::alpakatools { // Find the highest index for all the threads dealing with a given bin (in cu[]) // Also record the bin for each thread (in ct[]) - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { int i = ibs - idx; int32_t bin 
= -1; if (i >= 0) { @@ -226,7 +230,7 @@ namespace cms::alpakatools { // FIXME: we can slash a memory access. - for (auto idx: elements_with_stride(acc, binsNumber)) { + for (auto idx: independent_group_elements(acc, binsNumber)) { int i = ibs - idx; // Are we still in inside the data? if (i >= 0) { @@ -302,7 +306,7 @@ namespace cms::alpakatools { // TODO this copy is (doubly?) redundant with the reorder if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) - for (auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; }; + for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; }; alpaka::syncBlockThreads(acc); @@ -360,8 +364,9 @@ namespace cms::alpakatools { const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { static_assert(requires_single_thread_per_block_v, "CPU sort (not a radixSort) called wtth wrong accelerator"); // Initialize the index array - for (std::size_t i = 0; i < size; i++) ind[i] = i; - printf("std::sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); + std::iota(ind, ind + size, 0); + /* + printf("std::stable_sort(a=%p, ind=%p, indmax=%p, size=%d)\n", a, ind, ind + size, size); for (uint32_t i=0; i<10 && i diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc index d84761e630588..df41c649b91cb 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -193,7 +193,7 @@ void go(Queue & queue, bool useShared) { if (offsets_h[ib + 1] > offsets_h[ib]) inds.insert(ind_h[offsets_h[ib]]); for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { - if (inds.count(ind_h[j])) { + if (inds.count(ind_h[j]) != 0) { printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]); std::vector counts; @@ -206,6 +206,7 @@ void go(Queue & 
queue, bool useShared) { printf("counts[%ld]=%d ", j2, counts[j2]); } printf("\n"); + printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j])); } assert(0 == inds.count(ind_h[j])); inds.insert(ind_h[j]); @@ -218,7 +219,6 @@ void go(Queue & queue, bool useShared) { std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; - assert(false); } } if (!inds.empty()) { From e03dde38247546aa50d232ab9722ef1eb3961db5 Mon Sep 17 00:00:00 2001 From: Eric Cano Date: Mon, 15 Jan 2024 20:11:47 +0100 Subject: [PATCH 3/3] Final fixes to unit tests of port to Alpaka Reinstates error messages instead of silent call to `REQUIRE(!devices.empty());` Adds maybe_unused attribute for variable used in assert. --- .../AlpakaInterface/interface/VecArray.h | 2 +- .../AlpakaInterface/interface/radixSort.h | 206 ++++++++++-------- .../AlpakaInterface/interface/workdivision.h | 6 + .../AlpakaInterface/test/BuildFile.xml | 1 - .../test/alpaka/testAtomicPairCounter.dev.cc | 7 +- .../test/alpaka/testOneHistoContainer.dev.cc | 2 +- .../test/alpaka/testOneRadixSort.dev.cc | 110 +++++----- .../test/alpaka/testRadixSort.dev.cc | 64 +++--- .../test/alpaka/testWorkDivision.dev.cc | 114 +++++++--- .../CUDAUtilities/interface/radixSort.h | 4 +- .../CUDAUtilities/test/oneRadixSort_t.cu | 2 +- .../CUDAUtilities/test/radixSort_t.cu | 9 +- 12 files changed, 308 insertions(+), 219 deletions(-) diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h index c4025dd42d4a6..231170212a37b 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h +++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h @@ -42,7 +42,7 @@ namespace cms::alpakatools { } } - inline constexpr T cont& back() const { + inline constexpr T const &back()
const { if (m_size > 0) { return m_data[m_size - 1]; } else diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h index bcf1da16a5533..0f94ad200efd9 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h +++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h @@ -9,6 +9,7 @@ #include #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace cms::alpakatools { @@ -26,20 +27,27 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { - if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) + for (auto idx : independent_group_elements(acc, size - 1)) { + if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) { firstNeg = idx + 1; + } } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[idx - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg, size)) { + ind2[idx - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } template @@ -52,51 +60,56 @@ namespace cms::alpakatools { alpaka::syncBlockThreads(acc); // find first negative - for(auto idx: elements_with_stride(acc, size - 1)) { + for (auto idx : independent_group_elements(acc, size - 1)) { if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0) firstNeg = idx + 1; } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size, firstNeg)) { ind2[size - idx - 1] = ind[idx]; } + for (auto idx : 
independent_group_elements(acc, firstNeg, size)) { + ind2[size - idx - 1] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, firstNeg)) { ind2[idx + size - firstNeg] = ind[idx]; } + for (auto idx : independent_group_elements(acc, firstNeg)) { + ind2[idx + size - firstNeg] = ind[idx]; + } alpaka::syncBlockThreads(acc); - for(auto idx: elements_with_stride(acc, size)) { ind[idx] = ind2[idx]; } + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + } } - // Radix sort implements a bytewise lexicographic order on the input data. // Data is reordered into bins indexed by the byte considered. But considering least significant bytes first // and respecting the existing order when binning the values, we achieve the lexicographic ordering. // The number of bytes actually considered is a parameter template parameter. // The post processing reorder - // function fixes the order when bitwise ordering is not the order for the underlying type (only based on + // function fixes the order when bitwise ordering is not the order for the underlying type (only based on // most significant bit for signed types, integer or floating point). // The floating point numbers are reinterpret_cast into integers in the calling wrapper // This algorithm requires to run in a single block template // The post processing reorder function. + typename T, // shall be integer, signed or not does not matter here + int NS, // number of significant bytes to use in sorting. + typename RF> // The post processing reorder function. 
ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortImpl( const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { if constexpr (!requires_single_thread_per_block_v) { const auto warpSize = alpaka::warp::getSize(acc); const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + [[maybe_unused]] const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); // we expect a power of 2 here assert(warpSize && (0 == (warpSize & (warpSize - 1)))); - const std::size_t warpMask = warpSize - 1; + const std::size_t warpMask = warpSize - 1; // Define the bin size (d=8 => 1 byte bin). constexpr int binBits = 8, dataBits = 8 * sizeof(T), totalSortingPassses = dataBits / binBits; // Make sure the slices are data aligned - static_assert (0 == dataBits % binBits); + static_assert(0 == dataBits % binBits); // Make sure the NS parameter makes sense - static_assert (NS > 0 && NS <= sizeof(T)); + static_assert(NS > 0 && NS <= sizeof(T)); constexpr int binsNumber = 1 << binBits; constexpr int binsMask = binsNumber - 1; // Prefix scan iterations. NS is counted in full bytes and not slices. @@ -106,7 +119,7 @@ namespace cms::alpakatools { // TODO: rename auto& c = alpaka::declareSharedVar(acc); // Temporary storage for prefix scan. Only really needed for first-of-warp keeping - // Then used for thread to bin mapping TODO: change type to byte and remap to + // Then used for thread to bin mapping TODO: change type to byte and remap to auto& ct = alpaka::declareSharedVar(acc); // Bin to thread index mapping (used to store the highest thread index within a bin number // batch of threads. @@ -114,7 +127,7 @@ namespace cms::alpakatools { // lowest possible value (change to bytes?) auto& cu = alpaka::declareSharedVar(acc); // TODO we could also have an explicit caching of the current index for each thread. - + // TODO: do those have to be shared? 
auto& ibs = alpaka::declareSharedVar(acc); auto& currentSortingPass = alpaka::declareSharedVar(acc); @@ -129,36 +142,40 @@ namespace cms::alpakatools { auto k = ind2; // Initializer index order to trivial increment. - for (auto idx: independent_group_elements(acc, size)) { j[idx] = idx; } + for (auto idx : independent_group_elements(acc, size)) { + j[idx] = idx; + } alpaka::syncBlockThreads(acc); // Iterate on the slices of the data. while (alpaka::syncBlockThreadsPredicate(acc, (currentSortingPass < totalSortingPassses))) { - for (auto idx: independent_group_elements(acc, binsNumber)) { c[idx] = 0; } + for (auto idx : independent_group_elements(acc, binsNumber)) { + c[idx] = 0; + } alpaka::syncBlockThreads(acc); const auto sortingPassShift = binBits * currentSortingPass; // fill bins (count elements in each bin) - for (auto idx: independent_group_elements(acc, size)) { + for (auto idx : independent_group_elements(acc, size)) { auto bin = (a[j[idx]] >> sortingPassShift) & binsMask; alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{}); - } + } alpaka::syncBlockThreads(acc); - - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d ", currentSortingPass - 1); + + if (!threadIdxLocal && 1 == alpaka::getIdx(acc)[0]) { + // printf("Pass=%d, Block=%d, ", currentSortingPass - 1, alpaka::getIdx(acc)[0]); size_t total = 0; - for(int i=0; i<(int)binsNumber; i++) { - printf("count[%d]=%d ", i, c[i] ); + for (int i = 0; i < (int)binsNumber; i++) { + // printf("count[%d]=%d ", i, c[i] ); total += c[i]; } - printf("total=%zu\n", total); + // printf("total=%zu\n", total); assert(total == size); } // prefix scan "optimized"???... 
// TODO: we might be able to reuse the warpPrefixScan function // Warp level prefix scan - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto x = c[idx]; auto laneId = idx & warpMask; @@ -170,16 +187,16 @@ namespace cms::alpakatools { ct[idx] = x; } alpaka::syncBlockThreads(acc); - + // Block level completion of prefix scan (add last sum of each preceding warp) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { auto ss = (idx / warpSize) * warpSize - 1; c[idx] = ct[idx]; for (int i = ss; i > 0; i -= warpSize) c[idx] += ct[i]; } // Post prefix scan, c[bin] contains the offsets in index counts to the last index +1 for each bin - + /* //prefix scan for the nulls (for documentation) if (threadIdxLocal==0) @@ -191,18 +208,11 @@ namespace cms::alpakatools { // This will reorder the indices by the currently considered slice, otherwise preserving the previous order. 
ibs = size - 1; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (post prefix scan) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("offset[%d]=%d ", i, c[i] ); - } - printf("\n"); - } // Iterate on bin-sized slices to (size - 1) / binSize + 1 iterations while (alpaka::syncBlockThreadsPredicate(acc, ibs >= 0)) { // Init - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { cu[idx] = -1; ct[idx] = -1; } @@ -210,7 +220,7 @@ namespace cms::alpakatools { // Find the highest index for all the threads dealing with a given bin (in cu[]) // Also record the bin for each thread (in ct[]) - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; int32_t bin = -1; if (i >= 0) { @@ -220,24 +230,16 @@ namespace cms::alpakatools { } } alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && !alpaka::getIdx(acc)[0]) { - printf("Pass=%d (max index) ", currentSortingPass - 1); - for(int i=0; i<(int)binsNumber; i++) { - printf("max_i[%d]=%d ", i, cu[i] ); - } - printf("\n"); - } - // FIXME: we can slash a memory access. - for (auto idx: independent_group_elements(acc, binsNumber)) { + for (auto idx : independent_group_elements(acc, binsNumber)) { int i = ibs - idx; // Are we still in inside the data? if (i >= 0) { int32_t bin = ct[idx]; // Are we the thread with the highest index (from previous pass)? if (cu[bin] == i) { - // With the highest index, we are actually the lowest thread number. We will + // With the highest index, we are actually the lowest thread number. 
We will // work "on behalf of" the higher thread numbers (including ourselves) // No way around scanning and testing for bin in ct[otherThread] number to find the other threads for (int peerThreadIdx = idx; peerThreadIdx < binsNumber; peerThreadIdx++) { @@ -292,21 +294,17 @@ namespace cms::alpakatools { if (threadIdxLocal == 0) ++currentSortingPass; alpaka::syncBlockThreads(acc); - if (!threadIdxLocal && size == 257) { - printf("Pass=%d ", currentSortingPass - 1); - for(int i=0; i<(int)size; i++) { - printf("k[%d]=%d ", i, k[i] ); - } - printf("\n"); - } } if ((dataBits != 8) && (0 == (NS & 1))) - ALPAKA_ASSERT_OFFLOAD(j == ind); // dataBits/binBits is even so ind is correct (the result is in the right location) + ALPAKA_ASSERT_OFFLOAD(j == + ind); // dataBits/binBits is even so ind is correct (the result is in the right location) // TODO this copy is (doubly?) redundant with the reorder if (j != ind) // odd number of sorting passes, we need to move the result to the right array (ind[]) - for (auto idx: independent_group_elements(acc, size)) { ind[idx] = ind2[idx]; }; + for (auto idx : independent_group_elements(acc, size)) { + ind[idx] = ind2[idx]; + }; alpaka::syncBlockThreads(acc); @@ -321,41 +319,36 @@ namespace cms::alpakatools { template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, T>::type* = + nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort unsigned, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && std::is_signed::value && + !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE 
void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort signed, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, a, ind, ind2, size, reorderSigned); } template ::value && !requires_single_thread_per_block_v, T>::type* = nullptr> + typename std::enable_if::value && !requires_single_thread_per_block_v, + T>::type* = nullptr> ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSort( const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size"); using I = int; - if (!alpaka::getIdx(acc)[0]) { - printf("GPU radixSort float, a=%p, block=%d, size=%d\n", a, alpaka::getIdx(acc)[0], size); - } radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); } - + template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti(const TAcc& acc, - T const* v, - uint16_t* index, - uint32_t const* offsets, - uint16_t* workspace) { - // TODO: check + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void radixSortMulti( + const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + // TODO: check // Sort multiple blocks of data in v[] separated by in chunks located at offsets[] // extern __shared__ uint16_t ws[]; - uint16_t * ws = alpaka::getDynSharedMem(acc); + uint16_t* ws = alpaka::getDynSharedMem(acc); const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); auto a = v + offsets[blockIdx]; @@ -402,23 +392,24 @@ namespace cms::alpakatools { auto ind2 = nullptr == workspace ? 
ws : workspace + offsets[blockIdx]; auto size = offsets[blockIdx + 1] - offsets[blockIdx]; assert(offsets[blockIdx + 1] >= offsets[blockIdx]); - if (0 == alpaka::getIdx(acc)[0]) { - printf("Block=%d, offsets[blockIdx]=%d, size=%d, v=%p, v[offset[blockId]=%p\n", - blockIdx, offsets[blockIdx], size, v, &v[offsets[blockIdx]]); - } if (size > 0) radixSort(acc, a, ind, ind2, size); } - + template struct radixSortMultiWrapper { -/* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical + /* We cannot set launch_bounds in alpaka, so both kernel wrappers are identical (keeping CUDA/HIP code for reference for the moment) #if defined(__CUDACC__) || defined(__HIPCC__) //__global__ void __launch_bounds__(256, 4) #endif */ template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; @@ -426,10 +417,39 @@ namespace cms::alpakatools { template struct radixSortMultiWrapper2 { template - ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) const { + ALPAKA_FN_ACC void operator()(const TAcc& acc, + T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace, + size_t sharedMemBytes = 0) const { radixSortMulti(acc, v, index, offsets, workspace); } }; } // namespace cms::alpakatools +namespace alpaka::trait { + // specialize the BlockSharedMemDynSizeBytes trait to specify the amount of + // block shared dynamic memory for the radixSortMultiWrapper kernel + template + struct BlockSharedMemDynSizeBytes, TAcc> { + // the size in bytes of the shared memory allocated for a block + ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes( + 
cms::alpakatools::radixSortMultiWrapper const& /* kernel */, + alpaka_common::Vec1D /* threads */, + alpaka_common::Vec1D /* elements */, + T const* /* v */, + uint16_t* /* index */, + uint32_t const* /* offsets */, + uint16_t* workspace, + size_t sharedMemBytes) { + if (workspace != nullptr) + return 0; + /* The shared memory workspace is 'blockspace * 2' in CUDA *but that's a value coincidence... TODO: check */ + //printf ("in BlockSharedMemDynSizeBytes, TAcc>: shared mem size = %d\n", (int)sharedMemBytes); + return sharedMemBytes; + } + }; +} // namespace alpaka::trait + #endif // HeterogeneousCore_AlpakaInterface_interface_radixSort_h diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h index 4c0aa9fe5f2b9..e02f4e92f813e 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h +++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h @@ -701,6 +701,12 @@ namespace cms::alpakatools { stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, extent_{extent} {} + ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx first, Idx extent) + : elements_{alpaka::getWorkDiv(acc)[0u]}, + thread_{alpaka::getIdx(acc)[0u] * elements_ + first}, + stride_{alpaka::getWorkDiv(acc)[0u] * elements_}, + extent_{extent} {} + class const_iterator; using iterator = const_iterator; diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 463bf5dae4461..0198e36f9166f 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -58,7 +58,6 @@ - diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc index f4073997f5c97..1687feb8c1bab 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc +++ 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc @@ -42,7 +42,12 @@ struct finalize { TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) { SECTION("AtomicPairCounter") { auto const &devices = cms::alpakatools::devices(); - REQUIRE(!devices.empty()); + if (devices.empty()) { + std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) + << ", the test will be skipped.\n"; + REQUIRE(not devices.empty()); + } + // run the test on each device for (auto const &device : devices) { std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n'; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc index 5e08c05725888..4ce11cc7facdd 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc @@ -93,7 +93,7 @@ struct mykernel { #ifndef NDEBUG auto b0 = Hist::bin(v[j]); #endif - int tot = 0; + [[maybe_unused]] int tot = 0; auto ftest = [&](unsigned int k) { ALPAKA_ASSERT_OFFLOAD(k < N); ++tot; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc index c781f83ff1225..21f3477a3dd92 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneRadixSort.dev.cc @@ -64,7 +64,8 @@ ALPAKA_FN_HOST_ACC T truncate(T const& t) { namespace { struct testKernel { template - ALPAKA_FN_ACC void operator() (const TAcc &acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { + ALPAKA_FN_ACC void operator()( + const TAcc& acc, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) const { //size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // 
This is going to be the track index //size_t gridSize = blockDim.x * gridDim.x; bool threadZero = !alpaka::getIdx(acc)[0u]; @@ -76,15 +77,15 @@ namespace { assert(0 == blocksIdx); assert(elements <= 2048); - auto &order = alpaka::declareSharedVar(acc); - auto &sws = alpaka::declareSharedVar(acc); - auto &z = alpaka::declareSharedVar(acc); - auto &iz = alpaka::declareSharedVar(acc); -// __shared__ uint16_t order[2048]; -// __shared__ uint16_t sws[2048]; -// __shared__ float z[2048]; -// __shared__ int iz[2048]; - for (auto itrack: elements_with_stride(acc, elements)) { + auto& order = alpaka::declareSharedVar(acc); + auto& sws = alpaka::declareSharedVar(acc); + auto& z = alpaka::declareSharedVar(acc); + auto& iz = alpaka::declareSharedVar(acc); + // __shared__ uint16_t order[2048]; + // __shared__ uint16_t sws[2048]; + // __shared__ float z[2048]; + // __shared__ int iz[2048]; + for (auto itrack : elements_with_stride(acc, elements)) { z[itrack] = gpu_input[itrack]; iz[itrack] = 10000 * gpu_input[itrack]; // order[itrack] = itrack; @@ -94,7 +95,7 @@ namespace { alpaka::syncBlockThreads(acc); //verify - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack])); @@ -122,7 +123,7 @@ namespace { radixSort(acc, iz, order, sws, elements); alpaka::syncBlockThreads(acc); - for (auto itrack: elements_with_stride(acc, elements - 1)) { + for (auto itrack : elements_with_stride(acc, elements - 1)) { auto ntrack = order[itrack]; auto mtrack = order[itrack + 1]; assert(iz[ntrack] <= iz[mtrack]); @@ -146,11 +147,12 @@ namespace { } }; - void testWrapper(Queue & queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { + void testWrapper(Queue& queue, FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) { auto blockSize = 512; // somewhat arbitrary auto gridSize = 1; // 
round up to cover the sample size const auto workdiv = make_workdiv(gridSize, blockSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, testKernel(), gpu_input, gpu_product, elements, doPrint)); alpaka::wait(queue); } } // namespace @@ -165,50 +167,54 @@ int main() { << ", the test will be skipped.\n"; return 0; } - + // run the test on each device for (auto const& device : devices) { Queue queue(device); -// FLOAT* gpu_input; -// int* gpu_product; + // FLOAT* gpu_input; + // int* gpu_product; int nmax = 4 * 260; auto gpu_input_h = cms::alpakatools::make_host_buffer(queue, nmax); auto i = gpu_input_h.data(); - for (auto v: { - 30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, - 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, - 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, - 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, - 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, - 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, - 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, - 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, - 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, - 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, - 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, - 
3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, - 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, - 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, - 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, - 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, - 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, - 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, - 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, - 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, - 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, - 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, - 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, - 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, - 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, - 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, - 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, - 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, - 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 
4.7143669128, 4.7145690918, 4.7148418427, - 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, - 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, - 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, - 4.7723139628, 4.774812736, 4.8577151299, 4.890037536}) { + for (auto v : {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, + 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, + 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, + 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, + 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026, + 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411, + 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456, + 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, + 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, + 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, + 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, + 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, + 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759, + 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729, + 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928, + 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 
3.1001036167, 3.1173279285, + 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, + 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, + 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, + 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, + 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346, + 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297, + 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066, + 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, + 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, + 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, + 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, + 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, + 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083, + 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332, + 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947, + 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, + 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, + 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, + 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, + 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 
4.742254734, + 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299, + 4.890037536}) { *(i++) = v; } auto input = gpu_input_h.data(); @@ -219,7 +225,7 @@ int main() { } auto gpu_input_d = cms::alpakatools::make_device_buffer(queue, nmax); //cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax)); -// cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); + // cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax)); auto gpu_product_d = cms::alpakatools::make_device_buffer(queue, nmax); // copy the input data to the GPU alpaka::memcpy(queue, gpu_input_d, gpu_input_h); diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc index df41c649b91cb..9caf2af20eb9c 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testRadixSort.dev.cc @@ -81,7 +81,7 @@ void truncate(T& t) { } template -void go(Queue & queue, bool useShared) { +void go(Queue& queue, bool useShared) { std::mt19937 eng; //std::mt19937 eng2; auto rgen = RS::ud(); @@ -145,46 +145,48 @@ void go(Queue & queue, bool useShared) { alpaka::memcpy(queue, v_d, v_h); alpaka::memcpy(queue, off_d, offsets_h); -// cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); -// cudaCheck(cudaMemcpy(off_d.get(), offsets_h, 4 * (blocks + 1), cudaMemcpyHostToDevice)); if (i < 2) std::cout << "launch for " << offsets_h[blocks] << std::endl; - auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; + auto ntXBl = 1 == i % 4 ? 256 : 256; auto start = std::chrono::high_resolution_clock::now(); - // TODO: manage runtime sized shared memory - [[maybe_unused]] constexpr int MaxSize = 256 * 32; + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). 
+ constexpr int MaxSize = 256 * 32; auto workdiv = make_workdiv(blocks, ntXBl); if (useShared) // The original CUDA version used to call a kernel with __launch_bounds__(256, 4) specifier - // - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper{}, v_d.data(), ind_d.data(), off_d.data(), nullptr)); -// cms::cuda::launch( -// radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + // + alpaka::enqueue(queue, + alpaka::createTaskKernel(workdiv, + radixSortMultiWrapper{}, + v_d.data(), + ind_d.data(), + off_d.data(), + nullptr, + MaxSize * sizeof(uint16_t))); else -// cms::cuda::launch( -// radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workdiv, radixSortMultiWrapper2{}, v_d.data(), ind_d.data(), off_d.data(), ws_d.data())); - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "launch done for " << offsets_h[blocks] << std::endl; alpaka::memcpy(queue, ind_h, ind_d); alpaka::wait(queue); - //cudaCheck(cudaMemcpy(ind_h, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); delta += std::chrono::high_resolution_clock::now() - start; - if (i == 0) - std::cout << "done for " << offsets_h[blocks] << std::endl; + if (i < 2) + std::cout << "kernel and read back done for " << offsets_h[blocks] << std::endl; if (32 == i) { std::cout << LL(v_h[ind_h[0]]) << ' ' << LL(v_h[ind_h[1]]) << ' ' << LL(v_h[ind_h[2]]) << std::endl; - std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) << std::endl; + std::cout << LL(v_h[ind_h[3]]) << ' ' << LL(v_h[ind_h[10]]) << ' ' << LL(v_h[ind_h[blockSize - 1000]]) + << std::endl; std::cout << LL(v_h[ind_h[blockSize / 2 - 1]]) << ' ' << 
LL(v_h[ind_h[blockSize / 2]]) << ' ' << LL(v_h[ind_h[blockSize / 2 + 1]]) << std::endl; } @@ -194,17 +196,16 @@ void go(Queue & queue, bool useShared) { inds.insert(ind_h[offsets_h[ib]]); for (auto j = offsets_h[ib] + 1; j < offsets_h[ib + 1]; j++) { if (inds.count(ind_h[j]) != 0) { - printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", - i, ib, j, ind_h[j]); + printf("i=%d ib=%d ind_h[j=%d]=%d: duplicate indice!\n", i, ib, j, ind_h[j]); std::vector counts; counts.resize(offsets_h[ib + 1] - offsets_h[ib], 0); for (size_t j2 = offsets_h[ib]; j2 < offsets_h[ib + 1]; j2++) { counts[ind_h[j2]]++; - } + } for (size_t j2 = 0; j2 < counts.size(); j2++) { - if (counts[j2]!=1) + if (counts[j2] != 1) printf("counts[%ld]=%d ", j2, counts[j2]); - } + } printf("\n"); printf("inds.count(ind_h[j] = %lu\n", inds.count(ind_h[j])); } @@ -216,9 +217,12 @@ void go(Queue & queue, bool useShared) { truncate(k1); truncate(k2); if (k1 < k2) { - std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " << offsets_h[ib + 1] - 1 - << "] j=" << j << " ind[j]=" << ind_h[j] << " (k1 < k2) : a1=" << a[ind_h[j]] << " k1=" << k1 - << "a2= " << a[ind_h[j - 1]] << " k2=" << k2 << std::endl; + std::cout << "i=" << i << " not ordered at ib=" << ib << " in [" << offsets_h[ib] << ", " + << offsets_h[ib + 1] - 1 << "] j=" << j << " ind[j]=" << ind_h[j] + << " (k1 < k2) : a1=" << (int64_t)a[ind_h[j]] << " k1=" << (int64_t)k1 + << " a2= " << (int64_t)a[ind_h[j - 1]] << " k2=" << (int64_t)k2 << std::endl; + //sleep(2); + assert(false); } } if (!inds.empty()) { @@ -228,7 +232,7 @@ void go(Queue & queue, bool useShared) { if (inds.size() != (offsets_h[ib + 1] - offsets_h[ib])) std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets_h[ib + 1] - offsets_h[ib]) << std::endl; - // + // assert(inds.size() == (offsets_h[ib + 1] - offsets_h[ib])); } } // 50 times diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc 
b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc index f4f4719192c12..ce85ad42cb0f4 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testWorkDivision.dev.cc @@ -10,14 +10,14 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; // Kernel running a loop over threads/elements // One function with multiple flavors -enum class RangeType { - Default, ExtentLimited, ExtentLimitedWithShift -}; - -enum class LoopScope { - Block, Grid -}; +// The type of elements_with_stride +enum class RangeType { Default, ExtentLimited, ExtentLimitedWithShift }; + +// The concurrency scope between threads +enum class LoopScope { Block, Grid }; + +// Utility for one time initializations template bool constexpr firstInLoopRange(TAcc const& acc) { if constexpr (loopScope == LoopScope::Block) @@ -30,7 +30,7 @@ bool constexpr firstInLoopRange(TAcc const& acc) { template size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -41,9 +41,9 @@ size_t constexpr expectedCount(TAcc const& acc, size_t size, size_t shift) { } template -size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shift) { +size_t constexpr expectedCount(WorkDiv1D const& workDiv, size_t size, size_t shift) { if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - return shift < size ? size - shift: 0; + return shift < size ? 
size - shift : 0; else if constexpr (rangeType == RangeType::ExtentLimited) return size; else /* rangeType == RangeType::Default */ @@ -53,12 +53,13 @@ size_t constexpr expectedCount(WorkDiv1D const &workDiv, size_t size, size_t shi return workDiv.m_gridBlockExtent[0u] * workDiv.m_blockThreadExtent[0u] * workDiv.m_threadElemExtent[0u]; } -template +template struct testWordDivisionDefaultRange { template - ALPAKA_FN_ACC void operator() (TAcc const & acc, size_t size, size_t shift, size_t *globalCounter) const { - size_t & counter = ( loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); - // Init the counter for block range. Grid range does so my mean of memset. + ALPAKA_FN_ACC void operator()(TAcc const& acc, size_t size, size_t shift, size_t* globalCounter) const { + size_t& counter = + (loopScope == LoopScope::Grid ? *globalCounter : alpaka::declareSharedVar(acc)); + // Init the counter for block range. Grid range does so my mean of memset. if constexpr (loopScope == LoopScope::Block) { if (firstInLoopRange(acc)) counter = 0; @@ -66,13 +67,13 @@ struct testWordDivisionDefaultRange { } // The loop we are testing if constexpr (rangeType == RangeType::Default) - for ([[maybe_unused]] auto idx: elements_with_stride(acc)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimited) - for ([[maybe_unused]] auto idx: elements_with_stride(acc, size)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); else if constexpr (rangeType == RangeType::ExtentLimitedWithShift) - for ([[maybe_unused]]auto idx: elements_with_stride(acc, size, shift)) + for ([[maybe_unused]] auto idx : elements_with_stride(acc, shift, size)) alpaka::atomicAdd(acc, &counter, 1ul, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); // Check the result. 
Grid range will check by memcpy-ing the result. @@ -103,44 +104,87 @@ int main() { ssize_t BlockSize = 512; size_t GridSize = 4; for (size_t blocks = 1; blocks < GridSize * 3; blocks++) - for (auto sizeFuzz: std::initializer_list{ - BlockSize / 2, -1, 0, 1, BlockSize/2 }) - for (auto shift: std::initializer_list{0, 1, BlockSize / 2, BlockSize - 1, BlockSize, BlockSize + 1, BlockSize + BlockSize / 2, 2*BlockSize -1, 2*BlockSize, 2*BlockSize + 1}) { - // Grid level iteration: we need to initialize/check at the grid level + for (auto sizeFuzz : + std::initializer_list{-10 * BlockSize / 13, -BlockSize / 2, -1, 0, 1, BlockSize / 2}) + for (auto shift : std::initializer_list{0, + 1, + BlockSize / 2, + BlockSize - 1, + BlockSize, + BlockSize + 1, + BlockSize + BlockSize / 2, + 2 * BlockSize - 1, + 2 * BlockSize, + 2 * BlockSize + 1}) { + // Grid level iteration: we need to initialize/check at the grid level // Default range alpaka::memset(queue, counter_d, 0); auto workdiv = make_workdiv(BlockSize, GridSize); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - auto expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + auto expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimited range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, 
counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = + expectedCount(workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // ExtentLimitedWithShift range alpaka::memset(queue, counter_d, 0); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); alpaka::memcpy(queue, counter_h, counter_d); alpaka::wait(queue); - expected = expectedCount(workdiv, blocks* BlockSize + sizeFuzz, shift); + expected = expectedCount( + workdiv, blocks * BlockSize + sizeFuzz, shift); assert(*counter_h.data() == expected); // Block level auto tests - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); - alpaka::enqueue(queue, alpaka::createTaskKernel(workdiv, - testWordDivisionDefaultRange{}, blocks* BlockSize + sizeFuzz, shift, counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue( + queue, + alpaka::createTaskKernel(workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); + alpaka::enqueue(queue, + alpaka::createTaskKernel( + workdiv, + testWordDivisionDefaultRange{}, + blocks * BlockSize + sizeFuzz, + shift, + counter_d.data())); } alpaka::wait(queue); } diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 
416e647c349a4..ee8048431ab6d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -203,7 +203,7 @@ __device__ __forceinline__ void radixSortImpl( if (threadIdx.x == 0) ++p; __syncthreads(); - } + } if ((w != 8) && (0 == (NS & 1))) assert(j == ind); // w/d is even so ind is correct @@ -261,7 +261,7 @@ namespace cms { template __global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { radixSortMulti(v, index, offsets, workspace); } diff --git a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu index 6b3ee2e4afa9c..8b6ffd70fd4e6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu @@ -74,7 +74,7 @@ namespace { __syncthreads(); radixSort(z, order, sws, elements); __syncthreads(); - + //verify for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) { auto ntrack = order[itrack]; diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index 209ce97347e25..a15bed0ae8a1f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -151,10 +151,15 @@ void go(bool useShared) { auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256; delta -= (std::chrono::high_resolution_clock::now() - start); + // The MaxSize is the max size we allow between offsets (i.e. biggest set to sort when using shared memory). 
constexpr int MaxSize = 256 * 32; if (useShared) - cms::cuda::launch( - radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + cms::cuda::launch(radixSortMultiWrapper, + {blocks, ntXBl, MaxSize * 2 /* sizeof(uint16_t) */}, + v_d.get(), + ind_d.get(), + off_d.get(), + nullptr); else cms::cuda::launch( radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get());