Added warp::shfl functionality. #1273

Merged
5 commits merged on Mar 29, 2021
Changes from 2 commits
32 changes: 32 additions & 0 deletions include/alpaka/warp/Traits.hpp
@@ -49,6 +49,11 @@ namespace alpaka
template<typename TWarp, typename TSfinae = void>
struct Ballot;

//#############################################################################
//! The shfl warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct Shfl;

//#############################################################################
//! The active mask trait.
template<typename TWarp, typename TSfinae = void>
@@ -150,5 +155,32 @@ namespace alpaka
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Ballot<ImplementationBase>::ballot(warp, predicate);
}

//-----------------------------------------------------------------------------
//! Broadcasts data from one thread to all members of the warp.
//! Similar to MPI_Bcast, but using srcLane instead of root.
//!
Member
IMO we need to add to the documentation that this shfl function is collective, which means all threads need to call it, and from the same code branch.
The reason is that the CUDA implementation uses activemask, while for HIP all threads in a warp need to call the function. Using activemask means that if threads from the if and else branches call the function, they will not see each other.
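A minimal illustration of the pitfall described in this comment (not part of the PR; it assumes a 1-D block that holds exactly one warp, as the unit test below also requires): both branches call shfl, but each call only involves the lanes of its own branch under the activemask-based CUDA implementation, and it is undefined on HIP and CUDA devices before sm_70.

// Hypothetical sketch, not from this PR: divergent calls to warp::shfl.
template<typename TAcc>
ALPAKA_FN_ACC auto divergentShflPitfall(TAcc const& acc, int laneValue) -> int
{
    auto const lane = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
    if(lane % 2 == 0)
    {
        // Only the even lanes are active here: with __shfl_sync(activemask, ...) the
        // broadcast is restricted to them; on HIP and CUDA before sm_70 it is undefined.
        return alpaka::warp::shfl(acc, laneValue, 0);
    }
    // The odd lanes form a second, equally restricted collective.
    return alpaka::warp::shfl(acc, laneValue, 0);
}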

Contributor Author

I updated these docs to include this warning.

Member

I think I forgot to add a similar warning to the previously existing warp collectives. Your comment also applies to those, right @psychocoderHPC?

Member

@sbastrakov Yes, this should be added to the other warp functions too. Currently, only CUDA allows calling warp functions from different branches. It is fine if all threads of the warp are in the same branch, but as soon as the threads diverge the behavior is undefined (for HIP and for CUDA devices before sm_70).
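A sketch of the portable pattern this implies (not from the PR): do the divergent work inside the branches, but let every thread of the warp reach the collective call from the same code location.

// Hypothetical sketch: compute in divergent branches, reconverge, then call shfl.
template<typename TAcc>
ALPAKA_FN_ACC auto convergedShfl(TAcc const& acc, bool flag) -> int
{
    int value = 0;
    if(flag)
        value = 1; // divergent work is fine as long as no warp collective is called here
    else
        value = 2;
    // Every thread of the warp reaches this call from the same place.
    return alpaka::warp::shfl(acc, value, 0);
}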

//! \tparam TWarp The warp implementation type.
//! \param warp The warp implementation.
//! \param value The value to broadcast (only meaningful from threadIdx == srcLane)
//! \param srcLane The source lane sending value.
//! \return value from the thread with index srcLane.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, int value, int srcLane)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane);
}

//-----------------------------------------------------------------------------
//! shfl overload for float values.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, int srcLane)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane);
}
} // namespace warp
} // namespace alpaka
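A short usage sketch of the new free function (hypothetical kernel, not part of the PR; it assumes a 1-D block consisting of a single warp and a dst buffer with one element per lane): lane 0 supplies a value and every lane of the warp receives it, mirroring the MPI_Bcast analogy from the documentation above.

// Hypothetical kernel, not from this PR: broadcast src[0] from lane 0 to the whole warp.
struct BroadcastKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC auto operator()(TAcc const& acc, int const* src, int* dst) const -> void
    {
        auto const lane = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
        // Only lane 0 supplies a meaningful value; shfl delivers it to every lane.
        int const mine = (lane == 0) ? src[0] : 0;
        dst[lane] = alpaka::warp::shfl(acc, mine, 0);
    }
};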
16 changes: 16 additions & 0 deletions include/alpaka/warp/WarpSingleThread.hpp
@@ -92,6 +92,22 @@ namespace alpaka
return predicate ? 1u : 0u;
}
};

//#################################################################
template<>
struct Shfl<WarpSingleThread>
{
//-------------------------------------------------------------
static auto shfl(warp::WarpSingleThread const& /*warp*/, int val, int /*srcLane*/)
{
return val;
}
//-------------------------------------------------------------
static auto shfl(warp::WarpSingleThread const& /*warp*/, float val, int /*srcLane*/)
{
return val;
}
};
} // namespace traits
} // namespace warp
} // namespace alpaka
25 changes: 25 additions & 0 deletions include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -142,6 +142,31 @@ namespace alpaka
# else
ignore_unused(warp);
return __ballot(predicate);
# endif
}
};

//#################################################################
template<>
struct Shfl<WarpUniformCudaHipBuiltIn>
{
//-------------------------------------------------------------
__device__ static auto shfl(warp::WarpUniformCudaHipBuiltIn const& warp, float val, int srcLane)
-> float
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, getSize(warp));
# else
return __shfl(val, srcLane, getSize(warp));
# endif
}
//-------------------------------------------------------------
__device__ static auto shfl(warp::WarpUniformCudaHipBuiltIn const& warp, int val, int srcLane) -> int
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, getSize(warp));
# else
return __shfl(val, srcLane, getSize(warp));
# endif
}
};
110 changes: 110 additions & 0 deletions test/unit/warp/src/Shfl.cpp
@@ -0,0 +1,110 @@
/* Copyright 2021 David M. Rogers
*
* This file is part of Alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/test/KernelExecutionFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>
#include <alpaka/test/queue/Queue.hpp>
#include <alpaka/warp/Traits.hpp>

#include <catch2/catch.hpp>

#include <cstdint>

//#############################################################################
class ShflSingleThreadWarpTestKernel
{
public:
//-------------------------------------------------------------------------
ALPAKA_NO_HOST_ACC_WARNING
template<typename TAcc>
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
{
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
ALPAKA_CHECK(*success, warpExtent == 1);

ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 12, 0) == 12);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, -1) == 42);
// ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 3.3f, 0) == 3.3f);
}
};

//#############################################################################
class ShflMultipleThreadWarpTestKernel
{
public:
//-----------------------------------------------------------------------------
ALPAKA_NO_HOST_ACC_WARNING
template<typename TAcc>
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
{
auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
// Test relies on having a single warp per thread block
ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
int const threadIdxInWarp = alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0];

ALPAKA_CHECK(*success, warpExtent > 1);

ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, 0) == 42);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 0) == 0);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 1) == 1);
// fails -- apparently this case wraps, but should probably be undefined
// ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, -1) == threadIdxInWarp);

// Some threads quit the kernel to test that the warp operations
// properly operate on the active threads only
if(threadIdxInWarp >= warpExtent / 2)
return;

for(int idx = 0; idx < warpExtent / 2; idx++)
{
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx) == idx);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx) == 4.0f - float(idx));
}
}
};

//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
{
using Acc = TestType;
using Dev = alpaka::Dev<Acc>;
using Pltf = alpaka::Pltf<Dev>;
using Dim = alpaka::Dim<Acc>;
using Idx = alpaka::Idx<Acc>;

Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
auto const warpExtent = alpaka::getWarpSize(dev);
if(warpExtent == 1)
{
Idx const gridThreadExtentPerDim = 4;
alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(gridThreadExtentPerDim));
ShflSingleThreadWarpTestKernel kernel;
REQUIRE(fixture(kernel));
}
else
{
// Work around gcc 7.5 trying and failing to offload for OpenMP 4.0
#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
return;
#else
using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
// Enforce one warp per thread block
auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
blockThreadExtent[0] = static_cast<Idx>(warpExtent);
auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
auto fixture = ExecutionFixture{workDiv};
ShflMultipleThreadWarpTestKernel kernel;
REQUIRE(fixture(kernel));
#endif
}
}