Added warp::shfl functionality. #1273

Merged
5 commits merged on Mar 29, 2021
Changes from 2 commits
32 changes: 32 additions & 0 deletions include/alpaka/warp/Traits.hpp
@@ -49,6 +49,11 @@ namespace alpaka
template<typename TWarp, typename TSfinae = void>
struct Ballot;

//#############################################################################
//! The shfl warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct Shfl;

//#############################################################################
//! The active mask trait.
template<typename TWarp, typename TSfinae = void>
@@ -150,5 +155,32 @@ namespace alpaka
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Ballot<ImplementationBase>::ballot(warp, predicate);
}

//-----------------------------------------------------------------------------
//! Broadcasts data from one thread to all members of the warp.
//! Similar to MPI_Bcast, but using srcLane instead of root.
//!
Member
IMO we need to add to the documentation that this shfl function is collective, which means all threads need to call it, and from the same code branch.
The reason is that the CUDA implementation uses activemask, while for HIP all threads in a warp need to call the function. Using activemask means that if threads from the if and else branches call the function, they will not see each other.
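A minimal illustration of the pitfall described in this comment (not part of the PR; it assumes a 1-D block that holds exactly one warp, as the unit test below also requires): both branches call shfl, but each call only involves the lanes of its own branch under the activemask-based CUDA implementation, and it is undefined on HIP and CUDA devices before sm_70.

// Hypothetical sketch, not from this PR: divergent calls to warp::shfl.
template<typename TAcc>
ALPAKA_FN_ACC auto divergentShflPitfall(TAcc const& acc, int laneValue) -> int
{
    auto const lane = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
    if(lane % 2 == 0)
    {
        // Only the even lanes are active here: with __shfl_sync(activemask, ...) the
        // broadcast is restricted to them; on HIP and CUDA before sm_70 it is undefined.
        return alpaka::warp::shfl(acc, laneValue, 0);
    }
    // The odd lanes form a second, equally restricted collective.
    return alpaka::warp::shfl(acc, laneValue, 0);
}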

Contributor Author

I updated these docs to include this warning.

Member

I think I forgot to add a similar warning to the previously existing warp collectives. Your comment also applies to those, right @psychocoderHPC?

Member

@sbastrakov Yes, this should be added to the other warp functions too. Currently, only CUDA allows calling warp functions from different branches. It is fine if all threads of the warp are in the same branch, but as soon as the threads diverge the behavior is undefined (for HIP and for CUDA devices before sm_70).
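A sketch of the portable pattern this implies (not from the PR): do the divergent work inside the branches, but let every thread of the warp reach the collective call from the same code location.

// Hypothetical sketch: compute in divergent branches, reconverge, then call shfl.
template<typename TAcc>
ALPAKA_FN_ACC auto convergedShfl(TAcc const& acc, bool flag) -> int
{
    int value = 0;
    if(flag)
        value = 1; // divergent work is fine as long as no warp collective is called here
    else
        value = 2;
    // Every thread of the warp reaches this call from the same place.
    return alpaka::warp::shfl(acc, value, 0);
}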

//! \tparam TWarp The warp implementation type.
//! \param warp The warp implementation.
//! \param value The value to broadcast (only meaningful from threadIdx == srcLane)
//! \param srcLane The source lane sending value.
//! \return value from the thread with index srcLane.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, int value, int srcLane)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane);
}

//-----------------------------------------------------------------------------
//! shfl overload for float values.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, int srcLane)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane);
}
} // namespace warp
} // namespace alpaka
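A short usage sketch of the new free function (hypothetical kernel, not part of the PR; it assumes a 1-D block consisting of a single warp and a dst buffer with one element per lane): lane 0 supplies a value and every lane of the warp receives it, mirroring the MPI_Bcast analogy from the documentation above.

// Hypothetical kernel, not from this PR: broadcast src[0] from lane 0 to the whole warp.
struct BroadcastKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC auto operator()(TAcc const& acc, int const* src, int* dst) const -> void
    {
        auto const lane = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
        // Only lane 0 supplies a meaningful value; shfl delivers it to every lane.
        int const mine = (lane == 0) ? src[0] : 0;
        dst[lane] = alpaka::warp::shfl(acc, mine, 0);
    }
};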
16 changes: 16 additions & 0 deletions include/alpaka/warp/WarpSingleThread.hpp
@@ -92,6 +92,22 @@ namespace alpaka
return predicate ? 1u : 0u;
}
};

//#################################################################
template<>
struct Shfl<WarpSingleThread>
{
//-------------------------------------------------------------
static auto shfl(warp::WarpSingleThread const& /*warp*/, int val, int /*srcLane*/)
{
return val;
}
//-------------------------------------------------------------
static auto shfl(warp::WarpSingleThread const& /*warp*/, float val, int /*srcLane*/)
{
return val;
}
};
} // namespace traits
} // namespace warp
} // namespace alpaka
25 changes: 25 additions & 0 deletions include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -142,6 +142,31 @@ namespace alpaka
# else
ignore_unused(warp);
return __ballot(predicate);
# endif
}
};

//#################################################################
template<>
struct Shfl<WarpUniformCudaHipBuiltIn>
{
//-------------------------------------------------------------
__device__ static auto shfl(warp::WarpUniformCudaHipBuiltIn const& warp, float val, int srcLane)
-> float
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, getSize(warp));
# else
return __shfl(val, srcLane, getSize(warp));
# endif
}
//-------------------------------------------------------------
__device__ static auto shfl(warp::WarpUniformCudaHipBuiltIn const& warp, int val, int srcLane) -> int
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, getSize(warp));
# else
return __shfl(val, srcLane, getSize(warp));
# endif
}
};
110 changes: 110 additions & 0 deletions test/unit/warp/src/Shfl.cpp
@@ -0,0 +1,110 @@
/* Copyright 2021 David M. Rogers
*
* This file is part of Alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/test/KernelExecutionFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>
#include <alpaka/test/queue/Queue.hpp>
#include <alpaka/warp/Traits.hpp>

#include <catch2/catch.hpp>

#include <cstdint>

//#############################################################################
class ShflSingleThreadWarpTestKernel
{
public:
//-------------------------------------------------------------------------
ALPAKA_NO_HOST_ACC_WARNING
template<typename TAcc>
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
{
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
ALPAKA_CHECK(*success, warpExtent == 1);

ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 12, 0) == 12);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, -1) == 42);
// ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 3.3f, 0) == 3.3f);
}
};

//#############################################################################
class ShflMultipleThreadWarpTestKernel
{
public:
//-----------------------------------------------------------------------------
ALPAKA_NO_HOST_ACC_WARNING
template<typename TAcc>
ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
{
auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
std::int32_t const warpExtent = alpaka::warp::getSize(acc);
// Test relies on having a single warp per thread block
ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
int const threadIdxInWarp = alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0];

ALPAKA_CHECK(*success, warpExtent > 1);

ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, 0) == 42);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 0) == 0);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 1) == 1);
// fails -- apparently this case wraps, but should probably be undefined
// ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, -1) == threadIdxInWarp);

// Some threads quit the kernel to test that the warp operations
// properly operate on the active threads only
if(threadIdxInWarp >= warpExtent / 2)
return;

for(int idx = 0; idx < warpExtent / 2; idx++)
{
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx) == idx);
ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx) == 4.0f - float(idx));
}
}
};

//-----------------------------------------------------------------------------
TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
{
using Acc = TestType;
using Dev = alpaka::Dev<Acc>;
using Pltf = alpaka::Pltf<Dev>;
using Dim = alpaka::Dim<Acc>;
using Idx = alpaka::Idx<Acc>;

Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
auto const warpExtent = alpaka::getWarpSize(dev);
if(warpExtent == 1)
{
Idx const gridThreadExtentPerDim = 4;
alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(gridThreadExtentPerDim));
ShflSingleThreadWarpTestKernel kernel;
REQUIRE(fixture(kernel));
}
else
{
// Work around gcc 7.5 trying and failing to offload for OpenMP 4.0
#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
return;
#else
using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
// Enforce one warp per thread block
auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
blockThreadExtent[0] = static_cast<Idx>(warpExtent);
auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
auto fixture = ExecutionFixture{workDiv};
ShflMultipleThreadWarpTestKernel kernel;
REQUIRE(fixture(kernel));
#endif
}
}