From af8f62c160391625b6bcdb29e19baec6aa4d345e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Wed, 5 Jun 2024 16:29:53 +0200 Subject: [PATCH] Remove the usage of `ExampleDefaultAcc` in all examples. Examples will now be executed for all enabled accelerators. Fix that `ExampleDefaultAcc` was used in the test `MapIdxPitchBytes`. --- cmake/alpakaCommon.cmake | 2 +- docs/source/basic/library.rst | 2 +- example/bufferCopy/src/bufferCopy.cpp | 33 +++++++--- example/complex/src/complex.cpp | 44 +++++++------- example/convolution1D/src/convolution1D.cpp | 27 ++++++++- example/convolution2D/src/convolution2D.cpp | 27 ++++++++- .../counterBasedRng/src/counterBasedRng.cpp | 38 +++++++----- example/heatEquation/src/heatEquation.cpp | 31 ++++++++-- example/helloWorld/src/helloWorld.cpp | 45 +++++++------- .../helloWorldLambda/src/helloWorldLambda.cpp | 37 ++++++++---- .../src/kernelSpecialization.cpp | 37 ++++++++---- .../src/monteCarloIntegration.cpp | 30 ++++++++-- example/openMPSchedule/src/openMPSchedule.cpp | 1 - .../src/parallelLoopPatterns.cpp | 51 +++++++++++----- example/randomCells2D/src/randomCells2D.cpp | 27 ++++++++- .../randomStrategies/src/randomStrategies.cpp | 60 ++++++++++++++----- .../src/tagSpecialization.cpp | 38 +++++++----- example/vectorAdd/src/vectorAdd.cpp | 37 ++++++++---- include/alpaka/acc/AccCpuThreads.hpp | 2 +- .../alpaka/example/ExecuteForEachAccTag.hpp | 41 +++++++++++++ test/unit/idx/src/MapIdxPitchBytes.cpp | 14 +++-- 21 files changed, 453 insertions(+), 171 deletions(-) create mode 100644 include/alpaka/example/ExecuteForEachAccTag.hpp diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index 7c81504639f1..7d24f8c3a5c1 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -581,7 +581,7 @@ if(alpaka_ACC_GPU_HIP_ENABLE) # https://github.com/llvm/llvm-project/commit/b86e0992bfa6 # https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#150 # for example, is required to create 
alpaka::EnabledAccTags - # TODO(SimeonEhrig): restict HIP version, if first HIP version is release using Clang 19 + # TODO(SimeonEhrig): restrict HIP version, if first HIP version is released using Clang 19 alpaka_set_compiler_options(HOST_DEVICE target alpaka "$<$:SHELL:-frelaxed-template-template-args>") alpaka_compiler_option(HIP_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps 'CMakeFiles/.dir'" OFF) diff --git a/docs/source/basic/library.rst b/docs/source/basic/library.rst index ab25b9555bd5..3a5608b6f5e1 100644 --- a/docs/source/basic/library.rst +++ b/docs/source/basic/library.rst @@ -124,7 +124,7 @@ Kernels can also be defined via lambda expressions. int main() { // ... - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::AccGpuCudaRt; auto kernel = [] ALPAKA_FN_ACC (Acc const & acc /* , ... */) -> void { // ... diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index 12bd4a1a2acb..077797e3af30 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include @@ -64,7 +64,12 @@ struct FillBufferKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. 
+template +auto example(TAccTag const&) -> int { // Define the index domain using Dim = alpaka::DimInt<3u>; @@ -81,7 +86,7 @@ auto main() -> int // - AccCpuTbbBlocks // - AccCpuSerial // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue // @@ -90,12 +95,6 @@ auto main() -> int using DevQueue = alpaka::Queue; // Define the device accelerator - // - // It is possible to choose from a set of accelerators: - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuSerial using Host = alpaka::AccCpuSerial; // Defines the synchronization behavior of a queue // @@ -257,3 +256,19 @@ auto main() -> int return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp index 2e8d62ecdf61..7eb8e44ddd72 100644 --- a/example/complex/src/complex.cpp +++ b/example/complex/src/complex.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -28,29 +28,17 @@ struct ComplexKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. 
If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { using Idx = std::size_t; // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // - // Each accelerator has strengths and weaknesses. Therefore, - // they need to be choosen carefully depending on the actual - // use case. Furthermore, some accelerators only support a - // particular workdiv, but workdiv can also be generated - // automatically. - - // By exchanging the Acc and Queue types you can select where to execute the kernel. - using Acc = alpaka::ExampleDefaultAcc, Idx>; + using Acc = alpaka::TagToAcc, Idx>; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -88,3 +76,19 @@ auto main() -> int return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp index 047f462ef0a7..60e8f7be5b28 100644 --- a/example/convolution1D/src/convolution1D.cpp +++ b/example/convolution1D/src/convolution1D.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -64,7 +64,12 @@ auto FuzzyEqual(float a, float b) -> bool return std::fabs(a - b) < std::numeric_limits::epsilon() * 10.0f; } -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Size of 1D arrays to be used in convolution integral // Here instead of "convolution kernel" the term "filter" is used because kernel has a different meaning in GPU @@ -80,7 +85,7 @@ auto main() -> int using Idx = std::size_t; // Define the accelerator - using DevAcc = alpaka::ExampleDefaultAcc; + using DevAcc = alpaka::TagToAcc; using QueueProperty = alpaka::Blocking; using QueueAcc = alpaka::Queue; using BufAcc = alpaka::Buf; @@ -176,3 +181,19 @@ auto main() -> int std::cout << "All results are correct!\n"; return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. 
+ // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp index e9d1cd5007d7..8de7071c05cc 100644 --- a/example/convolution2D/src/convolution2D.cpp +++ b/example/convolution2D/src/convolution2D.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -208,7 +208,12 @@ auto FuzzyEqual(float a, float b) -> bool return std::fabs(a - b) < std::numeric_limits::epsilon() * 1000.0f; } -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the index domain using Dim = alpaka::DimInt<2>; @@ -216,7 +221,7 @@ auto main() -> int using Idx = std::uint32_t; using Vec = alpaka::Vec; // Define the accelerator - using DevAcc = alpaka::ExampleDefaultAcc; + using DevAcc = alpaka::TagToAcc; using QueueAcc = alpaka::Queue; using DataType = float; @@ -379,3 +384,19 @@ auto main() -> int std::cout << "Sampled result checks are correct!\n"; return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. 
+ // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp index 86da223f1d0a..f25ad1cc954d 100644 --- a/example/counterBasedRng/src/counterBasedRng.cpp +++ b/example/counterBasedRng/src/counterBasedRng.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -92,25 +92,19 @@ class CounterBasedRngKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. 
+template +auto example(TAccTag const&) -> int { // Define the index domain using Dim = alpaka::DimInt<3u>; using Idx = std::size_t; // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuFibers - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; using AccHost = alpaka::AccCpuSerial; @@ -222,3 +216,19 @@ auto main() -> int return EXIT_FAILURE; } } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp index ff7ee6c7dafe..9c98008561f6 100644 --- a/example/heatEquation/src/heatEquation.cpp +++ b/example/heatEquation/src/heatEquation.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -62,7 +62,13 @@ auto exactSolution(double const x, double const t) -> double //! Every time step the kernel will be executed numNodesX-times //! After every step the curr-buffer will be set to the calculated values //! from the next-buffer. -auto main() -> int +//! +//! In standard projects, you typically do not execute the code with any available accelerator. +//! 
Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +//! selected accelerator only. If you use the example as the starting point for your project, you can rename the +//! example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Parameters (a user is supposed to change numNodesX, numTimeSteps) uint32_t const numNodesX = 1000; @@ -84,9 +90,8 @@ auto main() -> int using Dim = alpaka::DimInt<1u>; using Idx = uint32_t; - // Select accelerator-types for host and device - // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + // Define the accelerator + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Select specific devices @@ -179,3 +184,19 @@ auto main() -> int return EXIT_FAILURE; } } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp index 20a28d5be5ba..6d7eb0522272 100644 --- a/example/helloWorld/src/helloWorld.cpp +++ b/example/helloWorld/src/helloWorld.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include @@ -47,7 +47,12 @@ struct HelloWorldKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. 
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the index domain // @@ -59,25 +64,7 @@ auto main() -> int using Idx = std::size_t; // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // - // Each accelerator has strengths and weaknesses. Therefore, - // they need to be chosen carefully depending on the actual - // use case. Furthermore, some accelerators only support a - // particular workdiv, but workdiv can also be generated - // automatically. - - // By exchanging the Acc and Queue types you can select where to execute the kernel. - // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -173,3 +160,19 @@ auto main() -> int return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp index 90296c396f60..298b9b15b619 100644 --- a/example/helloWorldLambda/src/helloWorldLambda.cpp +++ b/example/helloWorldLambda/src/helloWorldLambda.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include @@ -39,7 +39,12 @@ void ALPAKA_FN_ACC hiWorldFunction(TAcc const& acc, size_t const nExclamationMar printf("\n"); } -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // It requires support for extended lambdas when using nvcc as CUDA compiler. 
// Requires sequential backend if CI is used @@ -51,17 +56,7 @@ auto main() -> int using Idx = std::size_t; // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -138,3 +133,19 @@ auto main() -> int return EXIT_SUCCESS; #endif } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp index 57e2ee1e64d7..e0e7d660b561 100644 --- a/example/kernelSpecialization/src/kernelSpecialization.cpp +++ b/example/kernelSpecialization/src/kernelSpecialization.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include @@ -53,21 +53,16 @@ struct Kernel #endif }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. 
If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // // For simplicity this examples always uses 1 dimensional indexing, and index type size_t - using Acc = alpaka::ExampleDefaultAcc, std::size_t>; + using Acc = alpaka::TagToAcc, std::size_t>; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -97,3 +92,19 @@ auto main() -> int return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp index 52e050785c88..d1ed33f0f4f0 100644 --- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp +++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -34,6 +34,7 @@ struct Kernel //! \param numPoints The total number of points to be calculated. //! \param globalCounter The sum of all local results. //! 
\param functor The function for which the integral is to be computed. + ALPAKA_NO_HOST_ACC_WARNING template ALPAKA_FN_ACC auto operator()( TAcc const& acc, @@ -52,7 +53,7 @@ struct Kernel linearizedGlobalThreadIdx, 0); // No specific subsequence start. // For simplicity the interval is fixed to [0.0,1.0]. - auto dist(alpaka::rand::distribution::createUniformReal(acc)); + auto dist = alpaka::rand::distribution::createUniformReal(acc); uint32_t localCount = 0; for(size_t i = linearizedGlobalThreadIdx; i < numPoints; i += globalThreadExtent.prod()) @@ -72,13 +73,18 @@ struct Kernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Defines and setup. using Dim = alpaka::DimInt<1>; using Idx = std::size_t; using Vec = alpaka::Vec; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; using Host = alpaka::DevCpu; auto const platformHost = alpaka::PlatformCpu{}; auto const devHost = alpaka::getDevByIdx(platformHost, 0); @@ -131,3 +137,19 @@ auto main() -> int std::cout << "error: " << error << "\n"; return error > 0.001 ? EXIT_FAILURE : EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp index 23a71eec21ba..a6b5de722fe9 100644 --- a/example/openMPSchedule/src/openMPSchedule.cpp +++ b/example/openMPSchedule/src/openMPSchedule.cpp @@ -3,7 +3,6 @@ */ #include -#include #include #include diff --git a/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp b/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp index 0bd79ab9a26a..8aa36f4248b9 100644 --- a/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp +++ b/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -73,7 +73,7 @@ struct NaiveCudaStyleKernel template ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const { - auto const globalThreadIdx(alpaka::getIdx(acc)[0u]); + auto const globalThreadIdx = alpaka::getIdx(acc)[0u]; // Cuf off threads that have nothing to do if(globalThreadIdx < n) { @@ -140,8 +140,8 @@ struct GridStridedLoopKernel template ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const { - auto const globalThreadExtent(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadIdx(alpaka::getIdx(acc)[0u]); + auto const globalThreadExtent = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadIdx = alpaka::getIdx(acc)[0u]; for(uint32_t dataDomainIdx = globalThreadIdx; dataDomainIdx < n; dataDomainIdx += globalThreadExtent) { auto const memoryIdx = dataDomainIdx; @@ -205,9 +205,9 @@ struct ChunkedGridStridedLoopKernel template ALPAKA_FN_ACC void 
operator()(TAcc const& acc, float* result, uint32_t n) const { - auto const numElements(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadExtent(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadIdx(alpaka::getIdx(acc)[0u]); + auto const numElements = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadExtent = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadIdx = alpaka::getIdx(acc)[0u]; // Additionally could split the loop into peeled and remainder for(uint32_t chunkStart = globalThreadIdx * numElements; chunkStart < n; chunkStart += globalThreadExtent * numElements) @@ -277,8 +277,8 @@ struct NaiveOpenMPStyleKernel template ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const { - auto const globalThreadExtent(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadIdx(alpaka::getIdx(acc)[0u]); + auto const globalThreadExtent = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadIdx = alpaka::getIdx(acc)[0u]; auto const processPerThread = (n + globalThreadExtent - 1) / globalThreadExtent; for(uint32_t dataDomainIdx = globalThreadIdx * processPerThread; (dataDomainIdx < (globalThreadIdx + 1) * processPerThread) && (dataDomainIdx < n); @@ -342,9 +342,9 @@ struct OpenMPSimdStyleKernel template ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const { - auto const numElements(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadExtent(alpaka::getWorkDiv(acc)[0u]); - auto const globalThreadIdx(alpaka::getIdx(acc)[0u]); + auto const numElements = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadExtent = alpaka::getWorkDiv(acc)[0u]; + auto const globalThreadIdx = alpaka::getIdx(acc)[0u]; // This is the number for naive OpenMP style auto const naiveProcessPerThread = (n + globalThreadExtent - 1) / globalThreadExtent; // Round up to multiple of numElements @@ -400,7 +400,12 @@ void openMPSimdStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc) testResult(queue, bufAcc); } -auto main() -> int +// In 
standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the index domain, this example is only for 1d using Dim = alpaka::DimInt<1u>; @@ -417,7 +422,7 @@ auto main() -> int // - AccCpuTbbBlocks // - AccCpuSerial // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Select a device and create queue for it @@ -435,4 +440,22 @@ auto main() -> int chunkedGridStridedLoop(devAcc, queue, bufAcc); naiveOpenMPStyle(devAcc, queue, bufAcc); openMPSimdStyle(devAcc, queue, bufAcc); + + return EXIT_SUCCESS; +} + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); } diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index a0a21370cecc..086757658ce1 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -141,12 +141,17 @@ struct RunTimestepKernelVector } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { using Dim = alpaka::DimInt<2>; using Idx = std::size_t; using Vec = alpaka::Vec; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; using Host = alpaka::DevCpu; auto const platformHost = alpaka::PlatformCpu{}; auto const devHost = alpaka::getDevByIdx(platformHost, 0); @@ -285,3 +290,19 @@ auto main() -> int return 1; } } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = TagCpuSerial; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp index 3dd0c9efde5e..a0a8e2e8172d 100644 --- a/example/randomStrategies/src/randomStrategies.cpp +++ b/example/randomStrategies/src/randomStrategies.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include #include @@ -25,13 +25,14 @@ using RandomEngine = alpaka::rand::Philox4x32x10; /// Parameters to set up the default accelerator, queue, and buffers +template struct Box { // accelerator, queue, and work division typedefs using Dim = alpaka::DimInt<1>; using Idx = std::size_t; using Vec = alpaka::Vec; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; using PlatformHost = alpaka::PlatformCpu; using Host = alpaka::Dev; using PlatformAcc = alpaka::Platform; @@ -194,13 +195,14 @@ struct FillKernel * * File is in TSV format. One line for each "point"; line length is the number of "rolls". */ -void saveDataAndShowAverage(std::string filename, float const* buffer, Box const& box) +template +void saveDataAndShowAverage(std::string filename, float const* buffer, Box const& box) { std::ofstream output(filename); std::cout << "Writing " << filename << " ... " << std::flush; auto const lineLength = box.extentResult[0] / box.extentRand[0]; double average = 0; - for(Box::Idx i = 0; i < box.extentResult[0]; ++i) + for(typename Box::Idx i = 0; i < box.extentResult[0]; ++i) { output << buffer[i] << ((i + 1) % lineLength ? 
"\t" : "\n"); average += buffer[i]; @@ -216,7 +218,8 @@ struct Writer; template<> struct Writer { - static void save(float const* buffer, Box const& box) + template + static void save(float const* buffer, Box const& box) { saveDataAndShowAverage("out_seed.csv", buffer, box); } @@ -225,7 +228,8 @@ struct Writer template<> struct Writer { - static void save(float const* buffer, Box const& box) + template + static void save(float const* buffer, Box const& box) { saveDataAndShowAverage("out_subsequence.csv", buffer, box); } @@ -234,14 +238,15 @@ struct Writer template<> struct Writer { - static void save(float const* buffer, Box const& box) + template + static void save(float const* buffer, Box const& box) { saveDataAndShowAverage("out_offset.csv", buffer, box); } }; -template -void runStrategy(Box& box) +template +void runStrategy(Box& box) { // Set up the pointer to the PRNG states buffer RandomEngine* const ptrBufAccRand{std::data(box.bufAccRand)}; @@ -252,7 +257,7 @@ void runStrategy(Box& box) // of the PRNG buffer and has to be passed in explicitly. 
Other strategies ignore the last parameter, and deduce // the initial parameters solely from the thread index - alpaka::exec( + alpaka::exec::Acc>( box.queue, box.workdivRand, initRandomKernel, @@ -273,13 +278,19 @@ void runStrategy(Box& box) float* const ptrBufAccResult{std::data(box.bufAccResult)}; // Initialise the results buffer to zero - for(Box::Idx i = 0; i < box.extentResult[0]; ++i) + for(typename Box::Idx i = 0; i < box.extentResult[0]; ++i) ptrBufHostResult[i] = 0; // Run the "computation" kernel filling the results buffer with random numbers in parallel alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult); FillKernel fillKernel; - alpaka::exec(box.queue, box.workdivResult, fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult); + alpaka::exec::Acc>( + box.queue, + box.workdivResult, + fillKernel, + box.extentResult, + ptrBufAccRand, + ptrBufAccResult); alpaka::memcpy(box.queue, box.bufHostResult, box.bufAccResult); alpaka::wait(box.queue); @@ -287,9 +298,14 @@ void runStrategy(Box& box) Writer::save(ptrBufHostResult, box); } -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { - Box box; // Initialize the box + Box box; // Initialize the box runStrategy(box); // threads start from different seeds runStrategy(box); // threads use different subsequences @@ -297,3 +313,19 @@ auto main() -> int return 0; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = alpaka::TagCpuSerial{}; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/tagSpecialization/src/tagSpecialization.cpp b/example/tagSpecialization/src/tagSpecialization.cpp index 534aa29aeda8..dac94bbaa32b 100644 --- a/example/tagSpecialization/src/tagSpecialization.cpp +++ b/example/tagSpecialization/src/tagSpecialization.cpp @@ -3,7 +3,7 @@ */ #include -#include +#include #include @@ -76,22 +76,16 @@ struct WrapperKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuFibers - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // // For simplicity this examples always uses 1 dimensional indexing, and index type size_t - using Acc = alpaka::ExampleDefaultAcc, std::size_t>; + using Acc = alpaka::TagToAcc, std::size_t>; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Call the specialized functions @@ -122,3 +116,19 @@ auto main() -> int alpaka::wait(queue); return EXIT_SUCCESS; } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator.
+ // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = alpaka::TagCpuSerial{}; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index 5eca205279c8..b24a4f156119 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include @@ -54,7 +54,12 @@ class VectorAddKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int { // Define the index domain // Set the number of dimensions as an integral constant. Set to 1 for 1D.
@@ -62,17 +67,7 @@ auto main() -> int using Idx = std::size_t; // Define the accelerator - // - // It is possible to choose from a set of accelerators: - // - AccGpuCudaRt - // - AccGpuHipRt - // - AccCpuThreads - // - AccCpuOmp2Threads - // - AccCpuOmp2Blocks - // - AccCpuTbbBlocks - // - AccCpuSerial - // using Acc = alpaka::AccCpuSerial; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; using DevAcc = alpaka::Dev; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; @@ -198,3 +193,19 @@ auto main() -> int return EXIT_FAILURE; } } + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. + // \code{.cpp} + // auto tag = alpaka::TagCpuSerial{}; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); +} diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp index f4984b63d734..ce8f04a73d99 100644 --- a/include/alpaka/acc/AccCpuThreads.hpp +++ b/include/alpaka/acc/AccCpuThreads.hpp @@ -143,7 +143,7 @@ namespace alpaka ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps { # ifdef ALPAKA_CI - auto const blockThreadCountMax(static_cast(8)); + auto const blockThreadCountMax = static_cast(8); # else // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation // defined maximum where the creation of a new thread crashes.
std::thread::hardware_concurrency can diff --git a/include/alpaka/example/ExecuteForEachAccTag.hpp b/include/alpaka/example/ExecuteForEachAccTag.hpp new file mode 100644 index 000000000000..84247ab55304 --- /dev/null +++ b/include/alpaka/example/ExecuteForEachAccTag.hpp @@ -0,0 +1,41 @@ +/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "alpaka/alpaka.hpp" + +#include +#include +#include + +#pragma once + +namespace alpaka +{ + //! execute a callable for each active accelerator tag + // + // @param callable callable which can be invoked with an accelerator tag + // @return disjunction of the invocation results (stops at the first result that is true) + // + template + inline auto executeForEachAccTag(TCallable&& callable) + { + // Execute the callable once for each enabled accelerator. + // Pass the tag as first argument to the callable. + return std::apply( + [=](auto const&... tags) { return (callable(tags) || ...); }, + alpaka::EnabledAccTags{}); + } + + template + inline auto executeForEachAccTag(TCallable (*callable)(TArg)) + { + // Execute the callable once for each enabled accelerator. + // Pass the tag as first argument to the callable. + return std::apply( + [=](auto const&...
tags) { return (callable(tags) || ...); }, + alpaka::EnabledAccTags{}); + } + + +} // namespace alpaka diff --git a/test/unit/idx/src/MapIdxPitchBytes.cpp b/test/unit/idx/src/MapIdxPitchBytes.cpp index 327d9a46fa0b..6a341cac27cd 100644 --- a/test/unit/idx/src/MapIdxPitchBytes.cpp +++ b/test/unit/idx/src/MapIdxPitchBytes.cpp @@ -3,7 +3,6 @@ */ #include -#include #include #include #include @@ -14,15 +13,16 @@ #include #include -TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims) +template +auto mapIdxPitchBytes(TAccTag const&) { - using Dim = TestType; + using Dim = TDim; using Idx = std::size_t; using Vec = alpaka::Vec; auto const extentNd = alpaka::test::extentBuf; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::TagToAcc; using Elem = std::uint8_t; auto const platformAcc = alpaka::Platform{}; auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); @@ -45,3 +45,9 @@ TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDi // roundtrip REQUIRE(idxNd == idxNdResult); } + +TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims) +{ + // execute the example once for each enabled accelerator + std::apply([](auto const&... tags) { (mapIdxPitchBytes(tags), ...); }, alpaka::EnabledAccTags{}); +}