From af8f62c160391625b6bcdb29e19baec6aa4d345e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?= <r.widera@hzdr.de>
Date: Wed, 5 Jun 2024 16:29:53 +0200
Subject: [PATCH] Remove the usage of `ExampleDefaultAcc` in all examples.

Examples will now be executed for all enabled accelerators.
Fix that `ExampleDefaultAcc` was used in the test `MapIdxPitchBytes`.
---
 cmake/alpakaCommon.cmake                      |  2 +-
 docs/source/basic/library.rst                 |  2 +-
 example/bufferCopy/src/bufferCopy.cpp         | 33 +++++++---
 example/complex/src/complex.cpp               | 44 +++++++-------
 example/convolution1D/src/convolution1D.cpp   | 27 ++++++++-
 example/convolution2D/src/convolution2D.cpp   | 27 ++++++++-
 .../counterBasedRng/src/counterBasedRng.cpp   | 38 +++++++-----
 example/heatEquation/src/heatEquation.cpp     | 31 ++++++++--
 example/helloWorld/src/helloWorld.cpp         | 45 +++++++-------
 .../helloWorldLambda/src/helloWorldLambda.cpp | 37 ++++++++----
 .../src/kernelSpecialization.cpp              | 37 ++++++++----
 .../src/monteCarloIntegration.cpp             | 30 ++++++++--
 example/openMPSchedule/src/openMPSchedule.cpp |  1 -
 .../src/parallelLoopPatterns.cpp              | 51 +++++++++++-----
 example/randomCells2D/src/randomCells2D.cpp   | 27 ++++++++-
 .../randomStrategies/src/randomStrategies.cpp | 60 ++++++++++++++-----
 .../src/tagSpecialization.cpp                 | 38 +++++++-----
 example/vectorAdd/src/vectorAdd.cpp           | 37 ++++++++----
 include/alpaka/acc/AccCpuThreads.hpp          |  2 +-
 .../alpaka/example/ExecuteForEachAccTag.hpp   | 41 +++++++++++++
 test/unit/idx/src/MapIdxPitchBytes.cpp        | 14 +++--
 21 files changed, 453 insertions(+), 171 deletions(-)
 create mode 100644 include/alpaka/example/ExecuteForEachAccTag.hpp
diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake
index 7c81504639f1..7d24f8c3a5c1 100644
--- a/cmake/alpakaCommon.cmake
+++ b/cmake/alpakaCommon.cmake
@@ -581,7 +581,7 @@ if(alpaka_ACC_GPU_HIP_ENABLE)
         # https://github.com/llvm/llvm-project/commit/b86e0992bfa6
         # https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#150
         # for example, is required to create alpaka::EnabledAccTags
-        # TODO(SimeonEhrig): restict HIP version, if first HIP version is release using Clang 19 
+        # TODO(SimeonEhrig): restict HIP version, if first HIP version is release using Clang 19
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "$<$<COMPILE_LANGUAGE:HIP>:SHELL:-frelaxed-template-template-args>")
 
         alpaka_compiler_option(HIP_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps 'CMakeFiles/<targetname>.dir'" OFF)
diff --git a/docs/source/basic/library.rst b/docs/source/basic/library.rst
index ab25b9555bd5..3a5608b6f5e1 100644
--- a/docs/source/basic/library.rst
+++ b/docs/source/basic/library.rst
@@ -124,7 +124,7 @@ Kernels can also be defined via lambda expressions.
 
       int main() {
           // ...
-	  using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+	  using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
 	  auto kernel = [] ALPAKA_FN_ACC (Acc const & acc /* , ... */) -> void {
 	      // ...
diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp
index 12bd4a1a2acb..077797e3af30 100644
--- a/example/bufferCopy/src/bufferCopy.cpp
+++ b/example/bufferCopy/src/bufferCopy.cpp
@@ -4,7 +4,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -64,7 +64,12 @@ struct FillBufferKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain
     using Dim = alpaka::DimInt<3u>;
@@ -81,7 +86,7 @@ auto main() -> int
     // - AccCpuTbbBlocks
     // - AccCpuSerial
     // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
     // Defines the synchronization behavior of a queue
     //
@@ -90,12 +95,6 @@ auto main() -> int
     using DevQueue = alpaka::Queue<Acc, AccQueueProperty>;
 
     // Define the device accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuSerial
     using Host = alpaka::AccCpuSerial<Dim, Idx>;
     // Defines the synchronization behavior of a queue
     //
@@ -257,3 +256,19 @@ auto main() -> int
 
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp
index 2e8d62ecdf61..7eb8e44ddd72 100644
--- a/example/complex/src/complex.cpp
+++ b/example/complex/src/complex.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -28,29 +28,17 @@ struct ComplexKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
-    // Each accelerator has strengths and weaknesses. Therefore,
-    // they need to be choosen carefully depending on the actual
-    // use case. Furthermore, some accelerators only support a
-    // particular workdiv, but workdiv can also be generated
-    // automatically.
-
-    // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -88,3 +76,19 @@ auto main() -> int
 
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp
index 047f462ef0a7..60e8f7be5b28 100644
--- a/example/convolution1D/src/convolution1D.cpp
+++ b/example/convolution1D/src/convolution1D.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cmath>
 #include <iomanip>
@@ -64,7 +64,12 @@ auto FuzzyEqual(float a, float b) -> bool
     return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 10.0f;
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Size of 1D arrays to be used in convolution integral
     // Here instead of "convolution kernel" the term "filter" is used because kernel has a different meaning in GPU
@@ -80,7 +85,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using DevAcc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using QueueProperty = alpaka::Blocking;
     using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
     using BufAcc = alpaka::Buf<DevAcc, DataType, Dim, Idx>;
@@ -176,3 +181,19 @@ auto main() -> int
     std::cout << "All results are correct!\n";
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
index e9d1cd5007d7..8de7071c05cc 100644
--- a/example/convolution2D/src/convolution2D.cpp
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iomanip>
 #include <iostream>
@@ -208,7 +208,12 @@ auto FuzzyEqual(float a, float b) -> bool
     return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 1000.0f;
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain
     using Dim = alpaka::DimInt<2>;
@@ -216,7 +221,7 @@ auto main() -> int
     using Idx = std::uint32_t;
     using Vec = alpaka::Vec<Dim, Idx>;
     // Define the accelerator
-    using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using DevAcc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 
     using DataType = float;
@@ -379,3 +384,19 @@ auto main() -> int
     std::cout << "Sampled result checks are correct!\n";
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp
index 86da223f1d0a..f25ad1cc954d 100644
--- a/example/counterBasedRng/src/counterBasedRng.cpp
+++ b/example/counterBasedRng/src/counterBasedRng.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 #include <alpaka/rand/RandPhiloxStateless.hpp>
 
 #include <chrono>
@@ -92,25 +92,19 @@ class CounterBasedRngKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain
     using Dim = alpaka::DimInt<3u>;
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     using AccHost = alpaka::AccCpuSerial<Dim, Idx>;
@@ -222,3 +216,19 @@ auto main() -> int
         return EXIT_FAILURE;
     }
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp
index ff7ee6c7dafe..9c98008561f6 100644
--- a/example/heatEquation/src/heatEquation.cpp
+++ b/example/heatEquation/src/heatEquation.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <algorithm>
 #include <cmath>
@@ -62,7 +62,13 @@ auto exactSolution(double const x, double const t) -> double
 //! Every time step the kernel will be executed numNodesX-times
 //! After every step the curr-buffer will be set to the calculated values
 //! from the next-buffer.
-auto main() -> int
+//!
+//! In standard projects, you typically do not execute the code with any available accelerator.
+//! Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+//! selected accelerator only. If you use the example as the starting point for your project, you can rename the
+//! example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Parameters (a user is supposed to change numNodesX, numTimeSteps)
     uint32_t const numNodesX = 1000;
@@ -84,9 +90,8 @@ auto main() -> int
     using Dim = alpaka::DimInt<1u>;
     using Idx = uint32_t;
 
-    // Select accelerator-types for host and device
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    // Define the accelerator
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Select specific devices
@@ -179,3 +184,19 @@ auto main() -> int
         return EXIT_FAILURE;
     }
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp
index 20a28d5be5ba..6d7eb0522272 100644
--- a/example/helloWorld/src/helloWorld.cpp
+++ b/example/helloWorld/src/helloWorld.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -47,7 +47,12 @@ struct HelloWorldKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain
     //
@@ -59,25 +64,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
-    // Each accelerator has strengths and weaknesses. Therefore,
-    // they need to be chosen carefully depending on the actual
-    // use case. Furthermore, some accelerators only support a
-    // particular workdiv, but workdiv can also be generated
-    // automatically.
-
-    // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -173,3 +160,19 @@ auto main() -> int
 
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp
index 90296c396f60..298b9b15b619 100644
--- a/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ b/example/helloWorldLambda/src/helloWorldLambda.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <functional>
 
@@ -39,7 +39,12 @@ void ALPAKA_FN_ACC hiWorldFunction(TAcc const& acc, size_t const nExclamationMar
     printf("\n");
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
 // It requires support for extended lambdas when using nvcc as CUDA compiler.
 // Requires sequential backend if CI is used
@@ -51,17 +56,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -138,3 +133,19 @@ auto main() -> int
     return EXIT_SUCCESS;
 #endif
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp
index 57e2ee1e64d7..e0e7d660b561 100644
--- a/example/kernelSpecialization/src/kernelSpecialization.cpp
+++ b/example/kernelSpecialization/src/kernelSpecialization.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -53,21 +53,16 @@ struct Kernel
 #endif
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
     // For simplicity this examples always uses 1 dimensional indexing, and index type size_t
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -97,3 +92,19 @@ auto main() -> int
 
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
index 52e050785c88..d1ed33f0f4f0 100644
--- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp
+++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <cstdlib>
@@ -34,6 +34,7 @@ struct Kernel
     //! \param numPoints The total number of points to be calculated.
     //! \param globalCounter The sum of all local results.
     //! \param functor The function for which the integral is to be computed.
+    ALPAKA_NO_HOST_ACC_WARNING
     template<typename TAcc, typename TFunctor>
     ALPAKA_FN_ACC auto operator()(
         TAcc const& acc,
@@ -52,7 +53,7 @@ struct Kernel
             linearizedGlobalThreadIdx,
             0); // No specific subsequence start.
         // For simplicity the interval is fixed to [0.0,1.0].
-        auto dist(alpaka::rand::distribution::createUniformReal<float>(acc));
+        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);
 
         uint32_t localCount = 0;
         for(size_t i = linearizedGlobalThreadIdx; i < numPoints; i += globalThreadExtent.prod())
@@ -72,13 +73,18 @@ struct Kernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Defines and setup.
     using Dim = alpaka::DimInt<1>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Host = alpaka::DevCpu;
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -131,3 +137,19 @@ auto main() -> int
     std::cout << "error: " << error << "\n";
     return error > 0.001 ? EXIT_FAILURE : EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp
index 23a71eec21ba..a6b5de722fe9 100644
--- a/example/openMPSchedule/src/openMPSchedule.cpp
+++ b/example/openMPSchedule/src/openMPSchedule.cpp
@@ -3,7 +3,6 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
 
 #include <cstdint>
 #include <iostream>
diff --git a/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp b/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
index 0bd79ab9a26a..8aa36f4248b9 100644
--- a/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
+++ b/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 #include <typeinfo>
@@ -73,7 +73,7 @@ struct NaiveCudaStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // Cuf off threads that have nothing to do
         if(globalThreadIdx < n)
         {
@@ -140,8 +140,8 @@ struct GridStridedLoopKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         for(uint32_t dataDomainIdx = globalThreadIdx; dataDomainIdx < n; dataDomainIdx += globalThreadExtent)
         {
             auto const memoryIdx = dataDomainIdx;
@@ -205,9 +205,9 @@ struct ChunkedGridStridedLoopKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const numElements(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const numElements = alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u];
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // Additionally could split the loop into peeled and remainder
         for(uint32_t chunkStart = globalThreadIdx * numElements; chunkStart < n;
             chunkStart += globalThreadExtent * numElements)
@@ -277,8 +277,8 @@ struct NaiveOpenMPStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         auto const processPerThread = (n + globalThreadExtent - 1) / globalThreadExtent;
         for(uint32_t dataDomainIdx = globalThreadIdx * processPerThread;
             (dataDomainIdx < (globalThreadIdx + 1) * processPerThread) && (dataDomainIdx < n);
@@ -342,9 +342,9 @@ struct OpenMPSimdStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const numElements(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const numElements = alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u];
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // This is the number for naive OpenMP style
         auto const naiveProcessPerThread = (n + globalThreadExtent - 1) / globalThreadExtent;
         // Round up to multiple of numElements
@@ -400,7 +400,12 @@ void openMPSimdStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     testResult(queue, bufAcc);
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain, this example is only for 1d
     using Dim = alpaka::DimInt<1u>;
@@ -417,7 +422,7 @@ auto main() -> int
     // - AccCpuTbbBlocks
     // - AccCpuSerial
     // using Acc = alpaka::AccCpuSerial<Dim, uint32_t>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, uint32_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, uint32_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Select a device and create queue for it
@@ -435,4 +440,22 @@ auto main() -> int
     chunkedGridStridedLoop<Acc>(devAcc, queue, bufAcc);
     naiveOpenMPStyle<Acc>(devAcc, queue, bufAcc);
     openMPSimdStyle<Acc>(devAcc, queue, bufAcc);
+
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp
index a0a21370cecc..086757658ce1 100644
--- a/example/randomCells2D/src/randomCells2D.cpp
+++ b/example/randomCells2D/src/randomCells2D.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <cstdlib>
@@ -141,12 +141,17 @@ struct RunTimestepKernelVector
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     using Dim = alpaka::DimInt<2>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Host = alpaka::DevCpu;
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -285,3 +290,19 @@ auto main() -> int
         return 1;
     }
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp
index 3dd0c9efde5e..a0a8e2e8172d 100644
--- a/example/randomStrategies/src/randomStrategies.cpp
+++ b/example/randomStrategies/src/randomStrategies.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <chrono>
 #include <cstdint>
@@ -25,13 +25,14 @@ using RandomEngine = alpaka::rand::Philox4x32x10;
 
 
 /// Parameters to set up the default accelerator, queue, and buffers
+template<typename TAccTag>
 struct Box
 {
     // accelerator, queue, and work division typedefs
     using Dim = alpaka::DimInt<1>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using PlatformHost = alpaka::PlatformCpu;
     using Host = alpaka::Dev<PlatformHost>;
     using PlatformAcc = alpaka::Platform<Acc>;
@@ -194,13 +195,14 @@ struct FillKernel
  *
  *  File is in TSV format. One line for each "point"; line length is the number of "rolls".
  */
-void saveDataAndShowAverage(std::string filename, float const* buffer, Box const& box)
+template<typename TAccTag>
+void saveDataAndShowAverage(std::string filename, float const* buffer, Box<TAccTag> const& box)
 {
     std::ofstream output(filename);
     std::cout << "Writing " << filename << " ... " << std::flush;
     auto const lineLength = box.extentResult[0] / box.extentRand[0];
     double average = 0;
-    for(Box::Idx i = 0; i < box.extentResult[0]; ++i)
+    for(typename Box<TAccTag>::Idx i = 0; i < box.extentResult[0]; ++i)
     {
         output << buffer[i] << ((i + 1) % lineLength ? "\t" : "\n");
         average += buffer[i];
@@ -216,7 +218,8 @@ struct Writer;
 template<>
 struct Writer<Strategy::seed>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_seed.csv", buffer, box);
     }
@@ -225,7 +228,8 @@ struct Writer<Strategy::seed>
 template<>
 struct Writer<Strategy::subsequence>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_subsequence.csv", buffer, box);
     }
@@ -234,14 +238,15 @@ struct Writer<Strategy::subsequence>
 template<>
 struct Writer<Strategy::offset>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_offset.csv", buffer, box);
     }
 };
 
-template<Strategy TStrategy>
-void runStrategy(Box& box)
+template<Strategy TStrategy, typename TAccTag>
+void runStrategy(Box<TAccTag>& box)
 {
     // Set up the pointer to the PRNG states buffer
     RandomEngine* const ptrBufAccRand{std::data(box.bufAccRand)};
@@ -252,7 +257,7 @@ void runStrategy(Box& box)
     // of the PRNG buffer and has to be passed in explicitly. Other strategies ignore the last parameter, and deduce
     // the initial parameters solely from the thread index
 
-    alpaka::exec<Box::Acc>(
+    alpaka::exec<typename Box<TAccTag>::Acc>(
         box.queue,
         box.workdivRand,
         initRandomKernel,
@@ -273,13 +278,19 @@ void runStrategy(Box& box)
     float* const ptrBufAccResult{std::data(box.bufAccResult)};
 
     // Initialise the results buffer to zero
-    for(Box::Idx i = 0; i < box.extentResult[0]; ++i)
+    for(typename Box<TAccTag>::Idx i = 0; i < box.extentResult[0]; ++i)
         ptrBufHostResult[i] = 0;
 
     // Run the "computation" kernel filling the results buffer with random numbers in parallel
     alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult);
     FillKernel fillKernel;
-    alpaka::exec<Box::Acc>(box.queue, box.workdivResult, fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
+    alpaka::exec<typename Box<TAccTag>::Acc>(
+        box.queue,
+        box.workdivResult,
+        fillKernel,
+        box.extentResult,
+        ptrBufAccRand,
+        ptrBufAccResult);
     alpaka::memcpy(box.queue, box.bufHostResult, box.bufAccResult);
     alpaka::wait(box.queue);
 
@@ -287,9 +298,14 @@ void runStrategy(Box& box)
     Writer<TStrategy>::save(ptrBufHostResult, box);
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-    Box box; // Initialize the box
+    Box<TAccTag> box; // Initialize the box
 
     runStrategy<Strategy::seed>(box); // threads start from different seeds
     runStrategy<Strategy::subsequence>(box); // threads use different subsequences
@@ -297,3 +313,19 @@ auto main() -> int
 
     return 0;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/tagSpecialization/src/tagSpecialization.cpp b/example/tagSpecialization/src/tagSpecialization.cpp
index 534aa29aeda8..dac94bbaa32b 100644
--- a/example/tagSpecialization/src/tagSpecialization.cpp
+++ b/example/tagSpecialization/src/tagSpecialization.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -76,22 +76,16 @@ struct WrapperKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
     // For simplicity this examples always uses 1 dimensional indexing, and index type size_t
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Call the specialized functions
@@ -122,3 +116,19 @@ auto main() -> int
     alpaka::wait(queue);
     return EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index 5eca205279c8..b24a4f156119 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -4,7 +4,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <chrono>
 #include <iostream>
@@ -54,7 +54,12 @@ class VectorAddKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Define the index domain
     // Set the number of dimensions as an integral constant. Set to 1 for 1D.
@@ -62,17 +67,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using DevAcc = alpaka::Dev<Acc>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
@@ -198,3 +193,19 @@ auto main() -> int
         return EXIT_FAILURE;
     }
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp
index f4984b63d734..ce8f04a73d99 100644
--- a/include/alpaka/acc/AccCpuThreads.hpp
+++ b/include/alpaka/acc/AccCpuThreads.hpp
@@ -143,7 +143,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
 #    ifdef ALPAKA_CI
-                auto const blockThreadCountMax(static_cast<TIdx>(8));
+                auto const blockThreadCountMax = static_cast<TIdx>(8);
 #    else
                 // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation
                 // defined maximum where the creation of a new thread crashes. std::thread::hardware_concurrency can
diff --git a/include/alpaka/example/ExecuteForEachAccTag.hpp b/include/alpaka/example/ExecuteForEachAccTag.hpp
new file mode 100644
index 000000000000..84247ab55304
--- /dev/null
+++ b/include/alpaka/example/ExecuteForEachAccTag.hpp
@@ -0,0 +1,41 @@
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/alpaka.hpp"
+
+#include <functional>
+#include <tuple>
+#include <utility>
+
+#pragma once
+
+namespace alpaka
+{
+    //! execute a callable for each active accelerator tag
+    //
+    // @param callable callable which can be invoked with an accelerator tag
+    // @return disjunction of all invokation results
+    //
+    template<typename TCallable>
+    inline auto executeForEachAccTag(TCallable&& callable)
+    {
+        // Execute the callable once for each enabled accelerator.
+        // Pass the tag as first argument to the callable.
+        return std::apply(
+            [=](auto const&... tags) { return (callable(tags) || ...); },
+            alpaka::EnabledAccTags{});
+    }
+
+    template<typename TCallable,typename TArg>
+    inline auto executeForEachAccTag(TCallable (*callable)(TArg))
+    {
+        // Execute the callable once for each enabled accelerator.
+        // Pass the tag as first argument to the callable.
+        return std::apply(
+            [=](auto const&... tags) { return (callable(tags) || ...); },
+            alpaka::EnabledAccTags{});
+    }
+
+
+} // namespace alpaka
diff --git a/test/unit/idx/src/MapIdxPitchBytes.cpp b/test/unit/idx/src/MapIdxPitchBytes.cpp
index 327d9a46fa0b..6a341cac27cd 100644
--- a/test/unit/idx/src/MapIdxPitchBytes.cpp
+++ b/test/unit/idx/src/MapIdxPitchBytes.cpp
@@ -3,7 +3,6 @@
  */
 
 #include <alpaka/dev/Traits.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
 #include <alpaka/idx/Accessors.hpp>
 #include <alpaka/idx/MapIdx.hpp>
 #include <alpaka/mem/view/ViewPlainPtr.hpp>
@@ -14,15 +13,16 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
-TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims)
+template<typename TDim, typename TAccTag>
+auto mapIdxPitchBytes(TAccTag const&)
 {
-    using Dim = TestType;
+    using Dim = TDim;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
 
     auto const extentNd = alpaka::test::extentBuf<Dim, Idx>;
 
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Elem = std::uint8_t;
     auto const platformAcc = alpaka::Platform<Acc>{};
     auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
@@ -45,3 +45,9 @@ TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDi
     // roundtrip
     REQUIRE(idxNd == idxNdResult);
 }
+
+TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims)
+{
+    // execute the example once for each enabled accelerator
+    std::apply([](auto const&... tags) { (mapIdxPitchBytes<TestType>(tags), ...); }, alpaka::EnabledAccTags{});
+}