From d83def0161125da43446be07d904be90c4cdc356 Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu <mehmetyusufoglu01@gmail.com>
Date: Tue, 6 Aug 2024 11:56:36 +0200
Subject: [PATCH] refactor examples and tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- rename the `bundeledKernel*` variables to `kernelBundle*`
- use the kernel bundle everywhere we already have this object for
  the automatic work division creation

Co-authored-by: René Widera <r.widera@hzdr.de>
---
 docs/source/basic/cheatsheet.rst              |  4 +--
 example/bufferCopy/src/bufferCopy.cpp         | 24 +++++++++--------
 example/complex/src/complex.cpp               |  7 +++--
 .../conv2DWithMdspan/src/conv2DWithMdspan.cpp | 12 +++------
 example/convolution1D/src/convolution1D.cpp   | 14 +++-------
 example/convolution2D/src/convolution2D.cpp   | 21 ++++-----------
 .../counterBasedRng/src/counterBasedRng.cpp   | 20 +++++---------
 example/heatEquation/src/heatEquation.cpp     |  8 +++---
 example/helloWorld/src/helloWorld.cpp         | 11 +++-----
 .../helloWorldLambda/src/helloWorldLambda.cpp |  6 ++---
 .../src/kernelSpecialization.cpp              |  7 +++--
 .../src/matrixMulMdSpan.cpp                   |  6 ++---
 .../src/monteCarloIntegration.cpp             |  6 ++---
 example/openMPSchedule/src/openMPSchedule.cpp | 12 ++++-----
 example/randomCells2D/src/randomCells2D.cpp   | 26 ++++++-------------
 .../randomStrategies/src/randomStrategies.cpp | 26 ++++++-------------
 example/vectorAdd/src/vectorAdd.cpp           | 12 +++------
 .../alpaka/test/KernelExecutionFixture.hpp    | 13 +++++-----
 test/integ/axpy/src/axpy.cpp                  | 14 +++-------
 test/integ/mandelbrot/src/mandelbrot.cpp      | 18 +++----------
 test/integ/matMul/src/matMul.cpp              | 20 +++-----------
 test/integ/separableCompilation/src/main.cpp  | 18 ++++++-------
 test/integ/sharedMem/src/sharedMem.cpp        | 14 ++++++----
 test/unit/math/src/TestTemplate.hpp           |  8 +++---
 24 files changed, 116 insertions(+), 211 deletions(-)
diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst
index 7cd60c4de97e..1e0d6dfc72b5 100644
--- a/docs/source/basic/cheatsheet.rst
+++ b/docs/source/basic/cheatsheet.rst
@@ -181,7 +181,7 @@ Prepare Kernel Bundle
 
      HeatEquationKernel heatEqKernel;
      // Arguments of KernelBundle: The kernel instance and the kernel arguments
-     auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+     auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
 
 Automatically select a valid kernel launch configuration
   .. code-block:: c++
@@ -191,7 +191,7 @@ Automatically select a valid kernel launch configuration
 
      auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
        device,
-       bundeledKernel,
+       kernelBundle,
        globalThreadExtent, elementsPerThread,
        false,
        GridBlockExtentSubDivRestrictions::Unrestricted);
diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp
index 53d9d25e7f84..b45d8aff777d 100644
--- a/example/bufferCopy/src/bufferCopy.cpp
+++ b/example/bufferCopy/src/bufferCopy.cpp
@@ -164,12 +164,11 @@ auto example(TAccTag const&) -> int
 
     FillBufferKernel fillBufferKernel;
 
-    auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
+    auto const& fillBufferKernelBundle = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
     auto const hostWorkDiv
-        = alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Host>(devHost, fillBufferKernelBundle, threadsPerGrid, elementsPerThread);
 
-    alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
-                       hostViewPlainPtrMdSpan); // 1st kernel argument
+    alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernelBundle); // bundle holds the kernel and its argument
 
     // Copy host to device Buffer
     //
@@ -203,14 +202,17 @@ auto example(TAccTag const&) -> int
     auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);
 
     TestBufferKernel testBufferKernel;
-    auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
+    auto const& testBufferKernelBundle1 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
+    auto const& testBufferKernelBundle2 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan2);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const devWorkDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
+    auto const devWorkDiv1
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, testBufferKernelBundle1, threadsPerGrid, elementsPerThread);
+    auto const devWorkDiv2
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, testBufferKernelBundle2, threadsPerGrid, elementsPerThread);
 
-    alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
-    alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
+    alpaka::exec<Acc>(devQueue, devWorkDiv1, testBufferKernelBundle1);
+    alpaka::exec<Acc>(devQueue, devWorkDiv2, testBufferKernelBundle2);
 
     // Print device Buffer
     //
@@ -223,11 +225,11 @@ auto example(TAccTag const&) -> int
     // completely distorted.
 
     PrintBufferKernel printBufferKernel;
-    alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan1);
+    alpaka::exec<Acc>(devQueue, devWorkDiv1, printBufferKernel, deviceBufferMdSpan1);
     alpaka::wait(devQueue);
     std::cout << std::endl;
 
-    alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan2);
+    alpaka::exec<Acc>(devQueue, devWorkDiv2, printBufferKernel, deviceBufferMdSpan2);
     alpaka::wait(devQueue);
     std::cout << std::endl;
 
diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp
index 7c9b39563460..1ea8783842ed 100644
--- a/example/complex/src/complex.cpp
+++ b/example/complex/src/complex.cpp
@@ -56,15 +56,14 @@ auto example(TAccTag const&) -> int
     Idx const threadsPerGrid = 1u;
     Idx const elementsPerThread = 1u;
 
-    ComplexKernel complexKernel;
+    alpaka::KernelBundle kernelBundle = ComplexKernel{};
 
-    auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
 
     // Run the kernel
-    alpaka::exec<Acc>(queue, workDiv, complexKernel);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::wait(queue);
 
     // Usage of alpaka::Complex<T> on the host side is the same as inside kernels, except math functions are not
diff --git a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
index 0a8b7d165b7d..5a41b7bf45ef 100644
--- a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
+++ b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
@@ -148,24 +148,18 @@ auto example(TAccTag const&) -> int
     ConvolutionKernelMdspan2D convolutionKernel2D;
 
     // Make a bundle
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         convolutionKernel2D,
         alpaka::experimental::getMdSpan(bufInputAcc),
         alpaka::experimental::getMdSpan(outputDeviceMemory),
         alpaka::experimental::getMdSpan(bufFilterAcc));
 
     //   Let alpaka calculate good block and grid sizes given our full problem extent.
-    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
+    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());
 
 
     // Run the kernel, pass 3 arrays as 2D mdspans
-    alpaka::exec<DevAcc>(
-        queueAcc,
-        workDiv,
-        convolutionKernel2D,
-        alpaka::experimental::getMdSpan(bufInputAcc),
-        alpaka::experimental::getMdSpan(outputDeviceMemory),
-        alpaka::experimental::getMdSpan(bufFilterAcc));
+    alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);
 
     // Allocate memory on host to receive the resulting matrix as an array
     auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent);
diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp
index 098dc8501d09..500e3284a46e 100644
--- a/example/convolution1D/src/convolution1D.cpp
+++ b/example/convolution1D/src/convolution1D.cpp
@@ -140,7 +140,7 @@ auto example(TAccTag const&) -> int
     DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
     DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         convolutionKernel,
         nativeInputDeviceMemory,
         nativeFilterDeviceMemory,
@@ -150,17 +150,9 @@ auto example(TAccTag const&) -> int
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
     // Run the kernel
-    alpaka::exec<DevAcc>(
-        queue,
-        workDiv,
-        convolutionKernel,
-        nativeInputDeviceMemory,
-        nativeFilterDeviceMemory,
-        nativeOutputDeviceMemory,
-        inputSize,
-        filterSize);
+    alpaka::exec<DevAcc>(queue, workDiv, kernelBundle);
 
     // Allocate memory on host
     auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp
index 2c8a6b28d850..7d137adad6ee 100644
--- a/example/convolution2D/src/convolution2D.cpp
+++ b/example/convolution2D/src/convolution2D.cpp
@@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
     alpaka::wait(queueAcc);
 
     // Calculate the allocated width, due to padding it might be larger then the matrix width
-    auto const intputWidthAllocated = [&]() -> const Idx
+    auto const intputWidthAllocated = [&]() -> Idx const
     {
         // Calculate pitch: The size of one line in bytes including padding.
         auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
@@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
     alpaka::wait(queueAcc);
 
     // Calculate the allocated width, due to padding it might be larger then the matrix width
-    auto const filterWidthAllocated = [&]() -> const Idx
+    auto const filterWidthAllocated = [&]() -> Idx const
     {
         // Calculate pitch: The size of one line in bytes including padding.
         auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
@@ -305,7 +305,7 @@ auto example(TAccTag const&) -> int
     //  ConvolutionKernel2DSharedMemory
     ConvolutionKernel2DSharedMemory convolutionKernel2D;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         convolutionKernel2D,
         alpaka::getPtrNative(bufInputAcc),
         alpaka::getPtrNative(outputDeviceMemory),
@@ -317,21 +317,10 @@ auto example(TAccTag const&) -> int
         filterWidthAllocated);
 
     //   Let alpaka calculate good block and grid sizes given our full problem extent.
-    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
+    auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());
 
     // Run the kernel
-    alpaka::exec<DevAcc>(
-        queueAcc,
-        workDiv,
-        convolutionKernel2D,
-        std::data(bufInputAcc),
-        std::data(outputDeviceMemory),
-        matrixWidth,
-        matrixHeight,
-        std::data(bufFilterAcc),
-        filterWidth,
-        intputWidthAllocated,
-        filterWidthAllocated);
+    alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);
 
     // Allocate memory on host to receive the resulting matrix as an array
     auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp
index 7a9a9abfc7fe..49f23c756384 100644
--- a/example/counterBasedRng/src/counterBasedRng.cpp
+++ b/example/counterBasedRng/src/counterBasedRng.cpp
@@ -147,27 +147,19 @@ auto example(TAccTag const&) -> int
     BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));
 
     CounterBasedRngKernel counterBasedRngKernel;
-    auto const& bundeledKernel
+    auto const& kernelBundleAcc
         = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
-    auto const& bundeledKernel2
+    auto const& kernelBundleHost
         = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
+    auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundleAcc, extent, elementsPerThread);
     auto const workDivHost
-        = alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
+        = alpaka::getValidWorkDivForKernel<AccHost>(devHost, kernelBundleHost, extent, elementsPerThreadHost);
 
     // Create the kernel execution task.
-    auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
-        workDivAcc,
-        CounterBasedRngKernel(),
-        alpaka::experimental::getMdSpan(bufAcc),
-        key);
-    auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
-        workDivHost,
-        CounterBasedRngKernel(),
-        alpaka::experimental::getMdSpan(bufHost),
-        key);
+    auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(workDivAcc, kernelBundleAcc);
+    auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(workDivHost, kernelBundleHost);
 
     // Enqueue the kernel execution task
     alpaka::enqueue(queueHost, taskKernelHost);
diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp
index df43a4e0ed47..958961078649 100644
--- a/example/heatEquation/src/heatEquation.cpp
+++ b/example/heatEquation/src/heatEquation.cpp
@@ -134,9 +134,9 @@ auto example(TAccTag const&) -> int
 
     HeatEquationKernel heatEqKernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+    auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
+    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, extent, elemPerThread);
 
     // Copy host -> device
     alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
@@ -146,8 +146,10 @@ auto example(TAccTag const&) -> int
 
     for(uint32_t step = 0; step < numTimeSteps; step++)
     {
+        auto const& tmpKernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+
         // Compute next values
-        alpaka::exec<Acc>(queue, workDiv, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+        alpaka::exec<Acc>(queue, workDiv, tmpKernelBundle);
 
         // We assume the boundary conditions are constant and so these values
         // do not need to be updated.
diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp
index 646df34d7b66..579759517ddf 100644
--- a/example/helloWorld/src/helloWorld.cpp
+++ b/example/helloWorld/src/helloWorld.cpp
@@ -133,12 +133,11 @@ auto example(TAccTag const&) -> int
     // Kernels can be everything that is trivially copyable, has a
     // callable operator() and takes the accelerator as first
     // argument. So a kernel can be a class or struct, a lambda, etc.
-    HelloWorldKernel helloWorldKernel;
+    alpaka::KernelBundle kernelBundle = HelloWorldKernel{};
 
-    auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
 
     // Run the kernel
     //
@@ -149,11 +148,7 @@ auto example(TAccTag const&) -> int
     // The queue can be blocking or non-blocking
     // depending on the chosen queue type (see type definitions above).
     // Here it is synchronous which means that the kernel is directly executed.
-    alpaka::exec<Acc>(
-        queue,
-        workDiv,
-        helloWorldKernel
-        /* put kernel arguments here */);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp
index b0e028cea2d7..442d019bc287 100644
--- a/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ b/example/helloWorldLambda/src/helloWorldLambda.cpp
@@ -117,12 +117,12 @@ auto example(TAccTag const&) -> int
         printf("\n");
     };
 
-    auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
+    auto const& kernelBundle = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
 
-    alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp
index 6bb7ccbda79f..ba78a189b179 100644
--- a/example/kernelSpecialization/src/kernelSpecialization.cpp
+++ b/example/kernelSpecialization/src/kernelSpecialization.cpp
@@ -79,15 +79,14 @@ auto example(TAccTag const&) -> int
     // Define the work division
     std::size_t const threadsPerGrid = 16u;
     std::size_t const elementsPerThread = 1u;
-    Kernel kernel;
+    alpaka::KernelBundle kernelBundle = Kernel{};
 
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel);
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
 
     // Run the kernel
-    alpaka::exec<Acc>(queue, workDiv, kernel);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
index 1a5ee577b405..46c1f8b14a38 100644
--- a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
+++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -147,19 +147,19 @@ auto example(TAccTag const&) -> int
     auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
 
     MatrixMulKernel kernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
+    auto const& kernelBundle = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         extentC,
         Vec::ones(),
         false,
         alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
     // Execute the kernel
-    alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
 
     // Copy result back to host
     alpaka::memcpy(queue, bufHostC, bufDevC);
diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
index fd0961979b36..152fa7644643 100644
--- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp
+++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp
@@ -113,15 +113,15 @@ auto example(TAccTag const&) -> int
     alpaka::memcpy(queue, bufAcc, bufHost);
 
     Kernel kernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
+    auto const& kernelBundle = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         Vec(numThreads),
         Vec(numAlpakaElementsPerThread));
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::memcpy(queue, bufHost, bufAcc);
     alpaka::wait(queue);
 
diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp
index 1febb42cd685..60ed66e2eee5 100644
--- a/example/openMPSchedule/src/openMPSchedule.cpp
+++ b/example/openMPSchedule/src/openMPSchedule.cpp
@@ -107,25 +107,25 @@ auto main() -> int
     Idx const threadsPerGrid = 16u;
     Idx const elementsPerThread = 1u;
 
-    OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
-    auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);
+    alpaka::KernelBundle kernelBundle = OpenMPScheduleDefaultKernel{};
+
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
 
     // Run the kernel setting no schedule explicitly.
     std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
-    alpaka::exec<Acc>(queue, workDiv, openMPScheduleDefaultKernel);
+    alpaka::exec<Acc>(queue, workDiv, kernelBundle);
     alpaka::wait(queue);
 
     // Run the kernel setting the schedule via a trait
     std::cout << "\n\nOpenMPScheduleMemberKernel setting the schedule via a static member:\n";
     alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleMemberKernel{});
     alpaka::wait(queue);
 
     // Run the kernel setting the schedule via a trait
     std::cout << "\n\nOpenMPScheduleTraitKernel setting the schedule via trait:\n";
     alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleTraitKernel{});
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp
index b5b45a5ef423..0194d7e747bb 100644
--- a/example/randomCells2D/src/randomCells2D.cpp
+++ b/example/randomCells2D/src/randomCells2D.cpp
@@ -202,16 +202,17 @@ auto example(TAccTag const&) -> int
 
     auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0];
 
-    auto const& bundeledKernelInitRandom
+    auto const& kernelBundleInitRandom
         = alpaka::KernelBundle(initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDivInitRandom
-        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernelInitRandom, extent, Vec(perThreadY, perThreadX));
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundleInitRandom, extent, Vec(perThreadY, perThreadX));
 
-    alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
+    alpaka::exec<Acc>(queue, workDivInitRandom, kernelBundleInitRandom);
     alpaka::wait(queue);
 
+    // execute the same kernel with different pointers
     alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV);
     alpaka::wait(queue);
 
     auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[0];
@@ -230,7 +231,7 @@ auto example(TAccTag const&) -> int
     alpaka::memcpy(queue, bufAccS, bufHostS);
     RunTimestepKernelSingle runTimestepKernelSingle;
 
-    auto const& bundeledKernelRuntimeStep = alpaka::KernelBundle(
+    auto const& kernelBundleRuntimeStep = alpaka::KernelBundle(
         runTimestepKernelSingle,
         extent,
         ptrBufAccRandS,
@@ -239,21 +240,10 @@ auto example(TAccTag const&) -> int
         pitchBufAccS);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDivRuntimeStep = alpaka::getValidWorkDivForKernel<Acc>(
-        devAcc,
-        bundeledKernelRuntimeStep,
-        extent,
-        Vec(perThreadY, perThreadX));
+    auto const workDivRuntimeStep
+        = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundleRuntimeStep, extent, Vec(perThreadY, perThreadX));
 
-    alpaka::exec<Acc>(
-        queue,
-        workDivRuntimeStep,
-        runTimestepKernelSingle,
-        extent,
-        ptrBufAccRandS,
-        ptrBufAccS,
-        pitchBufAccRandS,
-        pitchBufAccS);
+    alpaka::exec<Acc>(queue, workDivRuntimeStep, kernelBundleRuntimeStep);
     alpaka::memcpy(queue, bufHostS, bufAccS);
 
     auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[0];
diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp
index ea87d290a2c4..0e4c0b125c70 100644
--- a/example/randomStrategies/src/randomStrategies.cpp
+++ b/example/randomStrategies/src/randomStrategies.cpp
@@ -247,7 +247,7 @@ void runStrategy(Box<TAccTag>& box)
     // the initial parameters solely from the thread index
 
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         initRandomKernel,
         box.extentRand,
         ptrBufAccRand,
@@ -256,7 +256,7 @@ void runStrategy(Box<TAccTag>& box)
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDivRand = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
         alpaka::getDevByIdx(box.accPlatform, 0),
-        bundeledKernel,
+        kernelBundle,
         box.extentRand,
         typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
         false,
@@ -266,11 +266,7 @@ void runStrategy(Box<TAccTag>& box)
     alpaka::exec<typename Box<TAccTag>::Acc>(
         box.queue,
         workDivRand,
-        initRandomKernel,
-        box.extentRand,
-        ptrBufAccRand,
-        static_cast<unsigned>(
-            box.extentResult[0] / box.extentRand[0])); // == NUM_ROLLS; amount of work to be performed by each thread
+        kernelBundle);
 
     alpaka::wait(box.queue);
 
@@ -291,27 +287,21 @@ void runStrategy(Box<TAccTag>& box)
     alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult);
     FillKernel fillKernel;
 
-    auto const& bundeledKernelFill
-        = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
+    auto const& kernelBundleFill = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workdivResult = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
         alpaka::getDevByIdx(box.accPlatform, 0),
-        bundeledKernelFill,
+        kernelBundleFill,
         box.extentResult,
+        // One thread per "point"; each performs NUM_ROLLS "rolls"
         typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
-            NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
+            NUM_ROLLS)),
         false,
         alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
 
-    alpaka::exec<typename Box<TAccTag>::Acc>(
-        box.queue,
-        workdivResult,
-        fillKernel,
-        box.extentResult,
-        ptrBufAccRand,
-        ptrBufAccResult);
+    alpaka::exec<typename Box<TAccTag>::Acc>(box.queue, workdivResult, kernelBundleFill);
     alpaka::memcpy(box.queue, box.bufHostResult, box.bufAccResult);
     alpaka::wait(box.queue);
 
diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index a99393fb8b5b..04d1aad73dc8 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -130,23 +130,17 @@ auto example(TAccTag const&) -> int
     // Instantiate the kernel function object
     VectorAddKernel kernel;
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         kernel,
         alpaka::getPtrNative(bufAccA),
         alpaka::getPtrNative(bufAccB),
         alpaka::getPtrNative(bufAccC),
         numElements);
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
+    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, extent, elementsPerThread);
 
     // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        std::data(bufAccA),
-        std::data(bufAccB),
-        std::data(bufAccC),
-        numElements);
+    auto const& taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle);
 
     // Enqueue the kernel execution task
     {
diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp
index 89d17d95b34f..5b146818cff0 100644
--- a/include/alpaka/test/KernelExecutionFixture.hpp
+++ b/include/alpaka/test/KernelExecutionFixture.hpp
@@ -62,21 +62,22 @@ namespace alpaka::test
             memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true));
 
 
-            auto bundeledKernel = alpaka::KernelBundle<TKernelFnObj, decltype(getPtrNative(bufAccResult)), TArgs...>(
-                kernelFnObj,
-                getPtrNative(bufAccResult),
-                std::forward<TArgs>(args)...);
+            auto const& kernelBundle
+                = alpaka::KernelBundle<TKernelFnObj, decltype(getPtrNative(bufAccResult)), TArgs...>(
+                    kernelFnObj,
+                    getPtrNative(bufAccResult),
+                    std::forward<TArgs>(args)...);
 
 
             // set workdiv if it is not before
             if(m_workDiv == WorkDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)})
                 m_workDiv = alpaka::getValidWorkDivForKernel<Acc, Dev<Acc>>(
                     m_device,
-                    bundeledKernel,
+                    kernelBundle,
                     m_extent,
                     Vec<Dim, Idx>::ones());
 
-            exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward<TArgs>(args)...);
+            exec<Acc>(m_queue, m_workDiv, kernelBundle);
 
             // Copy the result value to the host
             auto bufHostResult = allocBuf<bool, Idx>(m_devHost, static_cast<Idx>(1u));
diff --git a/test/integ/axpy/src/axpy.cpp b/test/integ/axpy/src/axpy.cpp
index 4553dba458f9..cbbaaee6b66b 100644
--- a/test/integ/axpy/src/axpy.cpp
+++ b/test/integ/axpy/src/axpy.cpp
@@ -91,7 +91,6 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
 
     // Get a queue on this device.
     QueueAcc queue(devAcc);
-
     alpaka::Vec<Dim, Idx> const extent(numElements);
 
     // Allocate host memory buffers in pinned memory.
@@ -146,13 +145,12 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
     std::cout << std::endl;
 #endif
 
-
-    auto const& bundeledKernel
+    auto const& kernelBundle
         = alpaka::KernelBundle(kernel, numElements, alpha, std::data(memBufAccX), std::data(memBufAccY));
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         extent,
         static_cast<Idx>(3u),
         false,
@@ -164,13 +162,7 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
               << std::endl;
 
     // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        numElements,
-        alpha,
-        std::data(memBufAccX),
-        std::data(memBufAccY));
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle);
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp
index 6424b3e986e1..b6ab824d63f8 100644
--- a/test/integ/mandelbrot/src/mandelbrot.cpp
+++ b/test/integ/mandelbrot/src/mandelbrot.cpp
@@ -307,7 +307,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
     auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc);
     CHECK(rowPitch % sizeof(Val) == 0);
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         kernel,
         std::data(bufColorAcc),
         numRows,
@@ -321,7 +321,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         extent,
         alpaka::Vec<Dim, Idx>::ones(),
         false,
@@ -334,19 +334,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
               << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
               << std::endl;
 
-
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        std::data(bufColorAcc),
-        numRows,
-        numCols,
-        rowPitch,
-        fMinR,
-        fMaxR,
-        fMinI,
-        fMaxI,
-        maxIterations);
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle);
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp
index 41e2e4f9cdb0..0ead4f1761fe 100644
--- a/test/integ/matMul/src/matMul.cpp
+++ b/test/integ/matMul/src/matMul.cpp
@@ -244,7 +244,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n";
 
 
-    auto const& bundeledKernel = alpaka::KernelBundle(
+    auto const& kernelBundle = alpaka::KernelBundle(
         kernel,
         m,
         n,
@@ -260,7 +260,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         extentC,
         alpaka::Vec<Dim, Idx>::ones(),
         false,
@@ -272,22 +272,8 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
               << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
               << std::endl;
 
-
     // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        m,
-        n,
-        k,
-        static_cast<Val>(1),
-        std::data(bufAAcc),
-        lda,
-        std::data(bufBAcc),
-        ldb,
-        static_cast<Val>(1),
-        std::data(bufCAcc),
-        ldc);
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle);
 
     // Profile the kernel execution.
     std::cout << "Execution time:   " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms"
diff --git a/test/integ/separableCompilation/src/main.cpp b/test/integ/separableCompilation/src/main.cpp
index 3fb4f3245682..0020e6ddf8e7 100644
--- a/test/integ/separableCompilation/src/main.cpp
+++ b/test/integ/separableCompilation/src/main.cpp
@@ -111,23 +111,21 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc
     alpaka::memcpy(queueAcc, memBufAccA, memBufHostA);
     alpaka::memcpy(queueAcc, memBufAccB, memBufHostB);
 
-    auto const& bundeledKernel
-        = alpaka::KernelBundle(kernel, memBufAccA.data(), memBufAccB.data(), memBufAccC.data(), numElements);
+    auto const& kernelBundle = alpaka::KernelBundle(
+        kernel,
+        std::data(memBufAccA),
+        std::data(memBufAccB),
+        std::data(memBufAccC),
+        numElements);
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, static_cast<Idx>(3u));
+    auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, extent, static_cast<Idx>(3u));
 
     std::cout << alpaka::core::demangled<decltype(kernel)> << "("
               << "accelerator: " << alpaka::getAccName<Acc>() << ", workDiv: " << workDiv
               << ", numElements:" << numElements << ")" << std::endl;
 
     // Create the executor task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        memBufAccA.data(),
-        memBufAccB.data(),
-        memBufAccC.data(),
-        numElements);
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle);
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms"
diff --git a/test/integ/sharedMem/src/sharedMem.cpp b/test/integ/sharedMem/src/sharedMem.cpp
index 0377f623a5e1..db5648c90be4 100644
--- a/test/integ/sharedMem/src/sharedMem.cpp
+++ b/test/integ/sharedMem/src/sharedMem.cpp
@@ -131,14 +131,14 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
 
 
     auto blockRetValuesDummy = alpaka::allocBuf<Val, Idx>(devAcc, static_cast<Idx>(1));
-    // Kernel input during the runtim of kernel will be different and is chosen to depend on workdiv.
+    // The kernel input used at runtime will be different, because it is chosen to depend on the workdiv.
     // Therefore initially a  workdiv is needed to find the parameter. Therefore in kernel bundle, we can not use the
     // real input for the buffer pointer.
-    auto const& bundeledKernel = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy));
+    auto const& kernelBundle = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy));
     // Let alpaka calculate good block and grid sizes given our full problem extent
     auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
         devAcc,
-        bundeledKernel,
+        kernelBundle,
         numElements,
         static_cast<Idx>(1u),
         false,
@@ -149,6 +149,7 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
               << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
               << std::endl;
 
+    // Data size depends on workdiv
     Idx const gridBlocksCount(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(workDiv)[0u]);
     Idx const blockThreadCount(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(workDiv)[0u]);
 
@@ -160,8 +161,11 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
     auto blockRetValsAcc = alpaka::allocBuf<Val, Idx>(devAcc, resultElemCount);
     alpaka::memcpy(queue, blockRetValsAcc, blockRetVals, resultElemCount);
 
-    // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernel, std::data(blockRetValsAcc));
+
+    // Create the kernel execution task from a second bundle that holds the real buffer pointer, not the dummy.
+    auto const& kernelBundle2 = alpaka::KernelBundle(kernel, std::data(blockRetValsAcc));
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernelBundle2);
+
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
diff --git a/test/unit/math/src/TestTemplate.hpp b/test/unit/math/src/TestTemplate.hpp
index 78607712da93..bfdff9419313 100644
--- a/test/unit/math/src/TestTemplate.hpp
+++ b/test/unit/math/src/TestTemplate.hpp
@@ -73,7 +73,6 @@ namespace mathtest
             // SETUP (defines and initialising)
             // DevAcc is defined in Buffer.hpp too.
             using DevAcc = alpaka::Dev<TAcc>;
-
             using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
             using TArgsItem = ArgsItem<TData, TFunctor::arity>;
 
@@ -99,12 +98,12 @@ namespace mathtest
             Results results{devAcc};
 
 
-            auto const& bundeledKernel
+            auto const& kernelBundle
                 = alpaka::KernelBundle(kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer);
             // Let alpaka calculate good block and grid sizes given our full problem extent
             auto const workDiv = alpaka::getValidWorkDivForKernel<TAcc>(
                 devAcc,
-                bundeledKernel,
+                kernelBundle,
                 sizeExtent,
                 elementsPerThread,
                 false,
@@ -123,8 +122,7 @@ namespace mathtest
             results.copyToDevice(queue);
 
             // Enqueue the kernel execution task.
-            auto const taskKernel
-                = alpaka::createTaskKernel<TAcc>(workDiv, kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer);
+            auto const taskKernel = alpaka::createTaskKernel<TAcc>(workDiv, kernelBundle);
             alpaka::enqueue(queue, taskKernel);
 
             // Copy back the results (encapsulated in the buffer class).