Skip to content

Commit

Permalink
refactor examples and tests
Browse files Browse the repository at this point in the history
- rename `bundeledKernelFill` into `kernelBundle`
- use the kernel bundle everywhere we already have this object for
  the automatic work division creation

Co-authored-by: René Widera <r.widera@hzdr.de>
  • Loading branch information
mehmetyusufoglu and psychocoderHPC committed Aug 7, 2024
1 parent f9de76c commit d83def0
Show file tree
Hide file tree
Showing 24 changed files with 116 additions and 211 deletions.
4 changes: 2 additions & 2 deletions docs/source/basic/cheatsheet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Prepare Kernel Bundle

HeatEquationKernel heatEqKernel;
// Arguments of KernelBundle: The kernel instance and the kernel arguments
auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

Automatically select a valid kernel launch configuration
.. code-block:: c++
Expand All @@ -191,7 +191,7 @@ Automatically select a valid kernel launch configuration

auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
device,
bundeledKernel,
kernelBundle,
globalThreadExtent, elementsPerThread,
false,
GridBlockExtentSubDivRestrictions::Unrestricted);
Expand Down
24 changes: 13 additions & 11 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,11 @@ auto example(TAccTag const&) -> int

FillBufferKernel fillBufferKernel;

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const& fillBufferKernelBundle = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Host>(devHost, fillBufferKernelBundle, threadsPerGrid, elementsPerThread);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument
alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernelBundle); // 1st kernel argument

// Copy host to device Buffer
//
Expand Down Expand Up @@ -203,14 +202,17 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
auto const& restBufferKernelBundle1 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
auto const& restBufferKernelBundle2 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan2);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
auto const devWorkDiv1
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, restBufferKernelBundle1, threadsPerGrid, elementsPerThread);
auto const devWorkDiv2
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, restBufferKernelBundle2, threadsPerGrid, elementsPerThread);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
alpaka::exec<Acc>(devQueue, devWorkDiv1, restBufferKernelBundle1);
alpaka::exec<Acc>(devQueue, devWorkDiv2, restBufferKernelBundle2);

// Print device Buffer
//
Expand All @@ -223,11 +225,11 @@ auto example(TAccTag const&) -> int
// completely distorted.

PrintBufferKernel printBufferKernel;
alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv1, printBufferKernel, deviceBufferMdSpan1);
alpaka::wait(devQueue);
std::cout << std::endl;

alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan2);
alpaka::exec<Acc>(devQueue, devWorkDiv2, printBufferKernel, deviceBufferMdSpan2);
alpaka::wait(devQueue);
std::cout << std::endl;

Expand Down
7 changes: 3 additions & 4 deletions example/complex/src/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,14 @@ auto example(TAccTag const&) -> int
Idx const threadsPerGrid = 1u;
Idx const elementsPerThread = 1u;

ComplexKernel complexKernel;
alpaka::KernelBundle kernelBundle = ComplexKernel{};

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, complexKernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Usage of alpaka::Complex<T> on the host side is the same as inside kernels, except math functions are not
Expand Down
12 changes: 3 additions & 9 deletions example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,24 +148,18 @@ auto example(TAccTag const&) -> int
ConvolutionKernelMdspan2D convolutionKernel2D;

// Make a bundle
auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
alpaka::exec<DevAcc>(
queueAcc,
workDiv,
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));
alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);

// Allocate memory on host to receive the resulting matrix as an array
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent);
Expand Down
14 changes: 3 additions & 11 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ auto example(TAccTag const&) -> int
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
Expand All @@ -150,17 +150,9 @@ auto example(TAccTag const&) -> int

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
// Run the kernel
alpaka::exec<DevAcc>(
queue,
workDiv,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);
alpaka::exec<DevAcc>(queue, workDiv, kernelBundle);

// Allocate memory on host
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
Expand Down
21 changes: 5 additions & 16 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const intputWidthAllocated = [&]() -> const Idx
auto const intputWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
Expand Down Expand Up @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const filterWidthAllocated = [&]() -> const Idx
auto const filterWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
Expand All @@ -305,7 +305,7 @@ auto example(TAccTag const&) -> int
// ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
Expand All @@ -317,21 +317,10 @@ auto example(TAccTag const&) -> int
filterWidthAllocated);

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());

// Run the kernel
alpaka::exec<DevAcc>(
queueAcc,
workDiv,
convolutionKernel2D,
std::data(bufInputAcc),
std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);
alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);

// Allocate memory on host to receive the resulting matrix as an array
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
Expand Down
20 changes: 6 additions & 14 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,27 +147,19 @@ auto example(TAccTag const&) -> int
BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));

CounterBasedRngKernel counterBasedRngKernel;
auto const& bundeledKernel
auto const& kernelBundleAcc
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
auto const& bundeledKernel2
auto const& kernelBundleHost
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundleAcc, extent, elementsPerThread);
auto const workDivHost
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, kernelBundleHost, extent, elementsPerThreadHost);

// Create the kernel execution task.
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
workDivAcc,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufAcc),
key);
auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
workDivHost,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufHost),
key);
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(workDivAcc, kernelBundleAcc);
auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(workDivHost, kernelBundleHost);

// Enqueue the kernel execution task
alpaka::enqueue(queueHost, taskKernelHost);
Expand Down
8 changes: 5 additions & 3 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ auto example(TAccTag const&) -> int

HeatEquationKernel heatEqKernel;

auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, extent, elemPerThread);

// Copy host -> device
alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
Expand All @@ -146,8 +146,10 @@ auto example(TAccTag const&) -> int

for(uint32_t step = 0; step < numTimeSteps; step++)
{
auto const& tmpKernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

// Compute next values
alpaka::exec<Acc>(queue, workDiv, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::exec<Acc>(queue, workDiv, tmpKernelBundle);

// We assume the boundary conditions are constant and so these values
// do not need to be updated.
Expand Down
11 changes: 3 additions & 8 deletions example/helloWorld/src/helloWorld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,11 @@ auto example(TAccTag const&) -> int
// Kernels can be everything that is trivially copyable, has a
// callable operator() and takes the accelerator as first
// argument. So a kernel can be a class or struct, a lambda, etc.
HelloWorldKernel helloWorldKernel;
alpaka::KernelBundle kernelBundle = HelloWorldKernel{};

auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
//
Expand All @@ -149,11 +148,7 @@ auto example(TAccTag const&) -> int
// The queue can be blocking or non-blocking
// depending on the chosen queue type (see type definitions above).
// Here it is synchronous which means that the kernel is directly executed.
alpaka::exec<Acc>(
queue,
workDiv,
helloWorldKernel
/* put kernel arguments here */);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
6 changes: 3 additions & 3 deletions example/helloWorldLambda/src/helloWorldLambda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,12 @@ auto example(TAccTag const&) -> int
printf("\n");
};

auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
auto const& kernelBundle = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
7 changes: 3 additions & 4 deletions example/kernelSpecialization/src/kernelSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,14 @@ auto example(TAccTag const&) -> int
// Define the work division
std::size_t const threadsPerGrid = 16u;
std::size_t const elementsPerThread = 1u;
Kernel kernel;
alpaka::KernelBundle kernelBundle = Kernel{};

auto const& bundeledKernel = alpaka::KernelBundle(kernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, kernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
6 changes: 3 additions & 3 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,19 @@ auto example(TAccTag const&) -> int
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

MatrixMulKernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
auto const& kernelBundle = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
kernelBundle,
extentC,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

// Execute the kernel
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
Expand Down
6 changes: 3 additions & 3 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ auto example(TAccTag const&) -> int
alpaka::memcpy(queue, bufAcc, bufHost);

Kernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
auto const& kernelBundle = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
kernelBundle,
Vec(numThreads),
Vec(numAlpakaElementsPerThread));

alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::memcpy(queue, bufHost, bufAcc);
alpaka::wait(queue);

Expand Down
12 changes: 6 additions & 6 deletions example/openMPSchedule/src/openMPSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,25 +107,25 @@ auto main() -> int
Idx const threadsPerGrid = 16u;
Idx const elementsPerThread = 1u;

OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);
alpaka::KernelBundle kernelBundle = OpenMPScheduleDefaultKernel{};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel setting no schedule explicitly.
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
alpaka::exec<Acc>(queue, workDiv, openMPScheduleDefaultKernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Run the kernel setting the schedule via a trait
std::cout << "\n\nOpenMPScheduleMemberKernel setting the schedule via a static member:\n";
alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleMemberKernel{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Run the kernel setting the schedule via a trait
std::cout << "\n\nOpenMPScheduleTraitKernel setting the schedule via trait:\n";
alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleTraitKernel{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
Loading

0 comments on commit d83def0

Please sign in to comment.