Skip to content

Commit

Permalink
refactor examples and tests
Browse files Browse the repository at this point in the history
- rename `bundeledKernelFill` into `kernelBundle`
- use the kernel bundle everywhere we already have this object for
  the automatic work division creation

Co-authored-by: René Widera <r.widera@hzdr.de>
  • Loading branch information
mehmetyusufoglu and psychocoderHPC committed Aug 7, 2024
1 parent f9de76c commit d83def0
Show file tree
Hide file tree
Showing 24 changed files with 116 additions and 211 deletions.
4 changes: 2 additions & 2 deletions docs/source/basic/cheatsheet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Prepare Kernel Bundle

HeatEquationKernel heatEqKernel;
// Arguments of KernelBundle: The kernel instance and the kernel arguments
auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

Automatically select a valid kernel launch configuration
.. code-block:: c++
Expand All @@ -191,7 +191,7 @@ Automatically select a valid kernel launch configuration

auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
device,
bundeledKernel,
kernelBundle,
globalThreadExtent, elementsPerThread,
false,
GridBlockExtentSubDivRestrictions::Unrestricted);
Expand Down
24 changes: 13 additions & 11 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,11 @@ auto example(TAccTag const&) -> int

FillBufferKernel fillBufferKernel;

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const& fillBufferKernelBundle = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Host>(devHost, fillBufferKernelBundle, threadsPerGrid, elementsPerThread);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument
alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernelBundle); // 1st kernel argument

// Copy host to device Buffer
//
Expand Down Expand Up @@ -203,14 +202,17 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
auto const& restBufferKernelBundle1 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);
auto const& restBufferKernelBundle2 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan2);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
auto const devWorkDiv1
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, restBufferKernelBundle1, threadsPerGrid, elementsPerThread);
auto const devWorkDiv2
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, restBufferKernelBundle2, threadsPerGrid, elementsPerThread);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
alpaka::exec<Acc>(devQueue, devWorkDiv1, restBufferKernelBundle1);
alpaka::exec<Acc>(devQueue, devWorkDiv2, restBufferKernelBundle2);

// Print device Buffer
//
Expand All @@ -223,11 +225,11 @@ auto example(TAccTag const&) -> int
// completely distorted.

PrintBufferKernel printBufferKernel;
alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv1, printBufferKernel, deviceBufferMdSpan1);
alpaka::wait(devQueue);
std::cout << std::endl;

alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan2);
alpaka::exec<Acc>(devQueue, devWorkDiv2, printBufferKernel, deviceBufferMdSpan2);
alpaka::wait(devQueue);
std::cout << std::endl;

Expand Down
7 changes: 3 additions & 4 deletions example/complex/src/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,14 @@ auto example(TAccTag const&) -> int
Idx const threadsPerGrid = 1u;
Idx const elementsPerThread = 1u;

ComplexKernel complexKernel;
alpaka::KernelBundle kernelBundle = ComplexKernel{};

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, complexKernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Usage of alpaka::Complex<T> on the host side is the same as inside kernels, except math functions are not
Expand Down
12 changes: 3 additions & 9 deletions example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,24 +148,18 @@ auto example(TAccTag const&) -> int
ConvolutionKernelMdspan2D convolutionKernel2D;

// Make a bundle
auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
alpaka::exec<DevAcc>(
queueAcc,
workDiv,
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));
alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);

// Allocate memory on host to receive the resulting matrix as an array
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent);
Expand Down
14 changes: 3 additions & 11 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ auto example(TAccTag const&) -> int
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
Expand All @@ -150,17 +150,9 @@ auto example(TAccTag const&) -> int

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);
// Run the kernel
alpaka::exec<DevAcc>(
queue,
workDiv,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);
alpaka::exec<DevAcc>(queue, workDiv, kernelBundle);

// Allocate memory on host
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
Expand Down
21 changes: 5 additions & 16 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const intputWidthAllocated = [&]() -> const Idx
auto const intputWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
Expand Down Expand Up @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width, due to padding it might be larger than the matrix width
auto const filterWidthAllocated = [&]() -> const Idx
auto const filterWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
Expand All @@ -305,7 +305,7 @@ auto example(TAccTag const&) -> int
// ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

auto const& bundeledKernel = alpaka::KernelBundle(
auto const& kernelBundle = alpaka::KernelBundle(
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
Expand All @@ -317,21 +317,10 @@ auto example(TAccTag const&) -> int
filterWidthAllocated);

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, kernelBundle, extent, Vec::ones());

// Run the kernel
alpaka::exec<DevAcc>(
queueAcc,
workDiv,
convolutionKernel2D,
std::data(bufInputAcc),
std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);
alpaka::exec<DevAcc>(queueAcc, workDiv, kernelBundle);

// Allocate memory on host to receive the resulting matrix as an array
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
Expand Down
20 changes: 6 additions & 14 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,27 +147,19 @@ auto example(TAccTag const&) -> int
BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));

CounterBasedRngKernel counterBasedRngKernel;
auto const& bundeledKernel
auto const& kernelBundleAcc
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
auto const& bundeledKernel2
auto const& kernelBundleHost
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundleAcc, extent, elementsPerThread);
auto const workDivHost
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, kernelBundleHost, extent, elementsPerThreadHost);

// Create the kernel execution task.
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
workDivAcc,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufAcc),
key);
auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
workDivHost,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufHost),
key);
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(workDivAcc, kernelBundleAcc);
auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(workDivHost, kernelBundleHost);

// Enqueue the kernel execution task
alpaka::enqueue(queueHost, taskKernelHost);
Expand Down
8 changes: 5 additions & 3 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ auto example(TAccTag const&) -> int

HeatEquationKernel heatEqKernel;

auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, extent, elemPerThread);

// Copy host -> device
alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
Expand All @@ -146,8 +146,10 @@ auto example(TAccTag const&) -> int

for(uint32_t step = 0; step < numTimeSteps; step++)
{
auto const& tmpKernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

// Compute next values
alpaka::exec<Acc>(queue, workDiv, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::exec<Acc>(queue, workDiv, tmpKernelBundle);

// We assume the boundary conditions are constant and so these values
// do not need to be updated.
Expand Down
11 changes: 3 additions & 8 deletions example/helloWorld/src/helloWorld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,11 @@ auto example(TAccTag const&) -> int
// Kernels can be everything that is trivially copyable, has a
// callable operator() and takes the accelerator as first
// argument. So a kernel can be a class or struct, a lambda, etc.
HelloWorldKernel helloWorldKernel;
alpaka::KernelBundle kernelBundle = HelloWorldKernel{};

auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
//
Expand All @@ -149,11 +148,7 @@ auto example(TAccTag const&) -> int
// The queue can be blocking or non-blocking
// depending on the chosen queue type (see type definitions above).
// Here it is synchronous which means that the kernel is directly executed.
alpaka::exec<Acc>(
queue,
workDiv,
helloWorldKernel
/* put kernel arguments here */);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
6 changes: 3 additions & 3 deletions example/helloWorldLambda/src/helloWorldLambda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,12 @@ auto example(TAccTag const&) -> int
printf("\n");
};

auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
auto const& kernelBundle = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
7 changes: 3 additions & 4 deletions example/kernelSpecialization/src/kernelSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,14 @@ auto example(TAccTag const&) -> int
// Define the work division
std::size_t const threadsPerGrid = 16u;
std::size_t const elementsPerThread = 1u;
Kernel kernel;
alpaka::KernelBundle kernelBundle = Kernel{};

auto const& bundeledKernel = alpaka::KernelBundle(kernel);
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, kernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
6 changes: 3 additions & 3 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,19 @@ auto example(TAccTag const&) -> int
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

MatrixMulKernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);
auto const& kernelBundle = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
kernelBundle,
extentC,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

// Execute the kernel
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
Expand Down
6 changes: 3 additions & 3 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ auto example(TAccTag const&) -> int
alpaka::memcpy(queue, bufAcc, bufHost);

Kernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
auto const& kernelBundle = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});
// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
kernelBundle,
Vec(numThreads),
Vec(numAlpakaElementsPerThread));

alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::memcpy(queue, bufHost, bufAcc);
alpaka::wait(queue);

Expand Down
12 changes: 6 additions & 6 deletions example/openMPSchedule/src/openMPSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,25 +107,25 @@ auto main() -> int
Idx const threadsPerGrid = 16u;
Idx const elementsPerThread = 1u;

OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);
alpaka::KernelBundle kernelBundle = OpenMPScheduleDefaultKernel{};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, kernelBundle, threadsPerGrid, elementsPerThread);

// Run the kernel setting no schedule explicitly.
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
alpaka::exec<Acc>(queue, workDiv, openMPScheduleDefaultKernel);
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Run the kernel setting the schedule via a trait
std::cout << "\n\nOpenMPScheduleMemberKernel setting the schedule via a static member:\n";
alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleMemberKernel{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

// Run the kernel setting the schedule via a trait
std::cout << "\n\nOpenMPScheduleTraitKernel setting the schedule via trait:\n";
alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleTraitKernel{});
alpaka::exec<Acc>(queue, workDiv, kernelBundle);
alpaka::wait(queue);

return EXIT_SUCCESS;
Expand Down
Loading

0 comments on commit d83def0

Please sign in to comment.