From d83def0161125da43446be07d904be90c4cdc356 Mon Sep 17 00:00:00 2001 From: mehmet yusufoglu Date: Tue, 6 Aug 2024 11:56:36 +0200 Subject: [PATCH] refactor examples and tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rename `bundeledKernelFill` into `kernelBundle` - use the kernel bundle everywhere where we already have this object for the automatic work division creation Co-authored-by: René Widera --- docs/source/basic/cheatsheet.rst | 4 +-- example/bufferCopy/src/bufferCopy.cpp | 24 +++++++++-------- example/complex/src/complex.cpp | 7 +++-- .../conv2DWithMdspan/src/conv2DWithMdspan.cpp | 12 +++------ example/convolution1D/src/convolution1D.cpp | 14 +++------- example/convolution2D/src/convolution2D.cpp | 21 ++++----------- .../counterBasedRng/src/counterBasedRng.cpp | 20 +++++--------- example/heatEquation/src/heatEquation.cpp | 8 +++--- example/helloWorld/src/helloWorld.cpp | 11 +++----- .../helloWorldLambda/src/helloWorldLambda.cpp | 6 ++--- .../src/kernelSpecialization.cpp | 7 +++-- .../src/matrixMulMdSpan.cpp | 6 ++--- .../src/monteCarloIntegration.cpp | 6 ++--- example/openMPSchedule/src/openMPSchedule.cpp | 12 ++++----- example/randomCells2D/src/randomCells2D.cpp | 26 ++++++------------- .../randomStrategies/src/randomStrategies.cpp | 26 ++++++------------- example/vectorAdd/src/vectorAdd.cpp | 12 +++------ .../alpaka/test/KernelExecutionFixture.hpp | 13 +++++----- test/integ/axpy/src/axpy.cpp | 14 +++------- test/integ/mandelbrot/src/mandelbrot.cpp | 18 +++---------- test/integ/matMul/src/matMul.cpp | 20 +++----------- test/integ/separableCompilation/src/main.cpp | 18 ++++++------- test/integ/sharedMem/src/sharedMem.cpp | 14 ++++++---- test/unit/math/src/TestTemplate.hpp | 8 +++--- 24 files changed, 116 insertions(+), 211 deletions(-) diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst index 7cd60c4de97e..1e0d6dfc72b5 100644 --- a/docs/source/basic/cheatsheet.rst +++ 
b/docs/source/basic/cheatsheet.rst @@ -181,7 +181,7 @@ Prepare Kernel Bundle HeatEquationKernel heatEqKernel; // Arguments of KernelBundle: The kernel instance and the kernel arguments - auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); + auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); Automatically select a valid kernel launch configuration .. code-block:: c++ @@ -191,7 +191,7 @@ Automatically select a valid kernel launch configuration auto autoWorkDiv = getValidWorkDivForKernel( device, - bundeledKernel, + kernelBundle, globalThreadExtent, elementsPerThread, false, GridBlockExtentSubDivRestrictions::Unrestricted); diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index 53d9d25e7f84..b45d8aff777d 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -164,12 +164,11 @@ auto example(TAccTag const&) -> int FillBufferKernel fillBufferKernel; - auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan); + auto const& fillBufferKernelBundle = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan); auto const hostWorkDiv - = alpaka::getValidWorkDivForKernel(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devHost, fillBufferKernelBundle, threadsPerGrid, elementsPerThread); - alpaka::exec(hostQueue, hostWorkDiv, fillBufferKernel, - hostViewPlainPtrMdSpan); // 1st kernel argument + alpaka::exec(hostQueue, hostWorkDiv, fillBufferKernelBundle); // 1st kernel argument // Copy host to device Buffer // @@ -203,14 +202,17 @@ auto example(TAccTag const&) -> int auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2); TestBufferKernel testBufferKernel; - auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1); + auto const& 
testBufferKernelBundle1 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1); + auto const& testBufferKernelBundle2 = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan2); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const devWorkDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread); + auto const devWorkDiv1 + = alpaka::getValidWorkDivForKernel(devAcc, testBufferKernelBundle1, threadsPerGrid, elementsPerThread); + auto const devWorkDiv2 + = alpaka::getValidWorkDivForKernel(devAcc, testBufferKernelBundle2, threadsPerGrid, elementsPerThread); - alpaka::exec(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1); - alpaka::exec(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2); + alpaka::exec(devQueue, devWorkDiv1, testBufferKernelBundle1); + alpaka::exec(devQueue, devWorkDiv2, testBufferKernelBundle2); // Print device Buffer // @@ -223,11 +225,11 @@ auto example(TAccTag const&) -> int // completely distorted. 
PrintBufferKernel printBufferKernel; - alpaka::exec(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan1); + alpaka::exec(devQueue, devWorkDiv1, printBufferKernel, deviceBufferMdSpan1); alpaka::wait(devQueue); std::cout << std::endl; - alpaka::exec(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan2); + alpaka::exec(devQueue, devWorkDiv2, printBufferKernel, deviceBufferMdSpan2); alpaka::wait(devQueue); std::cout << std::endl; diff --git a/example/complex/src/complex.cpp b/example/complex/src/complex.cpp index 7c9b39563460..1ea8783842ed 100644 --- a/example/complex/src/complex.cpp +++ b/example/complex/src/complex.cpp @@ -56,15 +56,14 @@ auto example(TAccTag const&) -> int Idx const threadsPerGrid = 1u; Idx const elementsPerThread = 1u; - ComplexKernel complexKernel; + alpaka::KernelBundle kernelBundle = ComplexKernel{}; - auto const& bundeledKernel = alpaka::KernelBundle(complexKernel); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, elementsPerThread); // Run the kernel - alpaka::exec(queue, workDiv, complexKernel); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::wait(queue); // Usage of alpaka::Complex on the host side is the same as inside kernels, except math functions are not diff --git a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp index 0a8b7d165b7d..5a41b7bf45ef 100644 --- a/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp +++ b/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp @@ -148,24 +148,18 @@ auto example(TAccTag const&) -> int ConvolutionKernelMdspan2D convolutionKernel2D; // Make a bundle - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( convolutionKernel2D, 
alpaka::experimental::getMdSpan(bufInputAcc), alpaka::experimental::getMdSpan(outputDeviceMemory), alpaka::experimental::getMdSpan(bufFilterAcc)); // Let alpaka calculate good block and grid sizes given our full problem extent. - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, Vec::ones()); + auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, extent, Vec::ones()); // Run the kernel, pass 3 arrays as 2D mdspans - alpaka::exec( - queueAcc, - workDiv, - convolutionKernel2D, - alpaka::experimental::getMdSpan(bufInputAcc), - alpaka::experimental::getMdSpan(outputDeviceMemory), - alpaka::experimental::getMdSpan(bufFilterAcc)); + alpaka::exec(queueAcc, workDiv, kernelBundle); // Allocate memory on host to receive the resulting matrix as an array auto resultGpuHost = alpaka::allocBuf(devHost, extent); diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp index 098dc8501d09..500e3284a46e 100644 --- a/example/convolution1D/src/convolution1D.cpp +++ b/example/convolution1D/src/convolution1D.cpp @@ -140,7 +140,7 @@ auto example(TAccTag const&) -> int DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory); DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory); - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( convolutionKernel, nativeInputDeviceMemory, nativeFilterDeviceMemory, @@ -150,17 +150,9 @@ auto example(TAccTag const&) -> int // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, elementsPerThread); // Run the kernel - alpaka::exec( - queue, - workDiv, - convolutionKernel, - nativeInputDeviceMemory, - nativeFilterDeviceMemory, - nativeOutputDeviceMemory, - inputSize, - 
filterSize); + alpaka::exec(queue, workDiv, kernelBundle); // Allocate memory on host auto resultGpuHost = alpaka::allocBuf(devHost, inputSize); diff --git a/example/convolution2D/src/convolution2D.cpp b/example/convolution2D/src/convolution2D.cpp index 2c8a6b28d850..7d137adad6ee 100644 --- a/example/convolution2D/src/convolution2D.cpp +++ b/example/convolution2D/src/convolution2D.cpp @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int alpaka::wait(queueAcc); // Calculate the allocated width, due to padding it might be larger then the matrix width - auto const intputWidthAllocated = [&]() -> const Idx + auto const intputWidthAllocated = [&]() -> Idx const { // Calculate pitch: The size of one line in bytes including padding. auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]}; @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int alpaka::wait(queueAcc); // Calculate the allocated width, due to padding it might be larger then the matrix width - auto const filterWidthAllocated = [&]() -> const Idx + auto const filterWidthAllocated = [&]() -> Idx const { // Calculate pitch: The size of one line in bytes including padding. auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]}; @@ -305,7 +305,7 @@ auto example(TAccTag const&) -> int // ConvolutionKernel2DSharedMemory ConvolutionKernel2DSharedMemory convolutionKernel2D; - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( convolutionKernel2D, alpaka::getPtrNative(bufInputAcc), alpaka::getPtrNative(outputDeviceMemory), @@ -317,21 +317,10 @@ auto example(TAccTag const&) -> int filterWidthAllocated); // Let alpaka calculate good block and grid sizes given our full problem extent. 
- auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, Vec::ones()); + auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, extent, Vec::ones()); // Run the kernel - alpaka::exec( - queueAcc, - workDiv, - convolutionKernel2D, - std::data(bufInputAcc), - std::data(outputDeviceMemory), - matrixWidth, - matrixHeight, - std::data(bufFilterAcc), - filterWidth, - intputWidthAllocated, - filterWidthAllocated); + alpaka::exec(queueAcc, workDiv, kernelBundle); // Allocate memory on host to receive the resulting matrix as an array auto resultGpuHost = alpaka::allocBuf(devHost, extent1D); diff --git a/example/counterBasedRng/src/counterBasedRng.cpp b/example/counterBasedRng/src/counterBasedRng.cpp index 7a9a9abfc7fe..49f23c756384 100644 --- a/example/counterBasedRng/src/counterBasedRng.cpp +++ b/example/counterBasedRng/src/counterBasedRng.cpp @@ -147,27 +147,19 @@ auto example(TAccTag const&) -> int BufAcc bufAcc(alpaka::allocBuf(devAcc, extent)); CounterBasedRngKernel counterBasedRngKernel; - auto const& bundeledKernel + auto const& kernelBundleAcc = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key); - auto const& bundeledKernel2 + auto const& kernelBundleHost = alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDivAcc = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elementsPerThread); + auto const workDivAcc = alpaka::getValidWorkDivForKernel(devAcc, kernelBundleAcc, extent, elementsPerThread); auto const workDivHost - = alpaka::getValidWorkDivForKernel(devHost, bundeledKernel2, extent, elementsPerThreadHost); + = alpaka::getValidWorkDivForKernel(devHost, kernelBundleHost, extent, elementsPerThreadHost); // Create the kernel execution task. 
- auto const taskKernelAcc = alpaka::createTaskKernel( - workDivAcc, - CounterBasedRngKernel(), - alpaka::experimental::getMdSpan(bufAcc), - key); - auto const taskKernelHost = alpaka::createTaskKernel( - workDivHost, - CounterBasedRngKernel(), - alpaka::experimental::getMdSpan(bufHost), - key); + auto const taskKernelAcc = alpaka::createTaskKernel(workDivAcc, kernelBundleAcc); + auto const taskKernelHost = alpaka::createTaskKernel(workDivHost, kernelBundleHost); // Enqueue the kernel execution task alpaka::enqueue(queueHost, taskKernelHost); diff --git a/example/heatEquation/src/heatEquation.cpp b/example/heatEquation/src/heatEquation.cpp index df43a4e0ed47..958961078649 100644 --- a/example/heatEquation/src/heatEquation.cpp +++ b/example/heatEquation/src/heatEquation.cpp @@ -134,9 +134,9 @@ auto example(TAccTag const&) -> int HeatEquationKernel heatEqKernel; - auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); + auto const& kernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elemPerThread); + auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, extent, elemPerThread); // Copy host -> device alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost); @@ -146,8 +146,10 @@ auto example(TAccTag const&) -> int for(uint32_t step = 0; step < numTimeSteps; step++) { + auto const& tmpKernelBundle = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); + // Compute next values - alpaka::exec(queue, workDiv, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt); + alpaka::exec(queue, workDiv, tmpKernelBundle); // We assume the boundary conditions are constant and so these values // do not need to be updated. 
diff --git a/example/helloWorld/src/helloWorld.cpp b/example/helloWorld/src/helloWorld.cpp index 646df34d7b66..579759517ddf 100644 --- a/example/helloWorld/src/helloWorld.cpp +++ b/example/helloWorld/src/helloWorld.cpp @@ -133,12 +133,11 @@ auto example(TAccTag const&) -> int // Kernels can be everything that is trivially copyable, has a // callable operator() and takes the accelerator as first // argument. So a kernel can be a class or struct, a lambda, etc. - HelloWorldKernel helloWorldKernel; + alpaka::KernelBundle kernelBundle = HelloWorldKernel{}; - auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, elementsPerThread); // Run the kernel // @@ -149,11 +148,7 @@ auto example(TAccTag const&) -> int // The queue can be blocking or non-blocking // depending on the chosen queue type (see type definitions above). // Here it is synchronous which means that the kernel is directly executed. 
- alpaka::exec( - queue, - workDiv, - helloWorldKernel - /* put kernel arguments here */); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::wait(queue); return EXIT_SUCCESS; diff --git a/example/helloWorldLambda/src/helloWorldLambda.cpp b/example/helloWorldLambda/src/helloWorldLambda.cpp index b0e028cea2d7..442d019bc287 100644 --- a/example/helloWorldLambda/src/helloWorldLambda.cpp +++ b/example/helloWorldLambda/src/helloWorldLambda.cpp @@ -117,12 +117,12 @@ auto example(TAccTag const&) -> int printf("\n"); }; - auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks); + auto const& kernelBundle = alpaka::KernelBundle(kernelLambda, nExclamationMarks); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, elementsPerThread); - alpaka::exec(queue, workDiv, kernelLambda, nExclamationMarks); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::wait(queue); return EXIT_SUCCESS; diff --git a/example/kernelSpecialization/src/kernelSpecialization.cpp b/example/kernelSpecialization/src/kernelSpecialization.cpp index 6bb7ccbda79f..ba78a189b179 100644 --- a/example/kernelSpecialization/src/kernelSpecialization.cpp +++ b/example/kernelSpecialization/src/kernelSpecialization.cpp @@ -79,15 +79,14 @@ auto example(TAccTag const&) -> int // Define the work division std::size_t const threadsPerGrid = 16u; std::size_t const elementsPerThread = 1u; - Kernel kernel; + alpaka::KernelBundle kernelBundle = Kernel{}; - auto const& bundeledKernel = alpaka::KernelBundle(kernel); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, 
elementsPerThread); // Run the kernel - alpaka::exec(queue, workDiv, kernel); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::wait(queue); return EXIT_SUCCESS; diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp index 1a5ee577b405..46c1f8b14a38 100644 --- a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp +++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp @@ -147,19 +147,19 @@ auto example(TAccTag const&) -> int auto mdDevC = alpaka::experimental::getMdSpan(bufDevC); MatrixMulKernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC); + auto const& kernelBundle = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); // Execute the kernel - alpaka::exec(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC); + alpaka::exec(queue, workDiv, kernelBundle); // Copy result back to host alpaka::memcpy(queue, bufHostC, bufDevC); diff --git a/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/example/monteCarloIntegration/src/monteCarloIntegration.cpp index fd0961979b36..152fa7644643 100644 --- a/example/monteCarloIntegration/src/monteCarloIntegration.cpp +++ b/example/monteCarloIntegration/src/monteCarloIntegration.cpp @@ -113,15 +113,15 @@ auto example(TAccTag const&) -> int alpaka::memcpy(queue, bufAcc, bufHost); Kernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{}); + auto const& kernelBundle = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{}); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + 
kernelBundle, Vec(numThreads), Vec(numAlpakaElementsPerThread)); - alpaka::exec(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{}); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::memcpy(queue, bufHost, bufAcc); alpaka::wait(queue); diff --git a/example/openMPSchedule/src/openMPSchedule.cpp b/example/openMPSchedule/src/openMPSchedule.cpp index 1febb42cd685..60ed66e2eee5 100644 --- a/example/openMPSchedule/src/openMPSchedule.cpp +++ b/example/openMPSchedule/src/openMPSchedule.cpp @@ -107,25 +107,25 @@ auto main() -> int Idx const threadsPerGrid = 16u; Idx const elementsPerThread = 1u; - OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel; - auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel); + alpaka::KernelBundle kernelBundle = OpenMPScheduleDefaultKernel{}; + // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, threadsPerGrid, elementsPerThread); // Run the kernel setting no schedule explicitly. 
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n"; - alpaka::exec(queue, workDiv, openMPScheduleDefaultKernel); + alpaka::exec(queue, workDiv, kernelBundle); alpaka::wait(queue); // Run the kernel setting the schedule via a trait std::cout << "\n\nOpenMPScheduleMemberKernel setting the schedule via a static member:\n"; alpaka::exec(queue, workDiv, OpenMPScheduleMemberKernel{}); alpaka::wait(queue); // Run the kernel setting the schedule via a trait std::cout << "\n\nOpenMPScheduleTraitKernel setting the schedule via trait:\n"; alpaka::exec(queue, workDiv, OpenMPScheduleTraitKernel{}); alpaka::wait(queue); return EXIT_SUCCESS; diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index b5b45a5ef423..0194d7e747bb 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -202,16 +202,17 @@ auto example(TAccTag const&) -> int auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0]; - auto const& bundeledKernelInitRandom + auto const& kernelBundleInitRandom = alpaka::KernelBundle(initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDivInitRandom - = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernelInitRandom, extent, Vec(perThreadY, perThreadX)); + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundleInitRandom, extent, Vec(perThreadY, perThreadX)); - alpaka::exec(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); + alpaka::exec(queue, workDivInitRandom, kernelBundleInitRandom); alpaka::wait(queue); + // execute the same kernel with different pointers alpaka::exec(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV); alpaka::wait(queue); auto pitchHostS = 
alpaka::getPitchesInBytes(bufHostS)[0]; @@ -230,7 +231,7 @@ auto example(TAccTag const&) -> int alpaka::memcpy(queue, bufAccS, bufHostS); RunTimestepKernelSingle runTimestepKernelSingle; - auto const& bundeledKernelRuntimeStep = alpaka::KernelBundle( + auto const& kernelBundleRuntimeStep = alpaka::KernelBundle( runTimestepKernelSingle, extent, ptrBufAccRandS, @@ -239,21 +240,10 @@ auto example(TAccTag const&) -> int pitchBufAccS); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDivRuntimeStep = alpaka::getValidWorkDivForKernel( - devAcc, - bundeledKernelRuntimeStep, - extent, - Vec(perThreadY, perThreadX)); + auto const workDivRuntimeStep + = alpaka::getValidWorkDivForKernel(devAcc, kernelBundleRuntimeStep, extent, Vec(perThreadY, perThreadX)); - alpaka::exec( - queue, - workDivRuntimeStep, - runTimestepKernelSingle, - extent, - ptrBufAccRandS, - ptrBufAccS, - pitchBufAccRandS, - pitchBufAccS); + alpaka::exec(queue, workDivRuntimeStep, kernelBundleRuntimeStep); alpaka::memcpy(queue, bufHostS, bufAccS); auto pitchBufAccV = alpaka::getPitchesInBytes(bufAccV)[0]; diff --git a/example/randomStrategies/src/randomStrategies.cpp b/example/randomStrategies/src/randomStrategies.cpp index ea87d290a2c4..0e4c0b125c70 100644 --- a/example/randomStrategies/src/randomStrategies.cpp +++ b/example/randomStrategies/src/randomStrategies.cpp @@ -247,7 +247,7 @@ void runStrategy(Box& box) // the initial parameters solely from the thread index - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( initRandomKernel, box.extentRand, ptrBufAccRand, @@ -256,7 +256,7 @@ void runStrategy(Box& box) // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDivRand = alpaka::getValidWorkDivForKernel::Acc>( alpaka::getDevByIdx(box.accPlatform, 0), - bundeledKernel, + kernelBundle, box.extentRand, typename Box::Vec(typename Box::Idx{1}), false, @@ -266,11 +266,7 @@ 
void runStrategy(Box& box) alpaka::exec::Acc>( box.queue, workDivRand, - initRandomKernel, - box.extentRand, - ptrBufAccRand, - static_cast( - box.extentResult[0] / box.extentRand[0])); // == NUM_ROLLS; amount of work to be performed by each thread + kernelBundle); alpaka::wait(box.queue); @@ -291,27 +287,21 @@ void runStrategy(Box& box) alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult); FillKernel fillKernel; - auto const& bundeledKernelFill - = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult); + auto const& kernelBundleFill = alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workdivResult = alpaka::getValidWorkDivForKernel::Acc>( alpaka::getDevByIdx(box.accPlatform, 0), - bundeledKernelFill, + kernelBundleFill, box.extentResult, + // One thread per "point"; each performs NUM_ROLLS "rolls" typename Box::Vec(static_cast::Idx>( - NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls" + NUM_ROLLS)), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); - alpaka::exec::Acc>( - box.queue, - workdivResult, - fillKernel, - box.extentResult, - ptrBufAccRand, - ptrBufAccResult); + alpaka::exec::Acc>(box.queue, workdivResult, kernelBundleFill); alpaka::memcpy(box.queue, box.bufHostResult, box.bufAccResult); alpaka::wait(box.queue); diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index a99393fb8b5b..04d1aad73dc8 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -130,23 +130,17 @@ auto example(TAccTag const&) -> int // Instantiate the kernel function object VectorAddKernel kernel; - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( kernel, alpaka::getPtrNative(bufAccA), alpaka::getPtrNative(bufAccB), alpaka::getPtrNative(bufAccC), numElements); // 
Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, elementsPerThread); + auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, extent, elementsPerThread); // Create the kernel execution task. - auto const taskKernel = alpaka::createTaskKernel( - workDiv, - kernel, - std::data(bufAccA), - std::data(bufAccB), - std::data(bufAccC), - numElements); + auto const& taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); // Enqueue the kernel execution task { diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp index 89d17d95b34f..5b146818cff0 100644 --- a/include/alpaka/test/KernelExecutionFixture.hpp +++ b/include/alpaka/test/KernelExecutionFixture.hpp @@ -62,21 +62,22 @@ namespace alpaka::test memset(m_queue, bufAccResult, static_cast(true)); - auto bundeledKernel = alpaka::KernelBundle( - kernelFnObj, - getPtrNative(bufAccResult), - std::forward(args)...); + auto const& kernelBundle + = alpaka::KernelBundle( + kernelFnObj, + getPtrNative(bufAccResult), + std::forward(args)...); // set workdiv if it is not before if(m_workDiv == WorkDiv{Vec::all(0), Vec::all(0), Vec::all(0)}) m_workDiv = alpaka::getValidWorkDivForKernel>( m_device, - bundeledKernel, + kernelBundle, m_extent, Vec::ones()); - exec(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward(args)...); + exec(m_queue, m_workDiv, kernelBundle); // Copy the result value to the host auto bufHostResult = allocBuf(m_devHost, static_cast(1u)); diff --git a/test/integ/axpy/src/axpy.cpp b/test/integ/axpy/src/axpy.cpp index 4553dba458f9..cbbaaee6b66b 100644 --- a/test/integ/axpy/src/axpy.cpp +++ b/test/integ/axpy/src/axpy.cpp @@ -91,7 +91,6 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) // Get a queue on this device. 
QueueAcc queue(devAcc); - alpaka::Vec const extent(numElements); // Allocate host memory buffers in pinned memory. @@ -146,13 +145,12 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) std::cout << std::endl; #endif - - auto const& bundeledKernel + auto const& kernelBundle = alpaka::KernelBundle(kernel, numElements, alpha, std::data(memBufAccX), std::data(memBufAccY)); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, extent, static_cast(3u), false, @@ -164,13 +162,7 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) << std::endl; // Create the kernel execution task. - auto const taskKernel = alpaka::createTaskKernel( - workDiv, - kernel, - numElements, - alpha, - std::data(memBufAccX), - std::data(memBufAccY)); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); // Profile the kernel execution. std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms" diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp index 6424b3e986e1..b6ab824d63f8 100644 --- a/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/test/integ/mandelbrot/src/mandelbrot.cpp @@ -307,7 +307,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc); CHECK(rowPitch % sizeof(Val) == 0); - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( kernel, std::data(bufColorAcc), numRows, @@ -321,7 +321,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, extent, alpaka::Vec::ones(), false, @@ -334,19 +334,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", 
"[mandelbrot]", TestAccs) << ", kernel: " << alpaka::core::demangled << ", workDiv: " << workDiv << ")" << std::endl; - - auto const taskKernel = alpaka::createTaskKernel( - workDiv, - kernel, - std::data(bufColorAcc), - numRows, - numCols, - rowPitch, - fMinR, - fMaxR, - fMinI, - fMaxI, - maxIterations); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); // Profile the kernel execution. std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms" diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp index 41e2e4f9cdb0..0ead4f1761fe 100644 --- a/test/integ/matMul/src/matMul.cpp +++ b/test/integ/matMul/src/matMul.cpp @@ -244,7 +244,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n"; - auto const& bundeledKernel = alpaka::KernelBundle( + auto const& kernelBundle = alpaka::KernelBundle( kernel, m, n, @@ -260,7 +260,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, extentC, alpaka::Vec::ones(), false, @@ -272,22 +272,8 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) << ", kernel: " << alpaka::core::demangled << ", workDiv: " << workDiv << ")" << std::endl; - // Create the kernel execution task. - auto const taskKernel = alpaka::createTaskKernel( - workDiv, - kernel, - m, - n, - k, - static_cast(1), - std::data(bufAAcc), - lda, - std::data(bufBAcc), - ldb, - static_cast(1), - std::data(bufCAcc), - ldc); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); // Profile the kernel execution. 
std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms" diff --git a/test/integ/separableCompilation/src/main.cpp b/test/integ/separableCompilation/src/main.cpp index 3fb4f3245682..0020e6ddf8e7 100644 --- a/test/integ/separableCompilation/src/main.cpp +++ b/test/integ/separableCompilation/src/main.cpp @@ -111,23 +111,21 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc alpaka::memcpy(queueAcc, memBufAccA, memBufHostA); alpaka::memcpy(queueAcc, memBufAccB, memBufHostB); - auto const& bundeledKernel - = alpaka::KernelBundle(kernel, memBufAccA.data(), memBufAccB.data(), memBufAccC.data(), numElements); + auto const& kernelBundle = alpaka::KernelBundle( + kernel, + std::data(memBufAccA), + std::data(memBufAccB), + std::data(memBufAccC), + numElements); // Let alpaka calculate good block and grid sizes given our full problem extent - auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, bundeledKernel, extent, static_cast(3u)); + auto const workDiv = alpaka::getValidWorkDivForKernel(devAcc, kernelBundle, extent, static_cast(3u)); std::cout << alpaka::core::demangled << "(" << "accelerator: " << alpaka::getAccName() << ", workDiv: " << workDiv << ", numElements:" << numElements << ")" << std::endl; // Create the executor task. - auto const taskKernel = alpaka::createTaskKernel( - workDiv, - kernel, - memBufAccA.data(), - memBufAccB.data(), - memBufAccC.data(), - numElements); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); // Profile the kernel execution. 
std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms" diff --git a/test/integ/sharedMem/src/sharedMem.cpp b/test/integ/sharedMem/src/sharedMem.cpp index 0377f623a5e1..db5648c90be4 100644 --- a/test/integ/sharedMem/src/sharedMem.cpp +++ b/test/integ/sharedMem/src/sharedMem.cpp @@ -131,14 +131,14 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs) auto blockRetValuesDummy = alpaka::allocBuf(devAcc, static_cast(1)); - // Kernel input during the runtim of kernel will be different and is chosen to depend on workdiv. + // Kernel input during the runtime of kernel will be different and is chosen to depend on workdiv. // Therefore initially a workdiv is needed to find the parameter. Therefore in kernel bundle, we can not use the // real input for the buffer pointer. - auto const& bundeledKernel = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy)); + auto const& kernelBundle = alpaka::KernelBundle(kernel, std::data(blockRetValuesDummy)); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, numElements, static_cast(1u), false, @@ -149,6 +149,7 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs) << ", kernel: " << alpaka::core::demangled << ", workDiv: " << workDiv << ")" << std::endl; + // Data size depends on workdiv Idx const gridBlocksCount(alpaka::getWorkDiv(workDiv)[0u]); Idx const blockThreadCount(alpaka::getWorkDiv(workDiv)[0u]); @@ -160,8 +161,11 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs) auto blockRetValsAcc = alpaka::allocBuf(devAcc, resultElemCount); alpaka::memcpy(queue, blockRetValsAcc, blockRetVals, resultElemCount); - // Create the kernel execution task. - auto const taskKernel = alpaka::createTaskKernel(workDiv, kernel, std::data(blockRetValsAcc)); + + // Create the kernel execution task using the real data. 
+ auto const& kernelBundle2 = alpaka::KernelBundle(kernel, std::data(blockRetValsAcc)); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle2); + // Profile the kernel execution. std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms" diff --git a/test/unit/math/src/TestTemplate.hpp b/test/unit/math/src/TestTemplate.hpp index 78607712da93..bfdff9419313 100644 --- a/test/unit/math/src/TestTemplate.hpp +++ b/test/unit/math/src/TestTemplate.hpp @@ -73,7 +73,6 @@ namespace mathtest // SETUP (defines and initialising) // DevAcc is defined in Buffer.hpp too. using DevAcc = alpaka::Dev; - using QueueAcc = alpaka::test::DefaultQueue; using TArgsItem = ArgsItem; @@ -99,12 +98,12 @@ namespace mathtest Results results{devAcc}; - auto const& bundeledKernel + auto const& kernelBundle = alpaka::KernelBundle(kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer); // Let alpaka calculate good block and grid sizes given our full problem extent auto const workDiv = alpaka::getValidWorkDivForKernel( devAcc, - bundeledKernel, + kernelBundle, sizeExtent, elementsPerThread, false, @@ -123,8 +122,7 @@ namespace mathtest results.copyToDevice(queue); // Enqueue the kernel execution task. - auto const taskKernel - = alpaka::createTaskKernel(workDiv, kernel, results.pDevBuffer, wrappedFunctor, args.pDevBuffer); + auto const taskKernel = alpaka::createTaskKernel(workDiv, kernelBundle); alpaka::enqueue(queue, taskKernel); // Copy back the results (encapsulated in the buffer class).