From c4351748038f73e2785ce9f26cd662aa50b0da46 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 19 Mar 2024 22:05:34 +0100 Subject: [PATCH] Improve error handling in GPU callbacks Update the callbacks used on the CUDA and HIP/ROCm backends to match the original CUDA implementation: in case of asynchronous errors, throw-catch an exception to let GDB intercept it, and propagate the exception to the framework. --- .../AlpakaCore/src/alpaka/EDMetadata.cc | 4 +- .../AlpakaInterface/BuildFile.xml | 1 + .../AlpakaInterface/interface/HostOnlyTask.h | 44 +++++++++++++++---- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc b/HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc index 16ff44581586c..8847d9b3f8e71 100644 --- a/HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc +++ b/HeterogeneousCore/AlpakaCore/src/alpaka/EDMetadata.cc @@ -23,10 +23,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } void EDMetadata::enqueueCallback(edm::WaitingTaskWithArenaHolder holder) { - alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)]() { + alpaka::enqueue(*queue_, alpaka::HostOnlyTask([holder = std::move(holder)](std::exception_ptr eptr) { // The functor is required to be const, but the original waitingTaskHolder_ // needs to be notified... - const_cast(holder).doneWaiting(nullptr); + const_cast(holder).doneWaiting(eptr); })); } diff --git a/HeterogeneousCore/AlpakaInterface/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/BuildFile.xml index e9f4ecab743c1..8594a75cfe2fe 100644 --- a/HeterogeneousCore/AlpakaInterface/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h b/HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h index fc07921e24e12..aea82f0082f95 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h +++ b/HeterogeneousCore/AlpakaInterface/interface/HostOnlyTask.h @@ -4,6 +4,8 @@ #include #include +#include + #include namespace alpaka { @@ -14,12 +16,12 @@ namespace alpaka { //! dedicated host-side worker thread. class HostOnlyTask { public: - HostOnlyTask(std::function task) : task_(std::move(task)) {} + HostOnlyTask(std::function task) : task_(std::move(task)) {} - void operator()() const { task_(); } + void operator()(std::exception_ptr eptr) const { task_(eptr); } private: - std::function task_; + std::function task_; }; namespace trait { @@ -30,10 +32,22 @@ namespace alpaka { struct Enqueue { using TApi = ApiCudaRt; - static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { - //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + static void CUDART_CB callback(cudaStream_t queue, cudaError_t status, void* arg) { std::unique_ptr pTask(static_cast(arg)); - (*pTask)(); + if (status == cudaSuccess) { + (*pTask)(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + throw std::runtime_error(fmt::format("CUDA error: callback of stream {} received error {}: {}.", + fmt::ptr(queue), + cudaGetErrorName(status), + cudaGetErrorString(status))); + } catch (std::exception&) { + // pass the exception to the task + (*pTask)(std::current_exception()); + } + } } ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { @@ -50,10 +64,22 @@ namespace alpaka { struct Enqueue { using TApi = ApiHipRt; - static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { - //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + static void callback(hipStream_t queue, hipError_t status, void* arg) { std::unique_ptr pTask(static_cast(arg)); - (*pTask)(); + if (status == hipSuccess) { + (*pTask)(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + throw std::runtime_error(fmt::format("HIP error: callback of stream {} received error {}: {}.", + fmt::ptr(queue), + hipGetErrorName(status), + hipGetErrorString(status))); + } catch (std::exception&) { + // pass the exception to the task + (*pTask)(std::current_exception()); + } + } } ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {