diff --git a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
index 22f83f53710d..69eb9ae3c80b 100644
--- a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
+++ b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -153,41 +153,41 @@ namespace alpaka
                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading all with cudaGetDeviceProperties
                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
                int multiProcessorCount = {};
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &multiProcessorCount,
                    cudaDevAttrMultiProcessorCount,
                    dev.m_iDevice));

                int maxGridSize[3] = {};
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxGridSize[0],
                    cudaDevAttrMaxGridDimX,
                    dev.m_iDevice));
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxGridSize[1],
                    cudaDevAttrMaxGridDimY,
                    dev.m_iDevice));
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxGridSize[2],
                    cudaDevAttrMaxGridDimZ,
                    dev.m_iDevice));

                int maxBlockDim[3] = {};
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxBlockDim[0],
                    cudaDevAttrMaxBlockDimX,
                    dev.m_iDevice));
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxBlockDim[1],
                    cudaDevAttrMaxBlockDimY,
                    dev.m_iDevice));
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxBlockDim[2],
                    cudaDevAttrMaxBlockDimZ,
                    dev.m_iDevice));

                int maxThreadsPerBlock = {};
-                ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaDeviceGetAttribute(
                    &maxThreadsPerBlock,
                    cudaDevAttrMaxThreadsPerBlock,
                    dev.m_iDevice));
@@ -219,7 +219,7 @@ namespace alpaka
#else
                hipDeviceProp_t hipDevProp;
-                ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipGetDeviceProperties(
                    &hipDevProp,
                    dev.m_iDevice));
diff --git a/include/alpaka/core/Cuda.hpp b/include/alpaka/core/Cuda.hpp
index 2a559b1cfb5a..098debafd175 100644
--- a/include/alpaka/core/Cuda.hpp
+++ b/include/alpaka/core/Cuda.hpp
@@ -34,10 +34,8 @@
#include
#include
-#include
#include
#include
-#include
#include
#include
#include
@@ -50,97 +48,10 @@
    #error "CUDA version 9.0 or greater required!"
#endif

-namespace alpaka
-{
-    namespace cuda
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto cudaRtCheck(
-                cudaError_t const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + cudaGetErrorName(error) + "': '" + std::string(cudaGetErrorString(error)) + "'!");
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << sError << std::endl;
-#endif
-                    ALPAKA_DEBUG_BREAK;
-                    // reset the last error to allow user side error handling
-                    cudaGetLastError();
-                    throw std::runtime_error(sError);
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            // NOTE: All ignored errors have to be convertible to cudaError_t.
-            template<
-                typename... TErrors>
-            ALPAKA_FN_HOST auto cudaRtCheckIgnore(
-                cudaError_t const & error,
-                char const * cmd,
-                char const * file,
-                int const & line,
-                TErrors && ... ignoredErrorCodes)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    std::array<cudaError_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{ignoredErrorCodes...};
-
-                    // If the error code is not one of the ignored ones.
-                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
-                    {
-                        cudaRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
-                    }
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API last error checking with log and exception.
-            ALPAKA_FN_HOST inline auto cudaRtCheckLastError(
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                cudaError_t const error(cudaGetLastError());
-                cudaRtCheck(error, desc, file, line);
-            }
-        }
-    }
-}
-
-#if BOOST_COMP_MSVC
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
-#else
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic push
-        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-    #endif
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic pop
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-//! CUDA runtime error checking with log and exception.
-#define ALPAKA_CUDA_RT_CHECK(cmd)\
-    ALPAKA_CUDA_RT_CHECK_IGNORE(cmd)
+#define ALPAKA_PP_CONCAT_DO(X,Y) X##Y
+#define ALPAKA_PP_CONCAT(X,Y) ALPAKA_PP_CONCAT_DO(X,Y)
+//! prefix a name with `cuda`
+#define ALPAKA_API_PREFIX(name) ALPAKA_PP_CONCAT_DO(cuda,name)

namespace alpaka
{
@@ -747,4 +658,6 @@ namespace alpaka
    }
}

+#include <alpaka/core/UniformCudaHip.hpp>
+
#endif
diff --git a/include/alpaka/core/Hip.hpp b/include/alpaka/core/Hip.hpp
index df6ada481efe..79d264430995 100644
--- a/include/alpaka/core/Hip.hpp
+++ b/include/alpaka/core/Hip.hpp
@@ -26,107 +26,18 @@
#include
-#include
#include
#include
-#include
-#include
-#include
#include
#ifdef __HIP_PLATFORM_HCC__
    #define HIPRT_CB
#endif
-
-namespace alpaka
-{
-    namespace hip
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //!
HIP runtime API error checking with log and exception, ignoring specific error values - ALPAKA_FN_HOST inline auto hipRtCheck( - hipError_t const & error, - char const * desc, - char const * file, - int const & line) - -> void - { - if(error != hipSuccess) - { - std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + hipGetErrorName(error) + "': '" + std::string(hipGetErrorString(error)) + "'!"); -#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - std::cerr << sError << std::endl; -#endif - ALPAKA_DEBUG_BREAK; - throw std::runtime_error(sError); - } - } - //----------------------------------------------------------------------------- - //! HIP runtime API error checking with log and exception, ignoring specific error values - // NOTE: All ignored errors have to be convertible to hipError_t. - template< - typename... TErrors> - ALPAKA_FN_HOST auto hipRtCheckIgnore( - hipError_t const & error, - char const * cmd, - char const * file, - int const & line, - TErrors && ... ignoredErrorCodes) - -> void - { - if(error != hipSuccess) - { - std::array const aIgnoredErrorCodes{ignoredErrorCodes...}; - // If the error code is not one of the ignored ones. - if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend()) - { - hipRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line); - } - } - } - //----------------------------------------------------------------------------- - //! HIP runtime API last error checking with log and exception. - ALPAKA_FN_HOST inline auto hipRtCheckLastError( - char const * desc, - char const * file, - int const & line) - -> void - { - hipError_t const error(hipGetLastError()); - hipRtCheck(error, desc, file, line); - } - } - } -} - -#if BOOST_COMP_MSVC - //----------------------------------------------------------------------------- - //! HIP runtime error checking with log and exception, ignoring specific error values - #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\ - ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\ - ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__) -#else - #if BOOST_COMP_CLANG - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" - #endif - //----------------------------------------------------------------------------- - //! HIP runtime error checking with log and exception, ignoring specific error values - #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\ - ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\ - ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__) - #if BOOST_COMP_CLANG - #pragma clang diagnostic pop - #endif -#endif - -//----------------------------------------------------------------------------- -//! HIP runtime error checking with log and exception. -#define ALPAKA_HIP_RT_CHECK(cmd)\ - ALPAKA_HIP_RT_CHECK_IGNORE(cmd) +#define ALPAKA_PP_CONCAT_DO(X,Y) X##Y +#define ALPAKA_PP_CONCAT(X,Y) ALPAKA_PP_CONCAT_DO(X,Y) +//! prefix a name with `hip` +#define ALPAKA_API_PREFIX(name) ALPAKA_PP_CONCAT_DO(hip,name) //----------------------------------------------------------------------------- // HIP vector_types.h trait specializations. 
@@ -704,4 +615,6 @@ namespace alpaka
    }
}

+#include <alpaka/core/UniformCudaHip.hpp>
+
#endif
diff --git a/include/alpaka/core/UniformCudaHip.hpp b/include/alpaka/core/UniformCudaHip.hpp
new file mode 100644
index 000000000000..44ad40de4d96
--- /dev/null
+++ b/include/alpaka/core/UniformCudaHip.hpp
@@ -0,0 +1,136 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#include <alpaka/core/BoostPredef.hpp>
+
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#endif
+
+#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#endif
+
+// Backend specific includes.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    #include <alpaka/core/Cuda.hpp>
+#else
+    #include <alpaka/core/Hip.hpp>
+#endif
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            using Error_t = cudaError;
+#else
+            using Error_t = hipError_t;
+#endif
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API error checking with log and exception.
+            ALPAKA_FN_HOST inline auto rtCheck(
+                Error_t const & error,
+                char const * desc,
+                char const * file,
+                int const & line)
+            -> void
+            {
+                if(error != ALPAKA_API_PREFIX(Success))
+                {
+                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + ALPAKA_API_PREFIX(GetErrorName)(error) + "': '" + std::string(ALPAKA_API_PREFIX(GetErrorString)(error)) + "'!");
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                    std::cerr << sError << std::endl;
+#endif
+                    ALPAKA_DEBUG_BREAK;
+                    // reset the last error to allow user side error handling
+                    ALPAKA_API_PREFIX(GetLastError)();
+                    throw std::runtime_error(sError);
+                }
+            }
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API error checking with log and exception, ignoring specific error values
+            // NOTE: All ignored errors have to be convertible to Error_t.
+            template<
+                typename... TErrors>
+            ALPAKA_FN_HOST auto rtCheckIgnore(
+                Error_t const & error,
+                char const * cmd,
+                char const * file,
+                int const & line,
+                TErrors && ... ignoredErrorCodes)
+            -> void
+            {
+                if(error != ALPAKA_API_PREFIX(Success))
+                {
+                    std::array<Error_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{ignoredErrorCodes...};
+
+                    // If the error code is not one of the ignored ones.
+                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
+                    {
+                        rtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
+                    }
+                }
+            }
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API last error checking with log and exception.
+            ALPAKA_FN_HOST inline auto rtCheckLastError(
+                char const * desc,
+                char const * file,
+                int const & line)
+            -> void
+            {
+                Error_t const error(ALPAKA_API_PREFIX(GetLastError)());
+                rtCheck(error, desc, file, line);
+            }
+        }
+    }
+}
+
+#if BOOST_COMP_MSVC
+    //-----------------------------------------------------------------------------
+    //!
+    //! CUDA/HIP runtime error checking with log and exception, ignoring specific error values
+    #define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)\
+        ::alpaka::uniform_cuda_hip::detail::rtCheckLastError("'" #cmd "' A previous API call (not this one) set the error ", __FILE__, __LINE__);\
+        ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
+#else
+    #if BOOST_COMP_CLANG
+        #pragma clang diagnostic push
+        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
+    #endif
+    //-----------------------------------------------------------------------------
+    //! CUDA/HIP runtime error checking with log and exception, ignoring specific error values
+    #define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)\
+        ::alpaka::uniform_cuda_hip::detail::rtCheckLastError("'" #cmd "' A previous API call (not this one) set the error ", __FILE__, __LINE__);\
+        ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
+    #if BOOST_COMP_CLANG
+        #pragma clang diagnostic pop
+    #endif
+#endif
+
+//-----------------------------------------------------------------------------
+//! CUDA/HIP runtime error checking with log and exception.
+#define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd)\
+    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd)
+
+#endif
diff --git a/include/alpaka/dev/DevUniformCudaHipRt.hpp b/include/alpaka/dev/DevUniformCudaHipRt.hpp
index 226f32c3070b..e6b69ac2f569 100644
--- a/include/alpaka/dev/DevUniformCudaHipRt.hpp
+++ b/include/alpaka/dev/DevUniformCudaHipRt.hpp
@@ -116,24 +116,18 @@ namespace alpaka
                dev::DevUniformCudaHipRt const & dev)
            -> std::string
            {
+                // There is cuda/hip-DeviceGetAttribute as a faster alternative to cuda/hip-GetDeviceProperties for querying a single device property, but it has no option to get the name.
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                // There is cudaDeviceGetAttribute as faster alternative to cudaGetDeviceProperties to get a single device property but it has no option to get the name
-                cudaDeviceProp cudaDevProp;
-                ALPAKA_CUDA_RT_CHECK(
-                    cudaGetDeviceProperties(
-                        &cudaDevProp,
-                        dev.m_iDevice));
-
-                return std::string(cudaDevProp.name);
+                cudaDeviceProp devProp;
#else
-                hipDeviceProp_t hipDevProp;
-                ALPAKA_HIP_RT_CHECK(
-                    hipGetDeviceProperties(
-                        &hipDevProp,
+                hipDeviceProp_t devProp;
+#endif
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(GetDeviceProperties)(
+                        &devProp,
                        dev.m_iDevice));

-                return std::string(hipDevProp.name);
-#endif
+                return std::string(devProp.name);
            }
        };
@@ -148,37 +142,20 @@ namespace alpaka
                dev::DevUniformCudaHipRt const & dev)
            -> std::size_t
            {
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                // Set the current device to wait for.
-                ALPAKA_CUDA_RT_CHECK(
-                    cudaSetDevice(
-                        dev.m_iDevice));
-
-                std::size_t freeInternal(0u);
-                std::size_t totalInternal(0u);
-
-                ALPAKA_CUDA_RT_CHECK(
-                    cudaMemGetInfo(
-                        &freeInternal,
-                        &totalInternal));
-
-                return totalInternal;
-#else
-                // Set the current device to wait for.
-                ALPAKA_HIP_RT_CHECK(
-                    hipSetDevice(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(SetDevice)(
                        dev.m_iDevice));

                std::size_t freeInternal(0u);
                std::size_t totalInternal(0u);

-                ALPAKA_HIP_RT_CHECK(
-                    hipMemGetInfo(
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(MemGetInfo)(
                        &freeInternal,
                        &totalInternal));

                return totalInternal;
-#endif
            }
        };
@@ -193,37 +170,20 @@ namespace alpaka
                dev::DevUniformCudaHipRt const & dev)
            -> std::size_t
            {
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                // Set the current device to wait for.
- ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - dev.m_iDevice)); - - std::size_t freeInternal(0u); - std::size_t totalInternal(0u); - - ALPAKA_CUDA_RT_CHECK( - cudaMemGetInfo( - &freeInternal, - &totalInternal)); - - return freeInternal; -#else // Set the current device to wait for. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( dev.m_iDevice)); std::size_t freeInternal(0u); std::size_t totalInternal(0u); - ALPAKA_HIP_RT_CHECK( - hipMemGetInfo( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemGetInfo)( &freeInternal, &totalInternal)); return freeInternal; -#endif } }; @@ -240,21 +200,12 @@ namespace alpaka { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED // Set the current device to wait for. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( dev.m_iDevice)); - ALPAKA_CUDA_RT_CHECK( - cudaDeviceReset()); -#else - // Set the current device to wait for. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - dev.m_iDevice)); - ALPAKA_HIP_RT_CHECK( - hipDeviceReset()); -#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(DeviceReset)()); } }; } @@ -321,17 +272,11 @@ namespace alpaka -> void { ALPAKA_DEBUG_FULL_LOG_SCOPE; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - // Set the current device to wait for. - ALPAKA_CUDA_RT_CHECK(cudaSetDevice( - dev.m_iDevice)); - ALPAKA_CUDA_RT_CHECK(cudaDeviceSynchronize()); -#else + // Set the current device to wait for. - ALPAKA_HIP_RT_CHECK(hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)( dev.m_iDevice)); - ALPAKA_HIP_RT_CHECK(hipDeviceSynchronize()); -#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceSynchronize)()); } }; } diff --git a/include/alpaka/event/EventUniformCudaHipRt.hpp b/include/alpaka/event/EventUniformCudaHipRt.hpp index 5c9107fe1d32..336b77a9ed61 100644 --- a/include/alpaka/event/EventUniformCudaHipRt.hpp +++ b/include/alpaka/event/EventUniformCudaHipRt.hpp @@ -62,31 +62,21 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - m_dev.m_iDevice)); - // Create the event on the current device with the specified flags. Valid flags include: - // - cudaEventDefault: Default event creation flag. - // - cudaEventBlockingSync : Specifies that event should use blocking synchronization. - // A host thread that uses cudaEventSynchronize() to wait on an event created with this flag will block until the event actually completes. - // - cudaEventDisableTiming : Specifies that the created event does not need to record timing data. - // Events created with this flag specified and the cudaEventBlockingSync flag not specified will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery(). - ALPAKA_CUDA_RT_CHECK( - cudaEventCreateWithFlags( - &m_UniformCudaHipEvent, - (bBusyWait ? cudaEventDefault : cudaEventBlockingSync) | cudaEventDisableTiming)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( m_dev.m_iDevice)); - ALPAKA_HIP_RT_CHECK( - hipEventCreateWithFlags( + // Create the event on the current device with the specified flags. Valid flags include: + // - cuda/hip-EventDefault: Default event creation flag. + // - cuda/hip-EventBlockingSync : Specifies that event should use blocking synchronization. 
+                    //   A host thread that uses cuda/hip-EventSynchronize() to wait on an event created with this flag will block until the event actually completes.
+                    // - cuda/hip-EventDisableTiming : Specifies that the created event does not need to record timing data.
+                    //   Events created with this flag specified and the cuda/hip-EventBlockingSync flag not specified will provide the best performance when used with cuda/hip-StreamWaitEvent() and cuda/hip-EventQuery().
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        ALPAKA_API_PREFIX(EventCreateWithFlags)(
                            &m_UniformCudaHipEvent,
-                            (bBusyWait ? hipEventDefault : hipEventBlockingSync) | hipEventDisableTiming));
-#endif
+                            (bBusyWait ? ALPAKA_API_PREFIX(EventDefault) : ALPAKA_API_PREFIX(EventBlockingSync)) | ALPAKA_API_PREFIX(EventDisableTiming)));
                }
                //-----------------------------------------------------------------------------
                EventUniformCudaHipImpl(EventUniformCudaHipImpl const &) = delete;
@@ -101,30 +91,21 @@ namespace alpaka
                {
                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

-                    // Set the current device. \TODO: Is setting the current device before cudaEventDestroy required?
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                    ALPAKA_CUDA_RT_CHECK(cudaSetDevice(
+                    // Set the current device. \TODO: Is setting the current device before cuda/hip-EventDestroy required?
+
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(
                        m_dev.m_iDevice));
-                    // In case event has been recorded but has not yet been completed when cudaEventDestroy() is called, the function will return immediately
+                    // In case the event has been recorded but has not yet been completed when cuda/hip-EventDestroy() is called, the function will return immediately
                    // and the resources associated with event will be released automatically once the device has completed event.
                    // -> No need to synchronize here.
-                    ALPAKA_CUDA_RT_CHECK(cudaEventDestroy(
-                        m_UniformCudaHipEvent));
-#else
-                    ALPAKA_HIP_RT_CHECK(hipSetDevice(
-                        m_dev.m_iDevice));
-                    ALPAKA_HIP_RT_CHECK(hipEventDestroy(
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventDestroy)(
                        m_UniformCudaHipEvent));
-#endif
                }

            public:
                dev::DevUniformCudaHipRt const m_dev; //!< The device this event is bound to.
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                cudaEvent_t m_UniformCudaHipEvent;
-#else
-                hipEvent_t m_UniformCudaHipEvent;
-#endif
+
+                ALPAKA_API_PREFIX(Event_t) m_UniformCudaHipEvent;
            };
        }
    }
@@ -208,21 +189,12 @@ namespace alpaka
                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

                    // Query is allowed even for events on non current device.
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaError_t ret = cudaSuccess; - ALPAKA_CUDA_RT_CHECK_IGNORE( - ret = cudaEventQuery( + ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( + ret = ALPAKA_API_PREFIX(EventQuery)( event.m_spEventImpl->m_UniformCudaHipEvent), - cudaErrorNotReady); - return (ret == cudaSuccess); -#else - hipError_t ret = hipSuccess; - ALPAKA_HIP_RT_CHECK_IGNORE( - ret = hipEventQuery( - event.m_spEventImpl->m_UniformCudaHipEvent), - hipErrorNotReady); - return (ret == hipSuccess); -#endif + ALPAKA_API_PREFIX(ErrorNotReady)); + return (ret == ALPAKA_API_PREFIX(Success)); } }; } @@ -245,15 +217,10 @@ namespace alpaka -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaEventRecord( + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventRecord)( event.m_spEventImpl->m_UniformCudaHipEvent, queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - ALPAKA_HIP_RT_CHECK(hipEventRecord( - event.m_spEventImpl->m_UniformCudaHipEvent, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -270,15 +237,10 @@ namespace alpaka -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaEventRecord( - event.m_spEventImpl->m_UniformCudaHipEvent, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - ALPAKA_HIP_RT_CHECK(hipEventRecord( + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventRecord)( event.m_spEventImpl->m_UniformCudaHipEvent, queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; } @@ -304,13 +266,8 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Sync is allowed even for events on non current device. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaEventSynchronize( - event.m_spEventImpl->m_UniformCudaHipEvent)); -#else - ALPAKA_HIP_RT_CHECK(hipEventSynchronize( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventSynchronize)( event.m_spEventImpl->m_UniformCudaHipEvent)); -#endif } }; //############################################################################# @@ -328,17 +285,10 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamWaitEvent)( queue.m_spQueueImpl->m_UniformCudaHipQueue, event.m_spEventImpl->m_UniformCudaHipEvent, 0)); -#else - ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent( - queue.m_spQueueImpl->m_UniformCudaHipQueue, - event.m_spEventImpl->m_UniformCudaHipEvent, - 0)); -#endif } }; //############################################################################# @@ -356,17 +306,10 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamWaitEvent)( queue.m_spQueueImpl->m_UniformCudaHipQueue, event.m_spEventImpl->m_UniformCudaHipEvent, 0)); -#else - ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent( - queue.m_spQueueImpl->m_UniformCudaHipQueue, - event.m_spEventImpl->m_UniformCudaHipEvent, - 0)); -#endif } }; //############################################################################# @@ -387,28 +330,14 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Set the current device. 
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( dev.m_iDevice)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - dev.m_iDevice)); -#endif - -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamWaitEvent)( nullptr, event.m_spEventImpl->m_UniformCudaHipEvent, 0)); -#else - ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent( - nullptr, - event.m_spEventImpl->m_UniformCudaHipEvent, - 0)); -#endif } }; } diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index bbaadf830c91..b347c51239f5 100644 --- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -379,16 +379,10 @@ namespace alpaka #endif #endif -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - queue.m_spQueueImpl->m_dev.m_iDevice)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( queue.m_spQueueImpl->m_dev.m_iDevice)); -#endif // Enqueue the kernel execution. // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch language extension expects the arguments by value. // This forces the type of a float argument given with std::forward to this function to be of type float instead of e.g. "float const & __ptr64" (MSVC). @@ -421,19 +415,12 @@ namespace alpaka task.m_args); #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Wait for the kernel execution to finish but do not check error return of this call. - // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message. - cudaStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue); - std::string const msg("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with"); - ::alpaka::cuda::detail::cudaRtCheckLastError(msg.c_str(), __FILE__, __LINE__); - #else - hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue); - std::string const msg("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with"); - ::alpaka::hip::detail::hipRtCheckLastError(msg.c_str(), __FILE__, __LINE__); - #endif + // Wait for the kernel execution to finish but do not check error return of this call. + // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message. + ALPAKA_API_PREFIX(StreamSynchronize)( + queue.m_spQueueImpl->m_UniformCudaHipQueue); + std::string const msg("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with"); + ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(msg.c_str(), __FILE__, __LINE__); #endif } }; @@ -520,25 +507,13 @@ namespace alpaka auto kernelName = kernel::uniform_cuda_hip::detail::uniformCudaHipKernel...>; #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - // Log the function attributes. 
- #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaFuncAttributes funcAttrs; - cudaFuncGetAttributes(&funcAttrs, kernelName); - std::cout << __func__ - << " binaryVersion: " << funcAttrs.binaryVersion - << " constSizeBytes: " << funcAttrs.constSizeBytes << " B" - << " localSizeBytes: " << funcAttrs.localSizeBytes << " B" - << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock - << " numRegs: " << funcAttrs.numRegs - << " ptxVersion: " << funcAttrs.ptxVersion - << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" - << std::endl; - #else // hipFuncAttributes not ported from HIP to HIP. + // TODO why this is currently not possible + // // Log the function attributes. - /* TODO why this is currently not possible - hipFuncAttributes funcAttrs; - hipFuncGetAttributes(&funcAttrs, kernelName); + #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + ALPAKA_API_PREFIX(FuncAttributes) funcAttrs; + ALPAKA_API_PREFIX(FuncGetAttributes)(&funcAttrs, kernelName); std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion << " constSizeBytes: " << funcAttrs.constSizeBytes << " B" @@ -548,21 +523,14 @@ namespace alpaka << " ptxVersion: " << funcAttrs.ptxVersion << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl; - */ #endif - #endif // Set the current device. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( queue.m_spQueueImpl->m_dev.m_iDevice)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - queue.m_spQueueImpl->m_dev.m_iDevice)); -#endif + // Enqueue the kernel execution. meta::apply( [&](std::decay_t const & ... args) @@ -593,20 +561,11 @@ namespace alpaka // Wait for the kernel execution to finish but do not check error return of this call. // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaStreamSynchronize( + ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue); -#else - hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue); -#endif #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL std::string const msg("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with"); - #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ::alpaka::cuda::detail::cudaRtCheckLastError(msg.c_str(), __FILE__, __LINE__); - #else - ::alpaka::hip::detail::hipRtCheckLastError(msg.c_str(), __FILE__, __LINE__); - #endif + ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(msg.c_str(), __FILE__, __LINE__); #endif } }; diff --git a/include/alpaka/mem/buf/BufCpu.hpp b/include/alpaka/mem/buf/BufCpu.hpp index 46587c2542a1..a3560b906057 100644 --- a/include/alpaka/mem/buf/BufCpu.hpp +++ b/include/alpaka/mem/buf/BufCpu.hpp @@ -456,31 +456,19 @@ namespace alpaka if(!mem::buf::isPinned(buf)) { -#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP) if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0) { // - cudaHostRegisterDefault: // See http://cgi.cs.indiana.edu/~nhusted/dokuwiki/doku.php?id=programming:cudaperformance1 // - cudaHostRegisterPortable: // The memory returned by this call will be considered as pinned memory by all CUDA contexts, not just the one that performed the allocation. 
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        cudaHostRegister(
-                            const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                            extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                            cudaHostRegisterDefault),
-                        cudaErrorHostMemoryAlreadyRegistered);
-
-                    buf.m_spBufCpuImpl->m_bPinned = true;
-                }
-#elif (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
-                {
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        hipHostRegister(
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                        ALPAKA_API_PREFIX(HostRegister)(
                            const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
                            extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                            hipHostRegisterDefault),
-                        hipErrorHostMemoryAlreadyRegistered);
+                            ALPAKA_API_PREFIX(HostRegisterDefault)),
+                        ALPAKA_API_PREFIX(ErrorHostMemoryAlreadyRegistered));

                    buf.m_spBufCpuImpl->m_bPinned = true;
                }
@@ -527,18 +515,11 @@ namespace alpaka
                    if(mem::buf::isPinned(bufImpl))
                    {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-                        ALPAKA_CUDA_RT_CHECK_IGNORE(
-                            cudaHostUnregister(
-                                const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                            cudaErrorHostMemoryNotRegistered);
-
-                        bufImpl.m_bPinned = false;
-#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-                        ALPAKA_HIP_RT_CHECK_IGNORE(
-                            hipHostUnregister(
+#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                            ALPAKA_API_PREFIX(HostUnregister)(
                                const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                            hipErrorHostMemoryNotRegistered);
+                            ALPAKA_API_PREFIX(ErrorHostMemoryNotRegistered));

                        bufImpl.m_bPinned = false;
#else
diff --git a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
index 7c3fc9df94a7..565f2210ad61 100644
--- a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
+++ b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
@@ -117,23 +117,14 @@ namespace alpaka
                -> void
                {
                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Free the buffer.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaFree(reinterpret_cast<void *>(memPtr)));
-#else
+
                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        ALPAKA_API_PREFIX(SetDevice)(
                            dev.m_iDevice));
                    // Free the buffer.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipFree(reinterpret_cast<void *>(memPtr)));
-#endif
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        ALPAKA_API_PREFIX(Free)(reinterpret_cast<void *>(memPtr)));
                }

            public:
@@ -356,29 +347,17 @@ namespace alpaka
                    auto const width(extent::getWidth(extent));
                    auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Allocate the buffer on this device.
-                    void * memPtr;
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMalloc(
-                            &memPtr,
-                            static_cast<std::size_t>(widthBytes)));
-#else
+
                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        ALPAKA_API_PREFIX(SetDevice)(
                            dev.m_iDevice));
                    // Allocate the buffer on this device.
                    void * memPtr;
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMalloc(
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        ALPAKA_API_PREFIX(Malloc)(
                            &memPtr,
                            static_cast<std::size_t>(widthBytes)));
-#endif

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                    std::cout << __func__
@@ -420,44 +399,30 @@ namespace alpaka
                    auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
                    auto const height(extent::getHeight(extent));

-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Allocate the buffer on this device.
-                    void * memPtr;
-                    std::size_t pitchBytes;
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMallocPitch(
-                            &memPtr,
-                            &pitchBytes,
-                            static_cast<std::size_t>(widthBytes),
-                            static_cast<std::size_t>(height)));
-                    ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
-#else
+
                    void * memPtr = nullptr;
                    std::size_t pitchBytes = widthBytes;
-
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
                    //FIXME: hcc cannot handle zero-size input (throws Unknown Error)
-                    if(width!=0 && height!=0) {
+                    if(width!=0 && height!=0)
+#endif
+                    {
                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            ALPAKA_API_PREFIX(SetDevice)(
                                dev.m_iDevice));
                        // Allocate the buffer on this device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMallocPitch(
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            ALPAKA_API_PREFIX(MallocPitch)(
                                &memPtr,
                                &pitchBytes,
                                static_cast<std::size_t>(widthBytes),
                                static_cast<std::size_t>(height)));
                        ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
                    }
-#endif

#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                    std::cout << __func__
@@ -497,91 +462,52 @@ namespace alpaka
                {
                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    cudaExtent const cudaExtentVal(
-                        make_cudaExtent(
+                    ALPAKA_API_PREFIX(Extent) const extentVal(
+                        ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))(
                            static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
                            static_cast<std::size_t>(extent::getHeight(extent)),
                            static_cast<std::size_t>(extent::getDepth(extent))));
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Allocate the buffer on this device.
-                    cudaPitchedPtr cudaPitchedPtrVal;
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMalloc3D(
-                            &cudaPitchedPtrVal,
-                            cudaExtentVal));
-#else
-                    hipExtent const hipExtentVal(
-                        make_hipExtent(
-                            static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
-                            static_cast<std::size_t>(extent::getHeight(extent)),
-                            static_cast<std::size_t>(extent::getDepth(extent))));
-
-                    hipPitchedPtr hipPitchedPtrVal = {0};
-
+                    ALPAKA_API_PREFIX(PitchedPtr) pitchedPtrVal;
+                    pitchedPtrVal.ptr = nullptr;
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
                    //FIXME: hcc cannot handle zero-size input
-                    if(hipExtentVal.width!=0
-                        && hipExtentVal.height!=0
-                        && hipExtentVal.depth!=0) {
+                    if(extentVal.width!=0
+                        && extentVal.height!=0
+                        && extentVal.depth!=0)
+#endif
+                    {
                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            ALPAKA_API_PREFIX(SetDevice)(
                                dev.m_iDevice));
                        // Allocate the buffer on this device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMalloc3D(
-                                &hipPitchedPtrVal,
-                                hipExtentVal));
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            ALPAKA_API_PREFIX(Malloc3D)(
+                                &pitchedPtrVal,
+                                extentVal));
                    }
-#endif
-
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                    std::cout << __func__
                        << " ew: " << extent::getWidth(extent)
-                        << " eh: " << cudaExtentVal.height
-                        << " ed: " << cudaExtentVal.depth
-                        << " ewb: " << cudaExtentVal.width
-                        << " ptr: " << cudaPitchedPtrVal.ptr
-                        << " pitch: " << cudaPitchedPtrVal.pitch
-                        << " wb: " << cudaPitchedPtrVal.xsize
-                        << " h: " << cudaPitchedPtrVal.ysize
+                        << " eh: " << extentVal.height
+                        << " ed: " << extentVal.depth
+                        << " ewb: " << extentVal.width
+                        << " ptr: " << pitchedPtrVal.ptr
+                        << " pitch: " << pitchedPtrVal.pitch
+                        << " wb: " << pitchedPtrVal.xsize
+                        << " h: " << pitchedPtrVal.ysize
                        << std::endl;
-    #else
-                    std::cout << __func__
-                        << " ew: " << extent::getWidth(extent)
-                        << " eh: " << hipExtentVal.height
-                        << " ed: " << hipExtentVal.depth
-                        << " ewb: " << hipExtentVal.width
-                        << " ptr: " << hipPitchedPtrVal.ptr
-                        << " pitch: " << hipPitchedPtrVal.pitch
-                        << " wb: " << hipPitchedPtrVal.xsize
-                        << " h: " << hipPitchedPtrVal.ysize
-                        << std::endl;
-    #endif
#endif

-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
                    return
                        mem::buf::BufUniformCudaHipRt<TElem, dim::Dim<TExtent>, TIdx>(
                            dev,
-                            reinterpret_cast<TElem *>(cudaPitchedPtrVal.ptr),
-                            static_cast<TIdx>(cudaPitchedPtrVal.pitch),
+                            reinterpret_cast<TElem *>(pitchedPtrVal.ptr),
+                            static_cast<TIdx>(pitchedPtrVal.pitch),
                            extent);
-#else
-                    return
-                        mem::buf::BufUniformCudaHipRt<TElem, dim::Dim<TExtent>, TIdx>(
-                            dev,
-                            reinterpret_cast<TElem *>(hipPitchedPtrVal.ptr),
-                            static_cast<TIdx>(hipPitchedPtrVal.pitch),
-                            extent);
-#endif
                }
            };
            //#############################################################################
@@ -786,25 +712,14 @@ namespace alpaka
                    // If it is already the same device, nothing has to be mapped.
                    if(dev::getDev(buf) != dev)
                    {
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                        // cudaHostRegisterMapped:
+                        // cuda/hip-HostRegisterMapped:
                        // Maps the allocation into the CUDA/HIP address space. The device pointer to the memory may be obtained by calling cudaHostGetDevicePointer().
                        // This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaHostRegister(
-                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                cudaHostRegisterMapped));
-#else
-                        // hipHostRegisterMapped:
-                        // Maps the allocation into the HIP address space. The device pointer to the memory may be obtained by calling hipHostGetDevicePointer().
-                        // This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipHostRegister(
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            ALPAKA_API_PREFIX(HostRegister)(
                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
                                extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                hipHostRegisterMapped));
-#endif
+                                ALPAKA_API_PREFIX(HostRegisterMapped)));
                    }
                }
            };
@@ -830,15 +745,9 @@ namespace alpaka
                    {
                        // Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.
                        // \FIXME: If the memory has separately been pinned before we destroy the pinning state.
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - ALPAKA_CUDA_RT_CHECK( - cudaHostUnregister( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(HostUnregister)( const_cast(reinterpret_cast(mem::view::getPtrNative(buf))))); -#else - ALPAKA_HIP_RT_CHECK( - hipHostUnregister( - const_cast(reinterpret_cast(mem::view::getPtrNative(buf))))); -#endif } // If it is already the same device, nothing has to be unmapped. } @@ -867,19 +776,13 @@ namespace alpaka { // TODO: Check if the memory is mapped at all! TElem * pDev(nullptr); -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - ALPAKA_CUDA_RT_CHECK( - cudaHostGetDevicePointer( - &pDev, - const_cast(reinterpret_cast(mem::view::getPtrNative(buf))), - 0)); -#else - ALPAKA_HIP_RT_CHECK( - hipHostGetDevicePointer( + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(HostGetDevicePointer)( &pDev, const_cast(reinterpret_cast(mem::view::getPtrNative(buf))), 0)); -#endif + return pDev; } //----------------------------------------------------------------------------- @@ -890,19 +793,13 @@ namespace alpaka { // TODO: Check if the memory is mapped at all! TElem * pDev(nullptr); -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - ALPAKA_CUDA_RT_CHECK( - cudaHostGetDevicePointer( - &pDev, - mem::view::getPtrNative(buf), - 0)); -#else - ALPAKA_HIP_RT_CHECK( - hipHostGetDevicePointer( + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(HostGetDevicePointer)( &pDev, mem::view::getPtrNative(buf), 0)); -#endif + return pDev; } }; diff --git a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp index 7f7c9817611f..3a40b9a4dd8a 100644 --- a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp +++ b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp @@ -74,11 +74,8 @@ namespace alpaka TViewSrc, TExtent> { -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - using MemcpyKind = cudaMemcpyKind; -#else - using MemcpyKind = hipMemcpyKind; -#endif + using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind); + static_assert( !std::is_const::value, "The destination view can not be const!"); @@ -164,11 +161,8 @@ namespace alpaka TViewSrc, TExtent> { -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - using MemcpyKind = cudaMemcpyKind; -#else - using MemcpyKind = hipMemcpyKind; -#endif + using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind); + static_assert( !std::is_const::value, "The destination view can not be const!"); @@ -285,11 +279,8 @@ namespace alpaka TViewSrc, TExtent> { -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - using MemcpyKind = cudaMemcpyKind; -#else - using MemcpyKind = hipMemcpyKind; -#endif + using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind); + static_assert( !std::is_const::value, "The destination view can not be const!"); @@ -431,11 +422,8 @@ namespace alpaka alreadyCheckedPeerAccessDevices.insert(devicePair); int canAccessPeer = 0; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst)); -#else - ALPAKA_HIP_RT_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst)); -#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceCanAccessPeer)(&canAccessPeer, devSrc, devDst)); + if(!canAccessPeer) { #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL std::cout << __func__ @@ -446,19 +434,12 @@ namespace alpaka #endif return; } -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaSetDevice(devSrc)); -#else - ALPAKA_HIP_RT_CHECK(hipSetDevice(devSrc)); -#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(devSrc)); + // NOTE: "until access is explicitly disabled using 
cudaDeviceDisablePeerAccess() or either device is reset using cudaDeviceReset()." // We do not remove a device from the enabled device pairs on cudaDeviceReset. // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to cudaDeviceEnablePeerAccess() is required. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaDeviceEnablePeerAccess(devDst, 0)); -#else - ALPAKA_HIP_RT_CHECK(hipDeviceEnablePeerAccess(devDst, 0)); -#endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceEnablePeerAccess)(devDst, 0)); } } } @@ -506,11 +487,7 @@ namespace alpaka viewDst, viewSrc, extent, -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaMemcpyDeviceToHost, -#else - hipMemcpyDeviceToHost, -#endif + ALPAKA_API_PREFIX(MemcpyDeviceToHost), iDevice, iDevice); } @@ -553,11 +530,7 @@ namespace alpaka viewDst, viewSrc, extent, -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaMemcpyHostToDevice, -#else - hipMemcpyHostToDevice, -#endif + ALPAKA_API_PREFIX(MemcpyHostToDevice), iDevice, iDevice); } @@ -597,11 +570,7 @@ namespace alpaka viewDst, viewSrc, extent, -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaMemcpyDeviceToDevice, -#else - hipMemcpyDeviceToDevice, -#endif + ALPAKA_API_PREFIX(MemcpyDeviceToDevice), dev::getDev(viewDst).m_iDevice, dev::getDev(viewSrc).m_iDevice); } @@ -618,11 +587,7 @@ namespace alpaka typename TViewDst> ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms( mem::view::uniform_cuda_hip::detail::TaskCopyUniformCudaHip, TViewDst, TViewSrc, TExtent> const & task) -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - -> cudaMemcpy3DParms -#else - -> hipMemcpy3DParms -#endif + -> ALPAKA_API_PREFIX(Memcpy3DParms) { ALPAKA_DEBUG_FULL_LOG_SCOPE; @@ -645,63 +610,34 @@ namespace alpaka auto const & srcNativePtr(task.m_srcMemNative); // Fill CUDA/HIP parameter structure. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaMemcpy3DParms cudaMemCpy3DParms; - cudaMemCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. - cudaMemCpy3DParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DParms.srcPtr = - make_cudaPitchedPtr( + ALPAKA_API_PREFIX(Memcpy3DParms) memCpy3DParms; + memCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. + memCpy3DParms.srcPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. + memCpy3DParms.srcPtr = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( const_cast(srcNativePtr), static_cast(srcPitchBytesX), static_cast(srcWidth), static_cast(srcPitchBytesY/srcPitchBytesX)); - cudaMemCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. - cudaMemCpy3DParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes. - cudaMemCpy3DParms.dstPtr = - make_cudaPitchedPtr( + memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. + memCpy3DParms.dstPos = ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes. 
+ memCpy3DParms.dstPtr = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( dstNativePtr, static_cast(dstPitchBytesX), static_cast(dstWidth), static_cast(dstPitchBytesY / dstPitchBytesX)); - cudaMemCpy3DParms.extent = - make_cudaExtent( + memCpy3DParms.extent = + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( static_cast(extentWidthBytes), static_cast(extentHeight), static_cast(extentDepth)); - cudaMemCpy3DParms.kind = task.m_uniformMemCpyKind; - return cudaMemCpy3DParms; - +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_PLATFORM_NVCC__) + memCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(task.m_uniformMemCpyKind); #else - hipMemcpy3DParms hipMemCpy3DParms; - hipMemCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr. - hipMemCpy3DParms.srcPos = make_hipPos(0, 0, 0); // Optional. Offset in bytes. - hipMemCpy3DParms.srcPtr = - make_hipPitchedPtr( - const_cast(srcNativePtr), - static_cast(srcPitchBytesX), - static_cast(srcWidth), - static_cast(srcPitchBytesY/srcPitchBytesX)); - hipMemCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr. - hipMemCpy3DParms.dstPos = make_hipPos(0, 0, 0); // Optional. Offset in bytes. - hipMemCpy3DParms.dstPtr = - make_hipPitchedPtr( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth), - static_cast(dstPitchBytesY/dstPitchBytesX)); - hipMemCpy3DParms.extent = - make_hipExtent( - static_cast(extentWidthBytes), - static_cast(extentHeight), - static_cast(extentDepth)); - #ifdef __HIP_PLATFORM_NVCC__ - hipMemCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(task.m_uniformMemCpyKind); - #else - hipMemCpy3DParms.kind = task.m_uniformMemCpyKind; - #endif - return hipMemCpy3DParms; - + memCpy3DParms.kind = task.m_uniformMemCpyKind; #endif + return memCpy3DParms; } #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) //----------------------------------------------------------------------------- @@ -869,16 +805,15 @@ namespace alpaka auto const & uniformCudaHipMemCpyKind(task.m_uniformMemCpyKind); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) if(iDstDev == iSrcDev) { // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpyAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyAsync)( dstNativePtr, srcNativePtr, static_cast(extentWidthBytes), @@ -890,8 +825,8 @@ namespace alpaka alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpyPeerAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyPeerAsync)( dstNativePtr, iDstDev, srcNativePtr, @@ -899,35 +834,6 @@ namespace alpaka static_cast(extentWidthBytes), queue.m_spQueueImpl->m_UniformCudaHipQueue)); } -#else - if(iDstDev == iSrcDev) - { - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - // Initiate the memory copy. - ALPAKA_HIP_RT_CHECK( - hipMemcpyAsync( - dstNativePtr, - srcNativePtr, - static_cast(extentWidthBytes), - uniformCudaHipMemCpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - // Initiate the memory copy. 
- ALPAKA_HIP_RT_CHECK( - hipMemcpyPeerAsync( - dstNativePtr, - iDstDev, - srcNativePtr, - iSrcDev, - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } -#endif } }; //############################################################################# @@ -964,19 +870,17 @@ namespace alpaka auto const & dstNativePtr(task.m_dstMemNative); auto const & srcNativePtr(task.m_srcMemNative); - -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) if(iDstDev == iSrcDev) { auto const & uniformCudaHipMemCpyKind(task.m_uniformMemCpyKind); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpyAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyAsync)( dstNativePtr, srcNativePtr, static_cast(extentWidthBytes), @@ -988,8 +892,8 @@ namespace alpaka alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpyPeerAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemcpyPeerAsync)( dstNativePtr, iDstDev, srcNativePtr, @@ -997,43 +901,9 @@ namespace alpaka static_cast(extentWidthBytes), queue.m_spQueueImpl->m_UniformCudaHipQueue)); } - ALPAKA_CUDA_RT_CHECK( - cudaStreamSynchronize( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - if(iDstDev == iSrcDev) - { - auto const & uniformCudaHipMemCpyKind(task.m_uniformMemCpyKind); - - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - // Initiate the memory copy. - ALPAKA_HIP_RT_CHECK( - hipMemcpyAsync( - dstNativePtr, - srcNativePtr, - static_cast(extentWidthBytes), - uniformCudaHipMemCpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - else - { - // Initiate the memory copy. - ALPAKA_HIP_RT_CHECK( - hipMemcpyPeerAsync( - dstNativePtr, - iDstDev, - srcNativePtr, - iSrcDev, - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } - - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -1076,71 +946,41 @@ namespace alpaka auto const & dstNativePtr(task.m_dstMemNative); auto const & srcNativePtr(task.m_srcMemNative); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - auto const & cudaMemcpyKind(task.m_uniformMemCpyKind); + + auto const & memcpyKind(task.m_uniformMemCpyKind); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpy2DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy2DAsync)( dstNativePtr, static_cast(dstPitchBytesX), srcNativePtr, static_cast(srcPitchBytesX), static_cast(extentWidthBytes), static_cast(extentHeight), - cudaMemcpyKind, + memcpyKind, queue.m_spQueueImpl->m_UniformCudaHipQueue)); } else { alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync. // Create the struct describing the copy. 
- cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( + ALPAKA_API_PREFIX(Memcpy3DPeerParms) const memCpy3DPeerParms( mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( task)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemcpy3DPeerAsync( - &cudaMemCpy3DPeerParms, + &memCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } -#else - auto const & hipMemCpyKind(task.m_uniformMemCpyKind); - - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - - if(iDstDev != iSrcDev) - { - // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers. - // P2P access has to be enabled to avoid host transfer. - // Checks if devices are connected via PCIe switch and enable P2P access then. - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - } - - ALPAKA_HIP_RT_CHECK( - hipMemcpy2DAsync( - dstNativePtr, - static_cast(dstPitchBytesX), - srcNativePtr, - static_cast(srcPitchBytesX), - static_cast(extentWidthBytes), - static_cast(extentHeight), - hipMemCpyKind, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } #endif + } } }; //############################################################################# @@ -1172,7 +1012,7 @@ namespace alpaka auto const & iDstDev(task.m_iDstDevice); auto const & iSrcDev(task.m_iSrcDevice); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + if(iDstDev == iSrcDev) { auto const & extentWidthBytes(task.m_extentWidthBytes); @@ -1183,80 +1023,43 @@ namespace alpaka auto const & dstNativePtr(task.m_dstMemNative); auto const & srcNativePtr(task.m_srcMemNative); - auto const & cudaMemcpyKind(task.m_uniformMemCpyKind); + auto const & memcpyKind(task.m_uniformMemCpyKind); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpy2DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy2DAsync)( dstNativePtr, static_cast(dstPitchBytesX), srcNativePtr, static_cast(srcPitchBytesX), static_cast(extentWidthBytes), static_cast(extentHeight), - cudaMemcpyKind, + memcpyKind, queue.m_spQueueImpl->m_UniformCudaHipQueue)); } else { alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync. // Create the struct describing the copy. - cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( + cudaMemcpy3DPeerParms const memCpy3DPeerParms( mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( task)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemcpy3DPeerAsync( - &cudaMemCpy3DPeerParms, + &memCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); +#endif } - ALPAKA_CUDA_RT_CHECK( - cudaStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - auto const & extentWidthBytes(task.m_extentWidthBytes); - auto const & extentHeight(task.m_extentHeight); - - auto const & dstPitchBytesX(task.m_dstpitchBytesX); - auto const & srcPitchBytesX(task.m_srcpitchBytesX); - - auto const & dstNativePtr(task.m_dstMemNative); - auto const & srcNativePtr(task.m_srcMemNative); - auto const & hipMemCpyKind(task.m_uniformMemCpyKind); - - // Set the current device. 
- ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - - if(iDstDev != iSrcDev) - { - // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers. - // P2P access has to be enabled to avoid host transfer. - // Checks if devices are connected via PCIe switch and enable P2P access then. - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - } - - ALPAKA_HIP_RT_CHECK( - hipMemcpy2DAsync( - dstNativePtr, - static_cast(dstPitchBytesX), - srcNativePtr, - static_cast(srcPitchBytesX), - static_cast(extentWidthBytes), - static_cast(extentHeight), - hipMemCpyKind, + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); - - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -1288,61 +1091,37 @@ namespace alpaka auto const & iDstDev(task.m_iDstDevice); auto const & iSrcDev(task.m_iSrcDevice); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) if(iDstDev == iSrcDev) { // Create the struct describing the copy. - cudaMemcpy3DParms const uniformCudaHipMemCpy3DParms( + ALPAKA_API_PREFIX(Memcpy3DParms) const uniformCudaHipMemCpy3DParms( mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( task)); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpy3DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy3DAsync)( &uniformCudaHipMemCpy3DParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); } else { alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Create the struct describing the copy. cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( task)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemcpy3DPeerAsync( &cudaMemCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); - } -#else - // Create the struct describing the copy. - hipMemcpy3DParms const hipMemCpy3DParms( - mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( - task)); - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - - if(iDstDev != iSrcDev) - { - // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers. - // P2P access has to be enabled to avoid host transfer. - // Checks if devices are connected via PCIe switch and enable P2P access then. - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - } - - // Initiate the memory copy. - ALPAKA_HIP_RT_CHECK( - hipMemcpy3DAsync( - &hipMemCpy3DParms, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); #endif + } } }; //############################################################################# @@ -1374,66 +1153,40 @@ namespace alpaka auto const & iDstDev(task.m_iDstDevice); auto const & iSrcDev(task.m_iSrcDevice); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) if(iDstDev == iSrcDev) { // Create the struct describing the copy. 
- cudaMemcpy3DParms const uniformCudaHipMemCpy3DParms( + ALPAKA_API_PREFIX(Memcpy3DParms) const uniformCudaHipMemCpy3DParms( mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( task)); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDstDev)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( - cudaMemcpy3DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memcpy3DAsync)( &uniformCudaHipMemCpy3DParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); } else { alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Create the struct describing the copy. cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms( mem::view::uniform_cuda_hip::detail::buildCudaMemcpy3DPeerParms( task)); // Initiate the memory copy. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemcpy3DPeerAsync( &cudaMemCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue)); +#endif } - ALPAKA_CUDA_RT_CHECK( - cudaStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - hipMemcpy3DParms const hipMemCpy3DParms( - mem::view::uniform_cuda_hip::detail::buildUniformCudaHipMemcpy3DParms( - task)); - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDstDev)); - - if(iDstDev != iSrcDev) - { - // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers. - // P2P access has to be enabled to avoid host transfer. - // Checks if devices are connected via PCIe switch and enable P2P access then. - alpaka::mem::view::uniform_cuda_hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev); - } - - // Initiate the memory copy. - ALPAKA_HIP_RT_CHECK( - hipMemcpy3DAsync( - &hipMemCpy3DParms, + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); - - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; } diff --git a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp index aeea742ec38a..72c55064119a 100644 --- a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp +++ b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp @@ -174,31 +174,17 @@ namespace alpaka auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); ALPAKA_ASSERT(extentWidth <= dstWidth); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - iDevice)); - // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( - cudaMemsetAsync( - dstNativePtr, - static_cast(byte), - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( - hipMemsetAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemsetAsync)( dstNativePtr, static_cast(byte), static_cast(extentWidthBytes), queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -246,31 +232,17 @@ namespace alpaka auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(view))); ALPAKA_ASSERT(extentWidth <= dstWidth); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Set the current device. 
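//-----------------------------------------------------------------------------
// A minimal sketch of the same-device 3D branch above in plain CUDA terms; the
// helper name and parameters are assumptions, not the alpaka parameter builder.
#include <cstddef>
#include <cuda_runtime.h>

inline cudaError_t copy3DAsyncSketch(
    void* dst, std::size_t dstPitchBytes,
    void const* src, std::size_t srcPitchBytes,
    std::size_t extentWidthBytes, std::size_t extentHeight, std::size_t extentDepth,
    cudaStream_t queue)
{
    cudaMemcpy3DParms p = {};
    p.dstPtr = make_cudaPitchedPtr(dst, dstPitchBytes, extentWidthBytes, extentHeight);
    p.srcPtr = make_cudaPitchedPtr(const_cast<void*>(src), srcPitchBytes, extentWidthBytes, extentHeight);
    p.extent = make_cudaExtent(extentWidthBytes, extentHeight, extentDepth);
    p.kind = cudaMemcpyDeviceToDevice; // both buffers live on the current device
    return cudaMemcpy3DAsync(&p, queue);
}
//-----------------------------------------------------------------------------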
- ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - iDevice)); - // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( - cudaMemsetAsync( - dstNativePtr, - static_cast(byte), - static_cast(extentWidthBytes), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( - hipMemsetAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(MemsetAsync)( dstNativePtr, static_cast(byte), static_cast(extentWidthBytes), queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif wait::wait(queue); } }; @@ -324,35 +296,19 @@ namespace alpaka ALPAKA_ASSERT(extentWidth <= dstWidth); ALPAKA_ASSERT(extentHeight <= dstHeight); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - iDevice)); - // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( - cudaMemset2DAsync( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(byte), - static_cast(extentWidthBytes), - static_cast(extentHeight), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( - hipMemset2DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset2DAsync)( dstNativePtr, static_cast(dstPitchBytesX), static_cast(byte), static_cast(extentWidthBytes), static_cast(extentHeight), queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -405,15 +361,14 @@ namespace alpaka ALPAKA_ASSERT(extentWidth <= dstWidth); ALPAKA_ASSERT(extentHeight <= dstHeight); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( - cudaMemset2DAsync( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset2DAsync)( dstNativePtr, static_cast(dstPitchBytesX), static_cast(byte), @@ -421,22 +376,6 @@ namespace alpaka static_cast(extentHeight), queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDevice)); - - // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( - hipMemset2DAsync( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(byte), - static_cast(extentWidthBytes), - static_cast(extentHeight), - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif wait::wait(queue); } }; @@ -494,59 +433,31 @@ namespace alpaka ALPAKA_ASSERT(extentHeight <= dstHeight); ALPAKA_ASSERT(extentDepth <= dstDepth); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Fill CUDA parameter structures. - cudaPitchedPtr const cudaPitchedPtrVal( - make_cudaPitchedPtr( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth * static_cast(sizeof(Elem))), - static_cast(dstPitchBytesY / dstPitchBytesX))); - - cudaExtent const cudaExtentVal( - make_cudaExtent( - static_cast(extentWidth * static_cast(sizeof(Elem))), - static_cast(extentHeight), - static_cast(extentDepth))); - - // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( - iDevice)); - // Initiate the memory set. 
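//-----------------------------------------------------------------------------
// The blocking Set variants above enqueue the memset and then wait on the
// queue; a hedged stand-alone sketch of that pattern in plain CUDA terms
// (function and parameter names are assumptions).
#include <cstddef>
#include <cuda_runtime.h>

inline cudaError_t memsetBlockingSketch(
    int iDevice, void* dst, int byte, std::size_t extentWidthBytes, cudaStream_t queue)
{
    // Select the device owning the buffer, enqueue the asynchronous set, then
    // block until the queue has drained, mirroring the trailing wait::wait(queue).
    cudaError_t rc = cudaSetDevice(iDevice);
    if(rc == cudaSuccess)
        rc = cudaMemsetAsync(dst, byte, extentWidthBytes, queue);
    if(rc == cudaSuccess)
        rc = cudaStreamSynchronize(queue);
    return rc;
}
//-----------------------------------------------------------------------------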
- ALPAKA_CUDA_RT_CHECK( - cudaMemset3DAsync( - cudaPitchedPtrVal, - static_cast(byte), - cudaExtentVal, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else // Fill CUDA parameter structures. - hipPitchedPtr const hipPitchedPtrVal( - make_hipPitchedPtr( + ALPAKA_API_PREFIX(PitchedPtr) const pitchedPtrVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( dstNativePtr, static_cast(dstPitchBytesX), static_cast(dstWidth * static_cast(sizeof(Elem))), static_cast(dstPitchBytesY / dstPitchBytesX))); - hipExtent const hipExtentVal( - make_hipExtent( + ALPAKA_API_PREFIX(Extent) const extentVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( static_cast(extentWidth * static_cast(sizeof(Elem))), static_cast(extentHeight), static_cast(extentDepth))); // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( - hipMemset3DAsync( - hipPitchedPtrVal, + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset3DAsync)( + pitchedPtrVal, static_cast(byte), - hipExtentVal, + extentVal, queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif } }; //############################################################################# @@ -603,59 +514,32 @@ namespace alpaka ALPAKA_ASSERT(extentHeight <= dstHeight); ALPAKA_ASSERT(extentDepth <= dstDepth); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Fill CUDA parameter structures. - cudaPitchedPtr const cudaPitchedPtrVal( - make_cudaPitchedPtr( + ALPAKA_API_PREFIX(PitchedPtr) const pitchedPtrVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(PitchedPtr))( dstNativePtr, static_cast(dstPitchBytesX), static_cast(dstWidth * static_cast(sizeof(Elem))), static_cast(dstPitchBytesY / dstPitchBytesX))); - cudaExtent const cudaExtentVal( - make_cudaExtent( + ALPAKA_API_PREFIX(Extent) const extentVal( + ALPAKA_PP_CONCAT(make_,ALPAKA_API_PREFIX(Extent))( static_cast(extentWidth * static_cast(sizeof(Elem))), static_cast(extentHeight), static_cast(extentDepth))); // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( iDevice)); // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( - cudaMemset3DAsync( - cudaPitchedPtrVal, + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(Memset3DAsync)( + pitchedPtrVal, static_cast(byte), - cudaExtentVal, + extentVal, queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#else - // Fill CUDA parameter structures. - hipPitchedPtr const hipPitchedPtrVal( - make_hipPitchedPtr( - dstNativePtr, - static_cast(dstPitchBytesX), - static_cast(dstWidth * static_cast(sizeof(Elem))), - static_cast(dstPitchBytesY / dstPitchBytesX))); - hipExtent const hipExtentVal( - make_hipExtent( - static_cast(extentWidth * static_cast(sizeof(Elem))), - static_cast(extentHeight), - static_cast(extentDepth))); - - // Set the current device. - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - iDevice)); - // Initiate the memory set. 
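//-----------------------------------------------------------------------------
// A minimal sketch of the pitched 3D memset issued above, assuming a CUDA
// build; note that the pitched pointer and the extent both take the width in
// bytes. Names are illustrative assumptions.
#include <cstddef>
#include <cuda_runtime.h>

inline cudaError_t memset3DSketch(
    void* dst, std::size_t dstPitchBytes,
    std::size_t extentWidthBytes, std::size_t extentHeight, std::size_t extentDepth,
    int byte, cudaStream_t queue)
{
    cudaPitchedPtr const pitchedPtrVal = make_cudaPitchedPtr(dst, dstPitchBytes, extentWidthBytes, extentHeight);
    cudaExtent const extentVal = make_cudaExtent(extentWidthBytes, extentHeight, extentDepth);
    return cudaMemset3DAsync(pitchedPtrVal, byte, extentVal, queue);
}
//-----------------------------------------------------------------------------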
- ALPAKA_HIP_RT_CHECK( - hipMemset3DAsync( - hipPitchedPtrVal, - static_cast(byte), - hipExtentVal, - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#endif wait::wait(queue); } }; diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp index 89e911db310a..342771739538 100644 --- a/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -314,21 +314,22 @@ namespace alpaka TExtent const & extent) { TElem* pMemAcc(nullptr); + #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaGetSymbolAddress( reinterpret_cast(&pMemAcc), *pMem)); #else #ifdef __HIP_PLATFORM_NVCC__ - ALPAKA_HIP_RT_CHECK(hipCUDAErrorTohipError( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipCUDAErrorTohipError( cudaGetSymbolAddress( reinterpret_cast(&pMemAcc), *pMem))); #else // FIXME: still does not work in HIP(HCC) (results in hipErrorNotFound) // HIP_SYMBOL(X) not useful because it only does #X on HIP(HCC), while &X on HIP(NVCC) - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipGetSymbolAddress( reinterpret_cast(&pMemAcc), pMem)); diff --git a/include/alpaka/pltf/PltfUniformCudaHipRt.hpp b/include/alpaka/pltf/PltfUniformCudaHipRt.hpp index 9bf24987b7d4..c2a021826754 100644 --- a/include/alpaka/pltf/PltfUniformCudaHipRt.hpp +++ b/include/alpaka/pltf/PltfUniformCudaHipRt.hpp @@ -80,15 +80,10 @@ namespace alpaka ALPAKA_DEBUG_FULL_LOG_SCOPE; int iNumDevices(0); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaError_t error = cudaGetDeviceCount(&iNumDevices); - if(error != cudaSuccess) - iNumDevices = 0; -#else - hipError_t error = hipGetDeviceCount(&iNumDevices); - if(error != hipSuccess) + ALPAKA_API_PREFIX(Error_t) error = ALPAKA_API_PREFIX(GetDeviceCount)(&iNumDevices); + if(error != ALPAKA_API_PREFIX(Success)) iNumDevices = 0; -#endif + return static_cast(iNumDevices); } }; @@ -112,7 +107,7 @@ namespace alpaka if(devIdx >= devCount) { std::stringstream ssErr; - ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " CUDA devices!"; + ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " devices!"; throw std::runtime_error(ssErr.str()); } @@ -124,11 +119,10 @@ namespace alpaka #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) cudaDeviceProp devProp; - ALPAKA_CUDA_RT_CHECK(cudaGetDeviceProperties(&devProp, dev.m_iDevice)); #else - hipDeviceProp_t devProp; - ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(&devProp, dev.m_iDevice)); + hipDeviceProp_t devProp; #endif + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, dev.m_iDevice)); #endif #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL printDeviceProperties(devProp); @@ -155,77 +149,48 @@ namespace alpaka { #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) cudaError rc(cudaSetDevice(static_cast(iDevice))); - - cudaStream_t queue = {}; - // Create a dummy queue to check if the device is already used by an other process. - // cudaSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive". - // \TODO: Check if this workaround is needed! - if(rc == cudaSuccess) - { - rc = cudaStreamCreate(&queue); - } - - if(rc == cudaSuccess) - { - // Destroy the dummy queue. - ALPAKA_CUDA_RT_CHECK( - cudaStreamDestroy( - queue)); - return true; - } - else - { - // Return the previous error from cudaStreamCreate. 
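//-----------------------------------------------------------------------------
// The unified device-count query above maps any runtime error to zero devices;
// a hedged sketch of the same fallback in plain CUDA terms.
#include <cstddef>
#include <cuda_runtime.h>

inline std::size_t getDevCountSketch()
{
    int iNumDevices(0);
    // cudaGetDeviceCount can fail (e.g. no driver installed); report zero
    // devices instead of propagating the error.
    if(cudaGetDeviceCount(&iNumDevices) != cudaSuccess)
        iNumDevices = 0;
    return static_cast<std::size_t>(iNumDevices);
}
//-----------------------------------------------------------------------------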
- ALPAKA_CUDA_RT_CHECK( - rc); - // Reset the Error state. - cudaGetLastError(); - return false; - } - } #else hipError_t rc(hipSetDevice(static_cast<int>(iDevice))); +#endif - hipStream_t queue = {}; + ALPAKA_API_PREFIX(Stream_t) queue = {}; // Create a dummy queue to check if the device is already used by another process. - // hipSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive". + // cuda/hip-SetDevice never returns an error if another process already uses the selected device and the GPU compute mode is set to "process exclusive". // \TODO: Check if this workaround is needed! - if(rc == hipSuccess) + if(rc == ALPAKA_API_PREFIX(Success)) { - rc = hipStreamCreate(&queue); + rc = ALPAKA_API_PREFIX(StreamCreate)(&queue); } - if(rc == hipSuccess) + if(rc == ALPAKA_API_PREFIX(Success)) { // Destroy the dummy queue. - ALPAKA_HIP_RT_CHECK( - hipStreamDestroy( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamDestroy)( queue)); return true; } else { - // Return the previous error from hipStreamCreate. - ALPAKA_HIP_RT_CHECK( + // Return the previous error from cuda/hip-StreamCreate. + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( rc); // Reset the Error state. - hipGetLastError(); - - return false; + ALPAKA_API_PREFIX(GetLastError)(); + return false; } } -#endif - #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL //----------------------------------------------------------------------------- //! Prints all the device properties to std::cout. ALPAKA_FN_HOST static auto printDeviceProperties( #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaDeviceProp const & devProp) + cudaDeviceProp const & devProp #else - hipDeviceProp_t const & devProp) + hipDeviceProp_t const & devProp #endif + ) -> void { ALPAKA_DEBUG_FULL_LOG_SCOPE; diff --git a/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp index 9af657746769..560a8922d9b2 100644 --- a/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp +++ b/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp @@ -72,33 +72,21 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Set the current device. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( - cudaStreamCreateWithFlags( - &m_UniformCudaHipQueue, - cudaStreamNonBlocking)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( m_dev.m_iDevice)); -#endif + // - [cuda/hip]StreamDefault: Default queue creation flag. // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue), // and that the created queue should perform no implicit synchronization with queue 0. // Create the queue on the current device. // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue. // It would be too much work to implement implicit default queue synchronization on CPU.
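//-----------------------------------------------------------------------------
// A self-contained sketch of the dummy-queue probe above, assuming a CUDA
// build: a device in "process exclusive" compute mode only reveals that it is
// taken once the first real resource (here a stream) is created on it. The
// function name is an illustrative assumption.
#include <cuda_runtime.h>

inline bool isDevUsableSketch(int iDevice)
{
    cudaError_t rc = cudaSetDevice(iDevice);
    cudaStream_t queue = {};
    if(rc == cudaSuccess)
        rc = cudaStreamCreate(&queue);
    if(rc == cudaSuccess)
    {
        cudaStreamDestroy(queue); // destroy the dummy queue again
        return true;
    }
    // Reset the sticky error state so later runtime calls do not see this failure.
    cudaGetLastError();
    return false;
}
//-----------------------------------------------------------------------------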
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( - cudaStreamCreateWithFlags( - &m_UniformCudaHipQueue, - cudaStreamNonBlocking)); -#else - ALPAKA_HIP_RT_CHECK( - hipStreamCreateWithFlags( + + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamCreateWithFlags)( &m_UniformCudaHipQueue, - hipStreamNonBlocking)); -#endif + ALPAKA_API_PREFIX(StreamNonBlocking))); } //----------------------------------------------------------------------------- @@ -119,34 +107,23 @@ namespace alpaka // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called, the function will return immediately // and the resources associated with queue will be released automatically once the device has completed all work in queue. // -> No need to synchronize here. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( m_dev.m_iDevice)); - // In case the device is still doing work in the queue when cudaStreamDestroy() is called, the function will return immediately + // In case the device is still doing work in the queue when cuda/hip-StreamDestroy() is called, the function will return immediately // and the resources associated with queue will be released automatically once the device has completed all work in queue. // -> No need to synchronize here. - ALPAKA_CUDA_RT_CHECK( - cudaStreamDestroy( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamDestroy)( m_UniformCudaHipQueue)); -#else - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - m_dev.m_iDevice)); - ALPAKA_HIP_RT_CHECK( - hipStreamDestroy( - m_UniformCudaHipQueue)); -#endif + } public: dev::DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaStream_t m_UniformCudaHipQueue; -#else - hipStream_t m_UniformCudaHipQueue; -#endif + ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; + #if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) int m_callees = 0; std::mutex m_mutex; @@ -271,10 +248,11 @@ namespace alpaka //----------------------------------------------------------------------------- #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - static void CUDART_CB uniformCudaHipRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg) + static void CUDART_CB #else - static void HIPRT_CB uniformCudaHipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg) + static void HIPRT_CB #endif + uniformCudaHipRtCallback(ALPAKA_API_PREFIX(Stream_t) /*queue*/, ALPAKA_API_PREFIX(Error_t) /*status*/, void *arg) { // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished. const auto pCallbackSynchronizationData = reinterpret_cast(arg)->shared_from_this(); @@ -314,19 +292,12 @@ namespace alpaka #endif auto pCallbackSynchronizationData = std::make_shared(); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback( - queue.m_spQueueImpl->m_UniformCudaHipQueue, - uniformCudaHipRtCallback, - pCallbackSynchronizationData.get(), - 0u)); -#else - ALPAKA_HIP_RT_CHECK(hipStreamAddCallback( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamAddCallback)( queue.m_spQueueImpl->m_UniformCudaHipQueue, uniformCudaHipRtCallback, pCallbackSynchronizationData.get(), 0u)); -#endif + // We start a new std::thread which stores the task to be executed. 
// This circumvents the limitation that it is not possible to call CUDA/HIP methods within the CUDA/HIP callback thread. // The CUDA/HIP thread signals the std::thread when it is ready to execute the task. @@ -386,22 +357,16 @@ namespace alpaka -> bool { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Query is allowed even for queues on non current device. - cudaError_t ret = cudaSuccess; - ALPAKA_CUDA_RT_CHECK_IGNORE( - ret = cudaStreamQuery(queue.m_spQueueImpl->m_UniformCudaHipQueue), - cudaErrorNotReady); - return (ret == cudaSuccess); -#elif BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + +#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) return (queue.m_spQueueImpl->m_callees==0); #else - hipError_t ret = hipSuccess; - ALPAKA_HIP_RT_CHECK_IGNORE( - ret = hipStreamQuery( - queue.m_spQueueImpl->m_UniformCudaHipQueue), - hipErrorNotReady); - return (ret == hipSuccess); + // Query is allowed even for queues on non current device. + ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( + ret = ALPAKA_API_PREFIX(StreamQuery)(queue.m_spQueueImpl->m_UniformCudaHipQueue), + ALPAKA_API_PREFIX(ErrorNotReady)); + return (ret == ALPAKA_API_PREFIX(Success)); #endif } }; @@ -426,18 +391,13 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Sync is allowed even for queues on non current device. - ALPAKA_CUDA_RT_CHECK(cudaStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); - -#elif BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) +#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) while(queue.m_spQueueImpl->m_callees>0) { std::this_thread::sleep_for(std::chrono::milliseconds(10u)); } #else // Sync is allowed even for queues on non current device. - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); #endif } diff --git a/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp index fd91ced2e6b0..8316bdfd4477 100644 --- a/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp +++ b/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp @@ -79,29 +79,15 @@ namespace alpaka // Create the queue on the current device. // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue. // It would be too much work to implement implicit default queue synchronization on CPU. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // Set the current device. - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( m_dev.m_iDevice)); - ALPAKA_CUDA_RT_CHECK( - cudaStreamCreateWithFlags( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamCreateWithFlags)( &m_UniformCudaHipQueue, - cudaStreamNonBlocking)); -#else - // Set the current device. 
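//-----------------------------------------------------------------------------
// The Empty trait above treats ErrorNotReady as "work still pending" rather
// than a failure; a minimal sketch of that idiom, assuming a CUDA build.
#include <cuda_runtime.h>

inline bool queueEmptySketch(cudaStream_t queue)
{
    cudaError_t const ret = cudaStreamQuery(queue);
    if(ret == cudaErrorNotReady)
        return false; // the queue still holds work; this is not an error
    // Any other non-success code is a genuine error (the real code routes it
    // through its error-check macro); the queue counts as empty only on success.
    return (ret == cudaSuccess);
}
//-----------------------------------------------------------------------------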
- ALPAKA_HIP_RT_CHECK( - hipSetDevice( - m_dev.m_iDevice)); - - ALPAKA_HIP_RT_CHECK( - hipStreamCreateWithFlags( - &m_UniformCudaHipQueue, - hipStreamNonBlocking)); -#endif - - + ALPAKA_API_PREFIX(StreamNonBlocking))); } //----------------------------------------------------------------------------- QueueUniformCudaHipRtNonBlockingImpl(QueueUniformCudaHipRtNonBlockingImpl const &) = delete; @@ -116,44 +102,29 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - // Set the current device. \TODO: Is setting the current device before cudaStreamDestroy required? + // Set the current device. \TODO: Is setting the current device before cuda/hip-StreamDestroy required? // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called, the function will return immediately // and the resources associated with queue will be released automatically once the device has completed all work in queue. // -> No need to synchronize here. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - - ALPAKA_CUDA_RT_CHECK( - cudaSetDevice( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(SetDevice)( m_dev.m_iDevice)); - ALPAKA_CUDA_RT_CHECK( - cudaStreamDestroy( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + ALPAKA_API_PREFIX(StreamDestroy)( m_UniformCudaHipQueue)); -#else - - ALPAKA_HIP_RT_CHECK( - hipSetDevice( - m_dev.m_iDevice)); - - ALPAKA_HIP_RT_CHECK( - hipStreamDestroy( - m_UniformCudaHipQueue)); -#endif } public: dev::DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to. -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - cudaStream_t m_UniformCudaHipQueue; -#else - hipStream_t m_UniformCudaHipQueue; - #if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) + ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; +#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) int m_callees = 0; std::mutex m_mutex; - #endif #endif + }; } } @@ -282,11 +253,11 @@ namespace alpaka //----------------------------------------------------------------------------- #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - static void CUDART_CB uniformCudaHipRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg) + static void CUDART_CB #else - static void HIPRT_CB uniformCudaHipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg) + static void HIPRT_CB #endif - + uniformCudaHipRtCallback(ALPAKA_API_PREFIX(Stream_t) /*queue*/, ALPAKA_API_PREFIX(Error_t) /*status*/, void *arg) { // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished. const auto pCallbackSynchronizationData = reinterpret_cast(arg)->shared_from_this(); @@ -334,19 +305,12 @@ namespace alpaka } #endif auto pCallbackSynchronizationData = std::make_shared(); -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamAddCallback)( queue.m_spQueueImpl->m_UniformCudaHipQueue, uniformCudaHipRtCallback, pCallbackSynchronizationData.get(), 0u)); -#else - ALPAKA_HIP_RT_CHECK(hipStreamAddCallback( - queue.m_spQueueImpl->m_UniformCudaHipQueue, - uniformCudaHipRtCallback, - pCallbackSynchronizationData.get(), - 0u)); -#endif + // We start a new std::thread which stores the task to be executed. // This circumvents the limitation that it is not possible to call CUDA methods within the CUDA/HIP callback thread. 
// The CUDA/HIP thread signals the std::thread when it is ready to execute the task. @@ -406,25 +370,17 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Query is allowed even for queues on non current device. - cudaError_t ret = cudaSuccess; - ALPAKA_CUDA_RT_CHECK_IGNORE( - ret = cudaStreamQuery( - queue.m_spQueueImpl->m_UniformCudaHipQueue), - cudaErrorNotReady); - return (ret == cudaSuccess); -#elif BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) +#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) return (queue.m_spQueueImpl->m_callees==0); #else // Query is allowed even for queues on non current device. - hipError_t ret = hipSuccess; - ALPAKA_HIP_RT_CHECK_IGNORE( - ret = hipStreamQuery( + ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE( + ret = ALPAKA_API_PREFIX(StreamQuery)( queue.m_spQueueImpl->m_UniformCudaHipQueue), - hipErrorNotReady); - return (ret == hipSuccess); + ALPAKA_API_PREFIX(ErrorNotReady)); + return (ret == ALPAKA_API_PREFIX(Success)); #endif } @@ -450,18 +406,13 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - // Sync is allowed even for queues on non current device. - ALPAKA_CUDA_RT_CHECK( - cudaStreamSynchronize( - queue.m_spQueueImpl->m_UniformCudaHipQueue)); -#elif BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) +#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking) while(queue.m_spQueueImpl->m_callees>0) { std::this_thread::sleep_for(std::chrono::milliseconds(10u)); } #else // Sync is allowed even for queues on non current device. - ALPAKA_HIP_RT_CHECK( hipStreamSynchronize( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( ALPAKA_API_PREFIX(StreamSynchronize)( queue.m_spQueueImpl->m_UniformCudaHipQueue)); #endif } diff --git a/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp b/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp index da57d1efc2f6..604438ec12e6 100644 --- a/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp +++ b/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp @@ -402,16 +402,16 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Set the current device. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaSetDevice( m_dev.m_iDevice)); // Allocate the buffer on this device. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMalloc( &m_devMem, static_cast(sizeof(int32_t)))); // Initiate the memory set. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemset( m_devMem, static_cast(0u), @@ -431,11 +431,11 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Set the current device. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaSetDevice( m_dev.m_iDevice)); // Free the buffer. - ALPAKA_CUDA_RT_CHECK(cudaFree(m_devMem)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaFree(m_devMem)); } //----------------------------------------------------------------------------- @@ -445,11 +445,11 @@ namespace alpaka m_bIsReady = true; // Set the current device. - ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaSetDevice( m_dev.m_iDevice)); // Initiate the memory set. 
- ALPAKA_CUDA_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( cudaMemset( m_devMem, static_cast(1u), @@ -709,16 +709,16 @@ namespace alpaka ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; // Set the current device. - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipSetDevice( m_dev.m_iDevice)); // Allocate the buffer on this device. - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipMalloc( &m_devMem, static_cast(sizeof(int32_t)))); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipMemset( m_devMem, static_cast(0u), @@ -737,11 +737,11 @@ namespace alpaka { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipSetDevice( m_dev.m_iDevice)); // Free the buffer. - ALPAKA_HIP_RT_CHECK(hipFree(m_devMem)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipFree(m_devMem)); } //----------------------------------------------------------------------------- @@ -751,11 +751,11 @@ namespace alpaka m_bIsReady = true; // Set the current device. - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipSetDevice( m_dev.m_iDevice)); // Initiate the memory set. - ALPAKA_HIP_RT_CHECK( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( hipMemset( m_devMem, static_cast(1u), @@ -930,11 +930,11 @@ namespace alpaka std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not available.\n"; #endif while(hostMem<0x01010101u) { - ALPAKA_HIP_RT_CHECK(hipMemcpyDtoHAsync(&hostMem, + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpyDtoHAsync(&hostMem, reinterpret_cast(event.m_spEventImpl->m_devMem), sizeof(int32_t), queue.m_spQueueImpl->m_UniformCudaHipQueue)); - ALPAKA_HIP_RT_CHECK(hipStreamSynchronize(queue.m_spQueueImpl->m_UniformCudaHipQueue)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamSynchronize(queue.m_spQueueImpl->m_UniformCudaHipQueue)); } } }; @@ -971,7 +971,7 @@ namespace alpaka // the device build upon value-based HIP queue synchronization APIs such as // cuStreamWaitValue32() and cuStreamWriteValue32(). #if BOOST_COMP_NVCC - ALPAKA_HIP_RT_CHECK(hipCUResultTohipError( + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipCUResultTohipError( cuStreamWaitValue32( static_cast(queue.m_spQueueImpl->m_UniformCudaHipQueue), reinterpret_cast(event.m_spEventImpl->m_devMem), @@ -982,7 +982,7 @@ namespace alpaka std::uint32_t hmem = 0; do { std::this_thread::sleep_for(std::chrono::milliseconds(10u)); - ALPAKA_HIP_RT_CHECK(hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault)); } while(hmem < 0x01010101u); #endif
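//-----------------------------------------------------------------------------
// Where hipStreamWaitValue32 is unavailable, the code above falls back to
// polling a device-side 32-bit flag from the host; a hedged sketch of that
// loop in plain CUDA terms (the 0x01010101 sentinel follows the code above,
// the function name is an assumption).
#include <cstdint>
#include <cuda_runtime.h>

inline void waitForDeviceFlagSketch(std::uint32_t const* devMem, cudaStream_t queue)
{
    std::uint32_t hostMem = 0u;
    while(hostMem < 0x01010101u)
    {
        // Copy the current flag value back and drain the queue before re-checking.
        cudaMemcpyAsync(&hostMem, devMem, sizeof(std::uint32_t), cudaMemcpyDeviceToHost, queue);
        cudaStreamSynchronize(queue);
    }
}
//-----------------------------------------------------------------------------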