Skip to content

Commit

Permalink
Use CUDA 11.2+ features via dlopen (#990)
Browse files Browse the repository at this point in the history
By binding to the cudart 11.2 functions at runtime we remove the requirement that these symbols exist at load time, thereby allowing
RMM to be compiled with CUDA 11.2+ and used with CUDA 11.0 or 11.1.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Rong Ou (https://github.com/rongou)
  - Mark Harris (https://github.com/harrism)

URL: #990
  • Loading branch information
robertmaynard authored Mar 18, 2022
1 parent b21734e commit 7434bd6
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 37 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
# When cudart is linked statically, all 11.2+ symbols are guaranteed to be
# resolved at link time, so define RMM_STATIC_CUDART to let the headers call
# the runtime directly instead of going through the dlopen-based lookup.
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
# Dynamic cudart: symbols newer than the runtime found at execution time may
# be absent, so the headers resolve 11.2+ entry points via dlopen/dlsym.
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
140 changes: 140 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <memory>
#include <optional>
#include <string>

namespace rmm::detail {

/**
* @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
*
* By loading the cudart library at runtime we can use functions that
* are added in newer minor versions of the cuda runtime.
*/
struct dynamic_load_runtime {
  /**
   * @brief Return a process-wide handle to the dynamically loaded cudart library.
   *
   * The library is opened exactly once (function-local static) and kept open for
   * the lifetime of the process; the unique_ptr's deleter dlcloses it at exit.
   *
   * @throws if no cudart shared library can be opened (via RMM_FAIL).
   */
  static void* get_cuda_runtime_handle()
  {
    auto do_close = [](void* lib) { ::dlclose(lib); };
    auto do_open  = []() {
      ::dlerror();  // clear any stale dlerror state before we start
      int const major_version = CUDART_VERSION / 1000;
      // NOTE(review): every CUDA 11.x toolkit ships the runtime with soname
      // libcudart.so.11.0, hence the hard-coded ".0" minor — confirm before
      // reusing this pattern for future major versions.
      std::string const versioned_name = "libcudart.so." + std::to_string(major_version) + ".0";
      std::string const plain_name     = "libcudart.so";

      void* lib = ::dlopen(versioned_name.c_str(), RTLD_LAZY);
      if (lib == nullptr) { lib = ::dlopen(plain_name.c_str(), RTLD_LAZY); }
      if (lib == nullptr) { RMM_FAIL("Unable to dlopen cudart"); }
      return lib;
    };
    static std::unique_ptr<void, decltype(do_close)> cudart{do_open(), do_close};
    return cudart.get();
  }

  // Pointer-to-function type for a cudart entry point taking Args... and
  // returning cudaError_t.
  template <typename... Args>
  using function_sig = std::add_pointer_t<cudaError_t(Args...)>;

  /**
   * @brief Look up @p func_name in the loaded cudart.
   *
   * @return the function pointer, or std::nullopt when the symbol is absent
   * (e.g. running against an older runtime than we were compiled with).
   */
  template <typename signature>
  static std::optional<signature> function(const char* func_name)
  {
    void* lib    = get_cuda_runtime_handle();
    void* symbol = ::dlsym(lib, func_name);
    if (symbol == nullptr) { return std::nullopt; }
    return std::optional<signature>{reinterpret_cast<signature>(symbol)};
  }
};

#if defined(RMM_STATIC_CUDART)
// Static cudart: the symbol must exist at link time, so call it directly and
// static_assert that the address is non-null (taking the address of a function
// is never null, hence the -Waddress suppression).
//
// BUGFIX: `#name` / `#signature` previously appeared INSIDE the string
// literals, where the preprocessor does not stringify them — the diagnostics
// literally read "Failed to find #name ...". Proper stringification with
// adjacent-literal concatenation fixes the messages.
// clang-format off
#define RMM_CUDART_API_WRAPPER(name, signature)                                        \
  template <typename... Args>                                                          \
  static cudaError_t name(Args... args)                                                \
  {                                                                                    \
    _Pragma("GCC diagnostic push")                                                     \
    _Pragma("GCC diagnostic ignored \"-Waddress\"")                                    \
    static_assert(static_cast<signature>(::name),                                      \
                  "Failed to find " #name " function with arguments " #signature);     \
    _Pragma("GCC diagnostic pop")                                                      \
    return ::name(args...);                                                            \
  }
// clang-format on
#else
// Dynamic cudart: resolve the symbol once (function-local static) via
// dlopen/dlsym; fail loudly if the running cudart is too old to provide it.
#define RMM_CUDART_API_WRAPPER(name, signature)                                  \
  template <typename... Args>                                                    \
  static cudaError_t name(Args... args)                                          \
  {                                                                              \
    static const auto func = dynamic_load_runtime::function<signature>(#name);   \
    if (func) { return (*func)(args...); }                                       \
    RMM_FAIL("Failed to find " #name " function in libcudart.so");               \
  }
#endif

#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync
/**
 * @brief Bind to the stream-ordered memory allocator functions
 * at runtime.
 *
 * This allows RMM users to compile/link against CUDA 11.2+ and run with
 * < CUDA 11.2 runtime as these functions are found at call time.
 */
struct async_alloc {
// Returns true only when BOTH the cudart in use provides the 11.2 pool API
// and the driver reports memory-pool support on the current device.
// Both probes are cached in function-local statics, so the (possibly
// expensive) dlsym / cudaDeviceGetAttribute calls happen at most once.
static bool is_supported()
{
#if defined(RMM_STATIC_CUDART)
// Statically linked cudart: symbol availability is a compile-time fact.
static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
#else
// Probe cudaFreeAsync as a proxy for the whole stream-ordered allocator
// API — presumably all 11.2 pool entry points ship together; TODO confirm.
static bool runtime_supports_pool =
dynamic_load_runtime::function<dynamic_load_runtime::function_sig<void*, cudaStream_t>>(
"cudaFreeAsync")
.has_value();
#endif

// Driver-side check: the device attribute is 1 only when the installed
// driver supports memory pools on the current device. A failed query
// (result != cudaSuccess) is treated as "not supported" rather than an error.
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
}

// Pointer-to-function signature helper for the wrappers below.
template <typename... Args>
using cudart_sig = dynamic_load_runtime::function_sig<Args...>;

// Each RMM_CUDART_API_WRAPPER expands to a static member function with the
// same name as the cudart entry point; it either calls ::name directly
// (RMM_STATIC_CUDART) or dispatches through a dlsym-resolved pointer.

using cudaMemPoolCreate_sig = cudart_sig<cudaMemPool_t*, const cudaMemPoolProps*>;
RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig);

using cudaMemPoolSetAttribute_sig = cudart_sig<cudaMemPool_t, cudaMemPoolAttr, void*>;
RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig);

using cudaMemPoolDestroy_sig = cudart_sig<cudaMemPool_t>;
RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig);

using cudaMallocFromPoolAsync_sig = cudart_sig<void**, size_t, cudaMemPool_t, cudaStream_t>;
RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig);

using cudaFreeAsync_sig = cudart_sig<void*, cudaStream_t>;
RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig);
};
#endif

#undef RMM_CUDART_API_WRAPPER
} // namespace rmm::detail
48 changes: 13 additions & 35 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -62,7 +63,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
// Check if cudaMallocAsync Memory pool supported
RMM_EXPECTS(is_supported(),
RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
"cudaMallocAsync not supported with this CUDA driver/runtime version");

// Construct explicit pool
Expand All @@ -71,7 +72,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

// CUDA drivers before 11.5 have known incompatibilities with the async allocator.
// We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5.
Expand All @@ -81,16 +82,16 @@ class cuda_async_memory_resource final : public device_memory_resource {
constexpr auto min_async_version{11050};
if (driver_version < min_async_version) {
int disabled{0};
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolReuseAllowOpportunistic, &disabled));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolReuseAllowOpportunistic, &disabled));
}

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand All @@ -115,40 +116,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
~cuda_async_memory_resource() override
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle()));
#endif
}
cuda_async_memory_resource(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource(cuda_async_memory_resource&&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;

/**
* @brief Is cudaMallocAsync supported with this cuda runtime/driver version?
* @return true if both the cuda runtime and driver are newer than 11.2
*/
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
#else
return false;
#endif
}

/**
* @brief Query whether the resource supports use of non-null CUDA streams for
* allocation/deallocation. `cuda_memory_resource` does not support streams.
Expand Down Expand Up @@ -184,7 +159,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -203,7 +179,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/cuda_async_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AsyncMRTest : public ::testing::Test {
protected:
void SetUp() override
{
if (!rmm::mr::cuda_async_memory_resource::is_supported()) {
if (!rmm::detail::async_alloc::is_supported()) {
GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
<< "driver/runtime version";
}
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/mr_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>

inline auto make_cuda_async()
{
if (rmm::mr::cuda_async_memory_resource::is_supported()) {
if (rmm::detail::async_alloc::is_supported()) {
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
Expand Down

0 comments on commit 7434bd6

Please sign in to comment.