Skip to content

Commit

Permalink
CUDA 11.2+ features are used via dlopen
Browse files Browse the repository at this point in the history
By binding to the cudart 11.2 functions at runtime we remove
the requirement that these symbols exist at link/load time, thereby
allowing RMM to be compiled with CUDA 11.2+ and used with 11.0 or 11.1.
  • Loading branch information
robertmaynard committed Mar 9, 2022
1 parent f31cd8b commit 74372fa
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 11 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
105 changes: 105 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <memory>
#include <string>

namespace rmm::detail {

/**
 * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
 *
 * By loading the cudart library at runtime we can use functions that
 * are added in newer minor versions of the cuda runtime.
 */
struct dynamic_load_runtime {
  // Closes the dlopen handle when the owning unique_ptr is destroyed.
  static constexpr auto dlclose_destructor = [](void* handle) { ::dlclose(handle); };
  // Process-wide handle to libcudart; opened lazily on first use and kept
  // open for the lifetime of the program.
  inline static std::unique_ptr<void, decltype(dlclose_destructor)> cuda_runtime_lib{
    nullptr, dlclose_destructor};

  /**
   * @brief Open the CUDA runtime shared library if it is not already open.
   *
   * Tries the versioned SONAME first (e.g. "libcudart.so.11.0") and falls
   * back to the unversioned "libcudart.so".
   *
   * @return true if a library handle is available, false otherwise.
   */
  static bool open_cuda_runtime()
  {
    if (!cuda_runtime_lib) {
      ::dlerror();  // clear any stale dlerror() state before we dlopen
      // CUDART_VERSION encodes major*1000 + minor*10; CUDA 11.x cudart
      // ships with SONAME "libcudart.so.<major>.0".
      const int major               = CUDART_VERSION / 1000;
      const std::string libname_ver = "libcudart.so." + std::to_string(major) + ".0";
      const std::string libname     = "libcudart.so";

      auto ptr = ::dlopen(libname_ver.c_str(), RTLD_LAZY);
      // BUGFIX: capture the fallback handle. The original discarded the
      // return value of the second dlopen, so the fallback path always
      // reported failure and leaked the handle when it did succeed.
      if (!ptr) { ptr = ::dlopen(libname.c_str(), RTLD_LAZY); }
      if (!ptr) { return false; }

      cuda_runtime_lib.reset(ptr);
    }
    return true;
  }

  // Pointer-to-function type `cudaError_t (*)(Args...)`, matching the
  // uniform return type of the cudart entry points we resolve.
  template <typename... Args>
  using function_return_type = std::add_pointer_t<cudaError_t(Args...)>;

  /**
   * @brief Look up a cudart entry point by name.
   *
   * @param func_name Unmangled symbol name of the cudart function.
   * @return The function pointer, or nullptr when the library cannot be
   * opened or the symbol is absent (e.g. an 11.2+ API on an older runtime).
   */
  template <typename... Args>
  static function_return_type<Args...> function(const char* func_name)
  {
    if (!open_cuda_runtime()) { return nullptr; }
    auto* handle = ::dlsym(cuda_runtime_lib.get(), func_name);
    if (!handle) { return nullptr; }
    return reinterpret_cast<function_return_type<Args...>>(handle);
  }
};


/**
 * @brief `async_alloc` binds to the Stream Ordered Memory Allocator functions
 * at runtime.
 *
 * This allows RMM users to compile/link against CUDA 11.2+ and run with a
 * < CUDA 11.2 runtime, as these functions are resolved at call time.
 */
struct async_alloc {

  /**
   * @brief Whether the stream-ordered allocator APIs are usable.
   *
   * Static cudart build: decided at compile time from CUDART_VERSION.
   * Dynamic cudart build: decided by probing the loaded runtime library
   * for the `cudaFreeAsync` symbol (present only in cudart 11.2+).
   */
  static bool is_supported()
  {
#if defined(RMM_STATIC_CUDART)
    static bool has_support = (CUDART_VERSION >= 11020);
#else
    static bool has_support = dynamic_load_runtime::function<void*>("cudaFreeAsync") != nullptr;
#endif
    return has_support;
  }

  // Generates a static forwarding wrapper for the cudart function `name`.
  //  - Static cudart: forwards directly to the statically linked ::name.
  //  - Dynamic cudart: resolves the symbol once via dlsym (cached in a
  //    function-local static) and calls through the cached pointer.
  // NOTE(review): the dynamic wrapper calls `func` without a null check;
  // callers must ensure is_supported() returned true first, otherwise this
  // dereferences a null function pointer on pre-11.2 runtimes — confirm
  // all call sites guard on is_supported().
#if defined(RMM_STATIC_CUDART)
#define RMM_SYNC_ALLOC_WRAPPER(name)\
  template <typename... Args> \
  static cudaError_t name(Args... args) { \
    return ::name(args...); \
  }
#else
#define RMM_SYNC_ALLOC_WRAPPER(name)\
  template <typename... Args> \
  static cudaError_t name(Args... args) { \
    static const auto func = dynamic_load_runtime::function<Args...>(#name); \
    return func(args...); \
  }
#endif

  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolCreate);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolSetAttribute);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolDestroy);
  RMM_SYNC_ALLOC_WRAPPER(cudaMallocFromPoolAsync);
  RMM_SYNC_ALLOC_WRAPPER(cudaFreeAsync);
};
} // namespace rmm::detail
21 changes: 10 additions & 11 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -71,14 +72,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand Down Expand Up @@ -118,12 +119,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto runtime_supports_pool = rmm::detail::async_alloc::is_supported();
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
Expand Down Expand Up @@ -172,7 +168,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -191,7 +188,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down

0 comments on commit 74372fa

Please sign in to comment.