Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CUDA 11.2+ features via dlopen #990

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8944f92
CUDA 11.2+ features are used via dlopen
robertmaynard Mar 9, 2022
838d115
Remove cuda_async_memory_resource::is_supported
robertmaynard Mar 9, 2022
452b998
Give the typedef used by `function` a better name
robertmaynard Mar 9, 2022
f05cf86
Consistently use 'auto *'
robertmaynard Mar 9, 2022
b070025
Refactor 'open_cuda_runtime' to be thread safe
robertmaynard Mar 10, 2022
b598499
Remove an unneeded check from dynamic_load_runtime
robertmaynard Mar 10, 2022
b3ff2d8
Correct style issues found by CI
robertmaynard Mar 10, 2022
2b3a2b1
Refactor 'open_cuda_runtime' to 'get_cuda_runtime_handle'
robertmaynard Mar 10, 2022
da02b55
Ensure we call the correct cudart functions
robertmaynard Mar 14, 2022
9d228b9
Correct build failures
robertmaynard Mar 14, 2022
8780a93
verify RMM_SYNC_ALLOC_WRAPPER signature in static code path
robertmaynard Mar 14, 2022
9bfb83c
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
2424da3
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
0a32c5a
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
49df39e
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
80e5316
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
8463e0f
Rename RMM_SYNC_ALLOC_WRAPPER to RMM_CUDART_API_WRAPPER
robertmaynard Mar 15, 2022
ff9ea91
fix function_sig typo
robertmaynard Mar 15, 2022
a3a2cf5
Ensure we call cudaMemPoolDestroy via async_alloc
robertmaynard Mar 15, 2022
baf618b
dynamic_load_runtime now returns std::optional<function_ptr>
robertmaynard Mar 15, 2022
132305e
dynamic_load_runtime now returns std::optional<function_ptr>
robertmaynard Mar 15, 2022
2ff56b7
Correct style issues found by CI
robertmaynard Mar 15, 2022
207cfae
return empty optional instead of optional(nullptr)
robertmaynard Mar 15, 2022
13ef5ea
Correct std::optional issues found by review
robertmaynard Mar 16, 2022
2ddc6fd
Correct style issues found by ci
robertmaynard Mar 16, 2022
76b8db5
Merge branch 'branch-22.04' into cuda_async_memory_resource-dlopen-cu…
robertmaynard Mar 16, 2022
1aad91c
Merge branch 'branch-22.04' into cuda_async_memory_resource-dlopen-cu…
robertmaynard Mar 17, 2022
1994bc4
opportunistic reuse bug now updated to use `rmm::detail::async_alloc`
robertmaynard Mar 17, 2022
48e10d6
Use static_assert to validate function arguments are valid
robertmaynard Mar 17, 2022
86ef0f9
remove indirect function call when building statically
robertmaynard Mar 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
116 changes: 116 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <cassert>
#include <memory>
#include <string>
#include <type_traits>

namespace rmm::detail {

/**
* @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
*
* By loading the cudart library at runtime we can use functions that
* are added in newer minor versions of the cuda runtime.
*/
struct dynamic_load_runtime {
static void* open_cuda_runtime()
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
{
auto close_cudart = [](void* handle) { ::dlclose(handle); };
auto open_cudart = []() {
::dlerror();
const int major = CUDART_VERSION / 1000;
const std::string libname_ver = "libcudart.so." + std::to_string(major) + ".0";
const std::string libname = "libcudart.so";

auto ptr = ::dlopen(libname_ver.c_str(), RTLD_LAZY);
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
if (!ptr) { ptr = ::dlopen(libname.c_str(), RTLD_LAZY); }
if (ptr) { return ptr; }

RMM_FAIL("Unable to dlopen cudart");
};
static std::unique_ptr<void, decltype(close_cudart)> cudart_handle{open_cudart(), close_cudart};
return cudart_handle.get();
}

template <typename... Args>
using cudart_func_ptr = std::add_pointer_t<cudaError_t(Args...)>;

template <typename... Args>
static cudart_func_ptr<Args...> function(const char* func_name)
{
auto* runtime = open_cuda_runtime();
if (!runtime) { return nullptr; }
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
auto* handle = ::dlsym(runtime, func_name);
if (!handle) { return nullptr; }
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
auto* function_ptr = reinterpret_cast<cudart_func_ptr<Args...>>(handle);
return function_ptr;
}
};

#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
/**
 * @brief `async_alloc` binds to the Stream Ordered Memory Allocator functions
 * at runtime.
 *
 * This allows rmm users to compile/link against CUDA 11.2+ and run with
 * < CUDA 11.2 runtime as these functions are found at call time.
 */
struct async_alloc {
  /**
   * @brief Is the stream-ordered allocator usable on this machine?
   *
   * Requires both a runtime that provides the 11.2 entry points and a
   * driver/device that reports memory-pool support. Both probes are cached
   * in function-local statics, so the work happens once.
   */
  static bool is_supported()
  {
#if defined(RMM_STATIC_CUDART)
    // Statically linked cudart: the symbols are baked in, so the
    // compile-time version check is sufficient.
    static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
#else
    // Probe the dynamically loaded runtime for one of the 11.2 entry points,
    // using the real signature cudaError_t cudaFreeAsync(void*, cudaStream_t).
    static bool runtime_supports_pool =
      dynamic_load_runtime::function<void*, cudaStream_t>("cudaFreeAsync") != nullptr;
#endif

    static auto driver_supports_pool{[] {
      int cuda_pool_supported{};
      auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                           cudaDevAttrMemoryPoolsSupported,
                                           rmm::detail::current_device().value());
      return result == cudaSuccess and cuda_pool_supported == 1;
    }()};
    return runtime_supports_pool and driver_supports_pool;
  }

#if defined(RMM_STATIC_CUDART)
// Call the statically linked cudart entry point directly.
#define RMM_SYNC_ALLOC_WRAPPER(name)    \
  template <typename... Args>           \
  static cudaError_t name(Args... args) \
  {                                     \
    return ::name(args...);             \
  }
#else
// Resolve the entry point once via dlsym and cache it. Callers must check
// is_supported() first; the assert guards against invoking a null pointer
// when the installed runtime lacks the symbol.
#define RMM_SYNC_ALLOC_WRAPPER(name)                                         \
  template <typename... Args>                                                \
  static cudaError_t name(Args... args)                                      \
  {                                                                          \
    static const auto func = dynamic_load_runtime::function<Args...>(#name); \
    assert(func != nullptr);                                                 \
    return func(args...);                                                    \
  }
#endif

  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolCreate);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolSetAttribute);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolDestroy);
  RMM_SYNC_ALLOC_WRAPPER(cudaMallocFromPoolAsync);
  RMM_SYNC_ALLOC_WRAPPER(cudaFreeAsync);
};
#endif
} // namespace rmm::detail
42 changes: 10 additions & 32 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -62,7 +63,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
// Check if cudaMallocAsync Memory pool supported
RMM_EXPECTS(is_supported(),
RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
"cudaMallocAsync not supported with this CUDA driver/runtime version");

// Construct explicit pool
Expand All @@ -71,14 +72,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand Down Expand Up @@ -111,32 +112,6 @@ class cuda_async_memory_resource final : public device_memory_resource {
cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;

/**
* @brief Is cudaMallocAsync supported with this cuda runtime/driver version?
* @return true if both the cuda runtime and driver are newer than 11.2
*/
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
#else
return false;
#endif
}

/**
* @brief Query whether the resource supports use of non-null CUDA streams for
* allocation/deallocation. `cuda_memory_resource` does not support streams.
Expand Down Expand Up @@ -172,7 +147,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -191,7 +167,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/cuda_async_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AsyncMRTest : public ::testing::Test {
protected:
void SetUp() override
{
if (!rmm::mr::cuda_async_memory_resource::is_supported()) {
if (!rmm::detail::async_alloc::is_supported()) {
GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
<< "driver/runtime version";
}
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/mr_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>

inline auto make_cuda_async()
{
if (rmm::mr::cuda_async_memory_resource::is_supported()) {
if (rmm::detail::async_alloc::is_supported()) {
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
Expand Down