Skip to content

Commit

Permalink
Use CUDA 11.2+ features via dlopen (#990)
Browse files Browse the repository at this point in the history
By binding to the cudart 11.2 functions at runtime we remove the requirement that these symbols exist at load time, thereby allowing
RMM to be compiled with CUDA 11.2+ and used with CUDA 11.0 or 11.1.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Rong Ou (https://github.com/rongou)
  - Mark Harris (https://github.com/harrism)

URL: #990
  • Loading branch information
robertmaynard authored Mar 18, 2022
1 parent b21734e commit 7434bd6
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 37 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
# When cudart is linked statically, all 11.2+ symbols are guaranteed to be
# resolved at link time, so define RMM_STATIC_CUDART to let the headers call
# the runtime directly instead of going through the dlopen-based lookup.
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
# Dynamic cudart: symbols newer than the runtime found at execution time may
# be absent, so the headers resolve 11.2+ entry points via dlopen/dlsym.
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
140 changes: 140 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <memory>
#include <optional>
#include <string>

namespace rmm::detail {

/**
* @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
*
* By loading the cudart library at runtime we can use functions that
* are added in newer minor versions of the cuda runtime.
*/
struct dynamic_load_runtime {
  /**
   * @brief Return a process-wide handle to the dynamically loaded cudart library.
   *
   * The library is opened exactly once (function-local static) and kept open for
   * the lifetime of the process; the unique_ptr's deleter dlcloses it at exit.
   *
   * @throws if no cudart shared library can be opened (via RMM_FAIL).
   */
  static void* get_cuda_runtime_handle()
  {
    auto do_close = [](void* lib) { ::dlclose(lib); };
    auto do_open  = []() {
      ::dlerror();  // clear any stale dlerror state before we start
      int const major_version = CUDART_VERSION / 1000;
      // NOTE(review): every CUDA 11.x toolkit ships the runtime with soname
      // libcudart.so.11.0, hence the hard-coded ".0" minor — confirm before
      // reusing this pattern for future major versions.
      std::string const versioned_name = "libcudart.so." + std::to_string(major_version) + ".0";
      std::string const plain_name     = "libcudart.so";

      void* lib = ::dlopen(versioned_name.c_str(), RTLD_LAZY);
      if (lib == nullptr) { lib = ::dlopen(plain_name.c_str(), RTLD_LAZY); }
      if (lib == nullptr) { RMM_FAIL("Unable to dlopen cudart"); }
      return lib;
    };
    static std::unique_ptr<void, decltype(do_close)> cudart{do_open(), do_close};
    return cudart.get();
  }

  // Pointer-to-function type for a cudart entry point taking Args... and
  // returning cudaError_t.
  template <typename... Args>
  using function_sig = std::add_pointer_t<cudaError_t(Args...)>;

  /**
   * @brief Look up @p func_name in the loaded cudart.
   *
   * @return the function pointer, or std::nullopt when the symbol is absent
   * (e.g. running against an older runtime than we were compiled with).
   */
  template <typename signature>
  static std::optional<signature> function(const char* func_name)
  {
    void* lib    = get_cuda_runtime_handle();
    void* symbol = ::dlsym(lib, func_name);
    if (symbol == nullptr) { return std::nullopt; }
    return std::optional<signature>{reinterpret_cast<signature>(symbol)};
  }
};

#if defined(RMM_STATIC_CUDART)
// Static cudart: the symbol must exist at link time, so call it directly and
// static_assert that the address is non-null (taking the address of a function
// is never null, hence the -Waddress suppression).
//
// BUGFIX: `#name` / `#signature` previously appeared INSIDE the string
// literals, where the preprocessor does not stringify them — the diagnostics
// literally read "Failed to find #name ...". Proper stringification with
// adjacent-literal concatenation fixes the messages.
// clang-format off
#define RMM_CUDART_API_WRAPPER(name, signature)                                        \
  template <typename... Args>                                                          \
  static cudaError_t name(Args... args)                                                \
  {                                                                                    \
    _Pragma("GCC diagnostic push")                                                     \
    _Pragma("GCC diagnostic ignored \"-Waddress\"")                                    \
    static_assert(static_cast<signature>(::name),                                      \
                  "Failed to find " #name " function with arguments " #signature);     \
    _Pragma("GCC diagnostic pop")                                                      \
    return ::name(args...);                                                            \
  }
// clang-format on
#else
// Dynamic cudart: resolve the symbol once (function-local static) via
// dlopen/dlsym; fail loudly if the running cudart is too old to provide it.
#define RMM_CUDART_API_WRAPPER(name, signature)                                  \
  template <typename... Args>                                                    \
  static cudaError_t name(Args... args)                                          \
  {                                                                              \
    static const auto func = dynamic_load_runtime::function<signature>(#name);   \
    if (func) { return (*func)(args...); }                                       \
    RMM_FAIL("Failed to find " #name " function in libcudart.so");               \
  }
#endif

#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync
/**
 * @brief Bind to the stream-ordered memory allocator functions
 * at runtime.
 *
 * This allows RMM users to compile/link against CUDA 11.2+ and run with
 * < CUDA 11.2 runtime as these functions are found at call time.
 */
struct async_alloc {
// Returns true only when BOTH the cudart in use provides the 11.2 pool API
// and the driver reports memory-pool support on the current device.
// Both probes are cached in function-local statics, so the (possibly
// expensive) dlsym / cudaDeviceGetAttribute calls happen at most once.
static bool is_supported()
{
#if defined(RMM_STATIC_CUDART)
// Statically linked cudart: symbol availability is a compile-time fact.
static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
#else
// Probe cudaFreeAsync as a proxy for the whole stream-ordered allocator
// API — presumably all 11.2 pool entry points ship together; TODO confirm.
static bool runtime_supports_pool =
dynamic_load_runtime::function<dynamic_load_runtime::function_sig<void*, cudaStream_t>>(
"cudaFreeAsync")
.has_value();
#endif

// Driver-side check: the device attribute is 1 only when the installed
// driver supports memory pools on the current device. A failed query
// (result != cudaSuccess) is treated as "not supported" rather than an error.
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
}

// Pointer-to-function signature helper for the wrappers below.
template <typename... Args>
using cudart_sig = dynamic_load_runtime::function_sig<Args...>;

// Each RMM_CUDART_API_WRAPPER expands to a static member function with the
// same name as the cudart entry point; it either calls ::name directly
// (RMM_STATIC_CUDART) or dispatches through a dlsym-resolved pointer.

using cudaMemPoolCreate_sig = cudart_sig<cudaMemPool_t*, const cudaMemPoolProps*>;
RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig);

using cudaMemPoolSetAttribute_sig = cudart_sig<cudaMemPool_t, cudaMemPoolAttr, void*>;
RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig);

using cudaMemPoolDestroy_sig = cudart_sig<cudaMemPool_t>;
RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig);

using cudaMallocFromPoolAsync_sig = cudart_sig<void**, size_t, cudaMemPool_t, cudaStream_t>;
RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig);

using cudaFreeAsync_sig = cudart_sig<void*, cudaStream_t>;
RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig);
};
#endif

#undef RMM_CUDART_API_WRAPPER
} // namespace rmm::detail
48 changes: 13 additions & 35 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -62,7 +63,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
// Check if cudaMallocAsync Memory pool supported
RMM_EXPECTS(is_supported(),
RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
"cudaMallocAsync not supported with this CUDA driver/runtime version");

// Construct explicit pool
Expand All @@ -71,7 +72,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

// CUDA drivers before 11.5 have known incompatibilities with the async allocator.
// We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5.
Expand All @@ -81,16 +82,16 @@ class cuda_async_memory_resource final : public device_memory_resource {
constexpr auto min_async_version{11050};
if (driver_version < min_async_version) {
int disabled{0};
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolReuseAllowOpportunistic, &disabled));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolReuseAllowOpportunistic, &disabled));
}

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand All @@ -115,40 +116,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
~cuda_async_memory_resource() override
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle()));
#endif
}
cuda_async_memory_resource(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource(cuda_async_memory_resource&&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;

/**
* @brief Is cudaMallocAsync supported with this cuda runtime/driver version?
* @return true if both the cuda runtime and driver are newer than 11.2
*/
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
#else
return false;
#endif
}

/**
* @brief Query whether the resource supports use of non-null CUDA streams for
* allocation/deallocation. `cuda_memory_resource` does not support streams.
Expand Down Expand Up @@ -184,7 +159,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -203,7 +179,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/cuda_async_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AsyncMRTest : public ::testing::Test {
protected:
void SetUp() override
{
if (!rmm::mr::cuda_async_memory_resource::is_supported()) {
if (!rmm::detail::async_alloc::is_supported()) {
GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
<< "driver/runtime version";
}
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/mr_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>

inline auto make_cuda_async()
{
if (rmm::mr::cuda_async_memory_resource::is_supported()) {
if (rmm::detail::async_alloc::is_supported()) {
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
Expand Down

0 comments on commit 7434bd6

Please sign in to comment.