Skip to content

Commit

Permalink
CUDA 11.2+ features are used via dlopen
Browse files Browse the repository at this point in the history
By binding to the cudart 11.2 functions at runtime we remove
the requirement that these symbols exist at link/load time, thereby
allowing RMM to be compiled with CUDA 11.2+ and used with 11.0 or 11.1.
  • Loading branch information
robertmaynard committed Mar 9, 2022
1 parent f31cd8b commit 74372fa
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 11 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
105 changes: 105 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <memory>
#include <string>

namespace rmm::detail {

/**
 * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
 *
 * By loading the cudart library at runtime we can use functions that
 * are added in newer minor versions of the cuda runtime.
 */
struct dynamic_load_runtime {
  // Closes the dlopen handle when the owning unique_ptr is destroyed.
  static constexpr auto dlclose_destructor = [](void* handle) { ::dlclose(handle); };
  // Process-wide handle to libcudart; opened lazily on first use and kept
  // open for the lifetime of the program.
  inline static std::unique_ptr<void, decltype(dlclose_destructor)> cuda_runtime_lib{
    nullptr, dlclose_destructor};

  /**
   * @brief Open the CUDA runtime shared library if it is not already open.
   *
   * Tries the versioned SONAME first (e.g. "libcudart.so.11.0") and falls
   * back to the unversioned "libcudart.so".
   *
   * @return true if a library handle is available, false otherwise.
   */
  static bool open_cuda_runtime()
  {
    if (!cuda_runtime_lib) {
      ::dlerror();  // clear any stale dlerror() state before we dlopen
      // CUDART_VERSION encodes major*1000 + minor*10; CUDA 11.x cudart
      // ships with SONAME "libcudart.so.<major>.0".
      const int major               = CUDART_VERSION / 1000;
      const std::string libname_ver = "libcudart.so." + std::to_string(major) + ".0";
      const std::string libname     = "libcudart.so";

      auto ptr = ::dlopen(libname_ver.c_str(), RTLD_LAZY);
      // BUGFIX: capture the fallback handle. The original discarded the
      // return value of the second dlopen, so the fallback path always
      // reported failure and leaked the handle when it did succeed.
      if (!ptr) { ptr = ::dlopen(libname.c_str(), RTLD_LAZY); }
      if (!ptr) { return false; }

      cuda_runtime_lib.reset(ptr);
    }
    return true;
  }

  // Pointer-to-function type `cudaError_t (*)(Args...)`, matching the
  // uniform return type of the cudart entry points we resolve.
  template <typename... Args>
  using function_return_type = std::add_pointer_t<cudaError_t(Args...)>;

  /**
   * @brief Look up a cudart entry point by name.
   *
   * @param func_name Unmangled symbol name of the cudart function.
   * @return The function pointer, or nullptr when the library cannot be
   * opened or the symbol is absent (e.g. an 11.2+ API on an older runtime).
   */
  template <typename... Args>
  static function_return_type<Args...> function(const char* func_name)
  {
    if (!open_cuda_runtime()) { return nullptr; }
    auto* handle = ::dlsym(cuda_runtime_lib.get(), func_name);
    if (!handle) { return nullptr; }
    return reinterpret_cast<function_return_type<Args...>>(handle);
  }
};


/**
 * @brief `async_alloc` binds to the Stream Ordered Memory Allocator functions
 * at runtime.
 *
 * This allows RMM users to compile/link against CUDA 11.2+ and run with a
 * < CUDA 11.2 runtime, as these functions are resolved at call time.
 */
struct async_alloc {

  /**
   * @brief Whether the stream-ordered allocator APIs are usable.
   *
   * Static cudart build: decided at compile time from CUDART_VERSION.
   * Dynamic cudart build: decided by probing the loaded runtime library
   * for the `cudaFreeAsync` symbol (present only in cudart 11.2+).
   */
  static bool is_supported()
  {
#if defined(RMM_STATIC_CUDART)
    static bool has_support = (CUDART_VERSION >= 11020);
#else
    static bool has_support = dynamic_load_runtime::function<void*>("cudaFreeAsync") != nullptr;
#endif
    return has_support;
  }

  // Generates a static forwarding wrapper for the cudart function `name`.
  //  - Static cudart: forwards directly to the statically linked ::name.
  //  - Dynamic cudart: resolves the symbol once via dlsym (cached in a
  //    function-local static) and calls through the cached pointer.
  // NOTE(review): the dynamic wrapper calls `func` without a null check;
  // callers must ensure is_supported() returned true first, otherwise this
  // dereferences a null function pointer on pre-11.2 runtimes — confirm
  // all call sites guard on is_supported().
#if defined(RMM_STATIC_CUDART)
#define RMM_SYNC_ALLOC_WRAPPER(name)\
  template <typename... Args> \
  static cudaError_t name(Args... args) { \
    return ::name(args...); \
  }
#else
#define RMM_SYNC_ALLOC_WRAPPER(name)\
  template <typename... Args> \
  static cudaError_t name(Args... args) { \
    static const auto func = dynamic_load_runtime::function<Args...>(#name); \
    return func(args...); \
  }
#endif

  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolCreate);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolSetAttribute);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolDestroy);
  RMM_SYNC_ALLOC_WRAPPER(cudaMallocFromPoolAsync);
  RMM_SYNC_ALLOC_WRAPPER(cudaFreeAsync);
};
} // namespace rmm::detail
21 changes: 10 additions & 11 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -71,14 +72,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand Down Expand Up @@ -118,12 +119,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto runtime_supports_pool = rmm::detail::async_alloc::is_supported();
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
Expand Down Expand Up @@ -172,7 +168,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -191,7 +188,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down

0 comments on commit 74372fa

Please sign in to comment.