Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CUDA 11.2+ features via dlopen #990

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8944f92
CUDA 11.2+ features are used via dlopen
robertmaynard Mar 9, 2022
838d115
Remove cuda_async_memory_resource::is_supported
robertmaynard Mar 9, 2022
452b998
Give the typedef used by `function` a better name
robertmaynard Mar 9, 2022
f05cf86
Consistently use 'auto *'
robertmaynard Mar 9, 2022
b070025
Refactor 'open_cuda_runtime' to be thread safe
robertmaynard Mar 10, 2022
b598499
Remove an unneeded check from dynamic_load_runtime
robertmaynard Mar 10, 2022
b3ff2d8
Correct style issues found by CI
robertmaynard Mar 10, 2022
2b3a2b1
Refactor 'open_cuda_runtime' to 'get_cuda_runtime_handle'
robertmaynard Mar 10, 2022
da02b55
Ensure we call the correct cudart functions
robertmaynard Mar 14, 2022
9d228b9
Correct build failures
robertmaynard Mar 14, 2022
8780a93
verify RMM_SYNC_ALLOC_WRAPPER signature in static code path
robertmaynard Mar 14, 2022
9bfb83c
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
2424da3
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
0a32c5a
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
49df39e
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
80e5316
Update include/rmm/detail/dynamic_load_runtime.hpp
robertmaynard Mar 15, 2022
8463e0f
Rename RMM_SYNC_ALLOC_WRAPPER to RMM_CUDART_API_WRAPPER
robertmaynard Mar 15, 2022
ff9ea91
fix function_sig typo
robertmaynard Mar 15, 2022
a3a2cf5
Ensure we call cudaMemPoolDestroy via async_alloc
robertmaynard Mar 15, 2022
baf618b
dynamic_load_runtime now returns std::optional<function_ptr>
robertmaynard Mar 15, 2022
132305e
dynamic_load_runtime now returns std::optional<function_ptr>
robertmaynard Mar 15, 2022
2ff56b7
Correct style issues found by CI
robertmaynard Mar 15, 2022
207cfae
return empty optional instead of optional(nullptr)
robertmaynard Mar 15, 2022
13ef5ea
Correct std::optional issues found by review
robertmaynard Mar 16, 2022
2ddc6fd
Correct style issues found by ci
robertmaynard Mar 16, 2022
76b8db5
Merge branch 'branch-22.04' into cuda_async_memory_resource-dlopen-cu…
robertmaynard Mar 16, 2022
1aad91c
Merge branch 'branch-22.04' into cuda_async_memory_resource-dlopen-cu…
robertmaynard Mar 17, 2022
1994bc4
opportunistic reuse bug now updated to use `rmm::detail::async_alloc`
robertmaynard Mar 17, 2022
48e10d6
Use static_assert to validate function arguments are valid
robertmaynard Mar 17, 2022
86ef0f9
remove indirect function call when building statically
robertmaynard Mar 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
if(CUDA_STATIC_RUNTIME)
message(STATUS "RMM: Enabling static linking of cudart")
target_link_libraries(rmm INTERFACE CUDA::cudart_static)
target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
else()
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()
Expand Down
116 changes: 116 additions & 0 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_runtime_api.h>

#include <dlfcn.h>

#include <cassert>
#include <memory>
#include <string>
#include <type_traits>

namespace rmm::detail {

/**
* @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
*
* By loading the cudart library at runtime we can use functions that
* are added in newer minor versions of the cuda runtime.
*/
struct dynamic_load_runtime {
static void* open_cuda_runtime()
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
{
auto close_cudart = [](void* handle) { ::dlclose(handle); };
auto open_cudart = []() {
::dlerror();
const int major = CUDART_VERSION / 1000;
const std::string libname_ver = "libcudart.so." + std::to_string(major) + ".0";
const std::string libname = "libcudart.so";

auto ptr = ::dlopen(libname_ver.c_str(), RTLD_LAZY);
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
if (!ptr) { ptr = ::dlopen(libname.c_str(), RTLD_LAZY); }
if (ptr) { return ptr; }

RMM_FAIL("Unable to dlopen cudart");
};
static std::unique_ptr<void, decltype(close_cudart)> cudart_handle{open_cudart(), close_cudart};
return cudart_handle.get();
}

template <typename... Args>
using cudart_func_ptr = std::add_pointer_t<cudaError_t(Args...)>;

template <typename... Args>
static cudart_func_ptr<Args...> function(const char* func_name)
{
auto* runtime = open_cuda_runtime();
if (!runtime) { return nullptr; }
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
auto* handle = ::dlsym(runtime, func_name);
if (!handle) { return nullptr; }
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
robertmaynard marked this conversation as resolved.
Show resolved Hide resolved
auto* function_ptr = reinterpret_cast<cudart_func_ptr<Args...>>(handle);
return function_ptr;
}
};

#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
/**
 * @brief `async_alloc` binds to the Stream Ordered Memory Allocator functions
 * at runtime.
 *
 * This allows rmm users to compile/link against CUDA 11.2+ and run with
 * < CUDA 11.2 runtime as these functions are found at call time.
 */
struct async_alloc {
  /**
   * @brief Is the stream-ordered allocator usable on this machine?
   *
   * Requires both a runtime that provides the 11.2 entry points and a
   * driver/device that reports memory-pool support. Both probes are cached
   * in function-local statics, so the work happens once.
   */
  static bool is_supported()
  {
#if defined(RMM_STATIC_CUDART)
    // Statically linked cudart: the symbols are baked in, so the
    // compile-time version check is sufficient.
    static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
#else
    // Probe the dynamically loaded runtime for one of the 11.2 entry points,
    // using the real signature cudaError_t cudaFreeAsync(void*, cudaStream_t).
    static bool runtime_supports_pool =
      dynamic_load_runtime::function<void*, cudaStream_t>("cudaFreeAsync") != nullptr;
#endif

    static auto driver_supports_pool{[] {
      int cuda_pool_supported{};
      auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                           cudaDevAttrMemoryPoolsSupported,
                                           rmm::detail::current_device().value());
      return result == cudaSuccess and cuda_pool_supported == 1;
    }()};
    return runtime_supports_pool and driver_supports_pool;
  }

#if defined(RMM_STATIC_CUDART)
// Call the statically linked cudart entry point directly.
#define RMM_SYNC_ALLOC_WRAPPER(name)    \
  template <typename... Args>           \
  static cudaError_t name(Args... args) \
  {                                     \
    return ::name(args...);             \
  }
#else
// Resolve the entry point once via dlsym and cache it. Callers must check
// is_supported() first; the assert guards against invoking a null pointer
// when the installed runtime lacks the symbol.
#define RMM_SYNC_ALLOC_WRAPPER(name)                                         \
  template <typename... Args>                                                \
  static cudaError_t name(Args... args)                                      \
  {                                                                          \
    static const auto func = dynamic_load_runtime::function<Args...>(#name); \
    assert(func != nullptr);                                                 \
    return func(args...);                                                    \
  }
#endif

  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolCreate);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolSetAttribute);
  RMM_SYNC_ALLOC_WRAPPER(cudaMemPoolDestroy);
  RMM_SYNC_ALLOC_WRAPPER(cudaMallocFromPoolAsync);
  RMM_SYNC_ALLOC_WRAPPER(cudaFreeAsync);
};
#endif
} // namespace rmm::detail
42 changes: 10 additions & 32 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/dynamic_load_runtime.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

Expand Down Expand Up @@ -62,7 +63,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
// Check if cudaMallocAsync Memory pool supported
RMM_EXPECTS(is_supported(),
RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
"cudaMallocAsync not supported with this CUDA driver/runtime version");

// Construct explicit pool
Expand All @@ -71,14 +72,14 @@ class cuda_async_memory_resource final : public device_memory_resource {
pool_props.handleTypes = cudaMemHandleTypePosixFileDescriptor;
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle_, &pool_props));

auto const [free, total] = rmm::detail::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
uint64_t threshold = release_threshold.value_or(total);
RMM_CUDA_TRY(
cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
cuda_pool_handle_, cudaMemPoolAttrReleaseThreshold, &threshold));

// Allocate and immediately deallocate the initial_pool_size to prime the pool with the
// specified size
Expand Down Expand Up @@ -111,32 +112,6 @@ class cuda_async_memory_resource final : public device_memory_resource {
cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;
cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;

/**
* @brief Is cudaMallocAsync supported with this cuda runtime/driver version?
* @return true if both the cuda runtime and driver are newer than 11.2
*/
static bool is_supported()
{
#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
static auto runtime_supports_pool{[] {
int runtime_version{};
RMM_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version));
constexpr auto min_async_version{11020};
return runtime_version >= min_async_version;
}()};
static auto driver_supports_pool{[] {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
#else
return false;
#endif
}

/**
* @brief Query whether the resource supports use of non-null CUDA streams for
* allocation/deallocation. `cuda_memory_resource` does not support streams.
Expand Down Expand Up @@ -172,7 +147,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* ptr{nullptr};
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
&ptr, bytes, pool_handle(), stream.value()));
}
#else
(void)bytes;
Expand All @@ -191,7 +167,9 @@ class cuda_async_memory_resource final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view stream) override
{
#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
if (ptr != nullptr) {
RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
}
#else
(void)ptr;
(void)stream;
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/cuda_async_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AsyncMRTest : public ::testing::Test {
protected:
void SetUp() override
{
if (!rmm::mr::cuda_async_memory_resource::is_supported()) {
if (!rmm::detail::async_alloc::is_supported()) {
GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
<< "driver/runtime version";
}
Expand Down
2 changes: 1 addition & 1 deletion tests/mr/device/mr_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>

inline auto make_cuda_async()
{
if (rmm::mr::cuda_async_memory_resource::is_supported()) {
if (rmm::detail::async_alloc::is_supported()) {
return std::make_shared<rmm::mr::cuda_async_memory_resource>();
}
return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
Expand Down