Improve the Arena allocator to reduce memory fragmentation (#916)
Currently the arena allocator divides GPU memory into a global arena and per-thread arenas. For smaller allocations, a per-thread arena allocates large chunks of memory (superblocks) from the global arena and divides them up for individual allocations. However, when memory is deallocated from another arena (the producer/consumer pattern), or when we run out of memory and return everything to the global arena, the superblock boundaries are broken. Over time, this can cause memory to become more and more fragmented.
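
For illustration, here is a minimal sketch of the producer/consumer pattern described above: one thread allocates and a different thread frees the same pointer. The 1 MiB size and the use of the per-thread default stream are illustrative choices, not part of this change.

```cpp
// Minimal sketch, not part of this PR: allocation from one thread's arena,
// deallocation from another thread.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <cstddef>
#include <thread>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  void* ptr{nullptr};
  std::size_t const size{1UL << 20};

  // Producer: the allocation is served from this thread's arena.
  std::thread producer([&] { ptr = mr.allocate(size, rmm::cuda_stream_per_thread); });
  producer.join();

  // Consumer: the pointer is freed from a different thread's arena, which is
  // the pattern that previously broke superblock boundaries.
  std::thread consumer([&] { mr.deallocate(ptr, size, rmm::cuda_stream_per_thread); });
  consumer.join();
}
```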

This PR makes superblocks concrete objects, not just virtual boundaries, and the only units of exchange between the global arena and per-thread arenas. This should make the allocator more resistant to memory fragmentation, especially for long-running processes under constant memory pressure.

Other notable changes:
* The allocator now allocates a fixed but configurable amount of memory from CUDA up front. This introduces less fragmentation than growing the pool size gradually (see the sketch after this list).
* Switched to C++17 `std::shared_mutex`.
* Added a bunch of unit tests.
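
As a usage sketch of the fixed-size constructor (the 4 GiB value below is only an example and assumes at least that much free device memory):

```cpp
// Minimal sketch, not part of this PR: size the arena explicitly instead of
// relying on the default (half of the available device memory).
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <cstddef>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};

  // Reserve 4 GiB from CUDA up front; omitting the size (std::nullopt) falls
  // back to half of the currently available device memory.
  auto const arena_size = std::size_t{4} << 30;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream, arena_size};

  void* ptr = mr.allocate(std::size_t{1} << 20);
  mr.deallocate(ptr, std::size_t{1} << 20);
}
```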

fixes #919 
fixes #906

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Mark Harris (https://github.com/harrism)

URL: #916
rongou authored Jan 12, 2022
1 parent 5a239d2 commit ea807e8
Showing 4 changed files with 1,281 additions and 403 deletions.
6 changes: 4 additions & 2 deletions benchmarks/random_allocations/random_allocations.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -170,7 +170,9 @@ inline auto make_pool()

inline auto make_arena()
{
return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
auto free = rmm::detail::available_device_memory().first;
constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead.
return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda(), free - reserve);
}

inline auto make_binning()
145 changes: 88 additions & 57 deletions include/rmm/mr/device/arena_memory_resource.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -78,26 +78,21 @@ class arena_memory_resource final : public device_memory_resource {
* @brief Construct an `arena_memory_resource`.
*
* @throws rmm::logic_error if `upstream_mr == nullptr`.
* @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of
* 256 bytes.
* @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of
* 256 bytes.
*
* @param upstream_mr The memory resource from which to allocate blocks for the pool
* @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of
* the available memory on the current device.
* @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all
* of the available memory on the current device.
* @param upstream_mr The memory resource from which to allocate blocks for the global arena.
* @param arena_size Size in bytes of the global arena. Defaults to half of the available memory
* on the current device.
* @param dump_log_on_failure If true, dump memory log when running out of memory.
*/
explicit arena_memory_resource(Upstream* upstream_mr,
std::size_t initial_size = global_arena::default_initial_size,
std::size_t maximum_size = global_arena::default_maximum_size,
bool dump_log_on_failure = false)
: global_arena_{upstream_mr, initial_size, maximum_size},
dump_log_on_failure_{dump_log_on_failure}
std::optional<std::size_t> arena_size = std::nullopt,
bool dump_log_on_failure = false)
: global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
{
if (dump_log_on_failure_) {
logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
// Set the level to `debug` for more detailed output.
logger_->set_level(spdlog::level::info);
}
}

@@ -125,17 +120,15 @@ class arena_memory_resource final : public device_memory_resource {
bool supports_get_mem_info() const noexcept override { return false; }

private:
using global_arena = detail::arena::global_arena<Upstream>;
using arena = detail::arena::arena<Upstream>;
using read_lock = std::shared_lock<std::shared_timed_mutex>;
using write_lock = std::lock_guard<std::shared_timed_mutex>;
using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
using arena = rmm::mr::detail::arena::arena<Upstream>;

/**
* @brief Allocates memory of size at least `bytes`.
*
* The returned pointer has at least 256-byte alignment.
*
* @throws `std::bad_alloc` if the requested allocation could not be fulfilled.
* @throws `rmm::out_of_memory` if no more memory is available for the requested size.
*
* @param bytes The size in bytes of the allocation.
* @param stream The stream to associate this allocation with.
@@ -144,52 +144,100 @@ class arena_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
auto& arena = get_arena(stream);

bytes = detail::arena::align_up(bytes);
auto& arena = get_arena(stream);
void* pointer = arena.allocate(bytes);
{
std::shared_lock lock(mtx_);
void* pointer = arena.allocate(bytes);
if (pointer != nullptr) { return pointer; }
}

if (pointer == nullptr) {
write_lock lock(mtx_);
{
std::unique_lock lock(mtx_);
defragment();
pointer = arena.allocate(bytes);
void* pointer = arena.allocate(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
}
return pointer;
}
}

return pointer;
/**
* @brief Defragment memory by returning all superblocks to the global arena.
*/
void defragment()
{
RMM_CUDA_TRY(cudaDeviceSynchronize());
for (auto& thread_arena : thread_arenas_) {
thread_arena.second->clean();
}
for (auto& stream_arena : stream_arenas_) {
stream_arena.second.clean();
}
}

/**
* @brief Deallocate memory pointed to by `ptr`.
*
* @param ptr Pointer to be deallocated.
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `p`.
* value of `bytes` that was passed to the `allocate` call that returned `ptr`.
* @param stream Stream on which to perform deallocation.
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
auto& arena = get_arena(stream);

{
std::shared_lock lock(mtx_);
// If the memory being freed does not belong to the arena, the following will return false.
if (arena.deallocate(ptr, bytes, stream)) { return; }
}

{
// Since we are returning this memory to another stream, we need to make sure the current
// stream is caught up.
stream.synchronize_no_throw();

bytes = detail::arena::align_up(bytes);
get_arena(stream).deallocate(ptr, bytes, stream);
std::unique_lock lock(mtx_);
deallocate_from_other_arena(ptr, bytes, stream);
}
}

/**
* @brief Defragment memory by returning all free blocks to the global arena.
* @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
*
* @param ptr Pointer to be deallocated.
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `ptr`.
* @param stream Stream on which to perform deallocation.
*/
void defragment()
void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
{
RMM_CUDA_TRY(cudaDeviceSynchronize());
for (auto& thread_arena : thread_arenas_) {
thread_arena.second->clean();
}
for (auto& stream_arena : stream_arenas_) {
stream_arena.second.clean();
if (use_per_thread_arena(stream)) {
for (auto const& thread_arena : thread_arenas_) {
if (thread_arena.second->deallocate(ptr, bytes)) { return; }
}
} else {
for (auto& stream_arena : stream_arenas_) {
if (stream_arena.second.deallocate(ptr, bytes)) { return; }
}
}

if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); }
}

/**
Expand All @@ -213,12 +254,12 @@ class arena_memory_resource final : public device_memory_resource {
{
auto const thread_id = std::this_thread::get_id();
{
read_lock lock(mtx_);
std::shared_lock lock(map_mtx_);
auto const iter = thread_arenas_.find(thread_id);
if (iter != thread_arenas_.end()) { return *iter->second; }
}
{
write_lock lock(mtx_);
std::unique_lock lock(map_mtx_);
auto thread_arena = std::make_shared<arena>(global_arena_);
thread_arenas_.emplace(thread_id, thread_arena);
thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
@@ -235,12 +276,12 @@ class arena_memory_resource final : public device_memory_resource {
{
RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
{
read_lock lock(mtx_);
std::shared_lock lock(map_mtx_);
auto const iter = stream_arenas_.find(stream.value());
if (iter != stream_arenas_.end()) { return iter->second; }
}
{
write_lock lock(mtx_);
std::unique_lock lock(map_mtx_);
stream_arenas_.emplace(stream.value(), global_arena_);
return stream_arenas_.at(stream.value());
}
@@ -269,18 +310,6 @@ class arena_memory_resource final : public device_memory_resource {
logger_->info("**************************************************");
logger_->info("Global arena:");
global_arena_.dump_memory_log(logger_);
logger_->info("Per-thread arenas:");
for (auto const& thread_arena : thread_arenas_) {
logger_->info(" Thread {}:", thread_arena.first);
thread_arena.second->dump_memory_log(logger_);
}
if (!stream_arenas_.empty()) {
logger_->info("Per-stream arenas:");
for (auto const& stream_arena : stream_arenas_) {
logger_->info(" Stream {}:", static_cast<void*>(stream_arena.first));
stream_arena.second.dump_memory_log(logger_);
}
}
logger_->flush();
}

@@ -304,11 +333,13 @@ class arena_memory_resource final : public device_memory_resource {
/// Implementation note: for small sizes, map is more efficient than unordered_map.
std::map<cudaStream_t, arena> stream_arenas_;
/// If true, dump memory information to log on allocation failure.
bool dump_log_on_failure_;
bool dump_log_on_failure_{};
/// The logger for memory dump.
std::shared_ptr<spdlog::logger> logger_{};
/// Mutex for read and write locks.
mutable std::shared_timed_mutex mtx_;
/// Mutex for read and write locks on arena maps.
mutable std::shared_mutex map_mtx_;
/// Mutex for shared and unique locks on the mr.
mutable std::shared_mutex mtx_;
};

} // namespace rmm::mr
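
Finally, a minimal sketch of the dump-log path wired up in the constructor above: with `dump_log_on_failure` enabled, an allocation that still fails after defragmentation writes `rmm_arena_memory_dump.log` before `rmm::out_of_memory` is thrown. The default arena size is used here; everything else follows the constructor shown in the diff.

```cpp
// Minimal sketch: enable the memory dump written on allocation failure.
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

#include <cstddef>
#include <optional>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};

  // Default arena size (half the available device memory). If an allocation
  // cannot be satisfied even after defragmentation, the global arena's state
  // is dumped to rmm_arena_memory_dump.log before rmm::out_of_memory is thrown.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{
    &upstream, std::nullopt, /*dump_log_on_failure=*/true};

  void* ptr = mr.allocate(std::size_t{1} << 20);
  mr.deallocate(ptr, std::size_t{1} << 20);
}
```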