Improve the Arena allocator to reduce memory fragmentation #916

Merged Jan 12, 2022 · 46 commits · changes shown are from 16 commits

Commits
b041286  add some tests for arena mr (rongou, Oct 8, 2021)
241816c  Merge remote-tracking branch 'upstream/branch-21.12' into arena-super… (rongou, Oct 29, 2021)
8bda94e  make superblocks persistent between different arenas (rongou, Nov 8, 2021)
69a8778  Merge remote-tracking branch 'upstream/branch-21.12' into arena-super… (rongou, Nov 8, 2021)
5da4b59  fix segfault (rongou, Nov 9, 2021)
10ed42c  add back memory dump (rongou, Nov 9, 2021)
104e17c  Merge remote-tracking branch 'upstream/branch-21.12' into arena-super… (rongou, Nov 10, 2021)
3f5bf1e  switch to map for superblocks (rongou, Nov 10, 2021)
d33b9a0  add some tests (rongou, Nov 11, 2021)
b4a1d6a  add more tests (rongou, Nov 11, 2021)
288a056  Merge remote-tracking branch 'upstream/branch-21.12' into arena-super… (rongou, Nov 11, 2021)
d86d6b1  fix clang tidy warnings in test (rongou, Nov 11, 2021)
f87ba63  add some logging asserts (rongou, Nov 12, 2021)
ce633f2  more tests for global arena (rongou, Nov 12, 2021)
d47d5dd  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Nov 12, 2021)
23f679c  add back defrag (rongou, Nov 12, 2021)
a5a4881  more tests (rongou, Nov 16, 2021)
f77fb7e  add tests for arena (rongou, Nov 16, 2021)
dd86082  remove alignment changes (rongou, Nov 16, 2021)
29ae23b  small fixes (rongou, Nov 16, 2021)
abd7226  switch back to set, fix tests (rongou, Nov 17, 2021)
10771f5  stream synchronize before releasing superblock (rongou, Nov 18, 2021)
c16f026  update docs (rongou, Nov 18, 2021)
f3e6875  use byte literals in tests (rongou, Nov 18, 2021)
cb25f74  fix overflow bug (rongou, Nov 18, 2021)
6eb957f  more fixes (rongou, Nov 23, 2021)
0f96e0a  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Nov 23, 2021)
9a2e917  clean instead of defragment individual arenas (rongou, Nov 30, 2021)
fb1f193  lower superblock size to 1MB (rongou, Nov 30, 2021)
0999300  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Dec 2, 2021)
5148c51  align to size classes (rongou, Dec 4, 2021)
a13e8ad  keep track of large allocations in superblocks (rongou, Dec 7, 2021)
7082f22  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Dec 7, 2021)
fb9ce95  log max free in superblock (rongou, Dec 7, 2021)
65742cb  log fragmentation percentage (rongou, Dec 8, 2021)
b92c9eb  minor fix (rongou, Dec 8, 2021)
5452b82  clang format (rongou, Dec 9, 2021)
c782893  address review comments (rongou, Dec 9, 2021)
4355984  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Dec 15, 2021)
0fd715e  clang format (rongou, Dec 15, 2021)
9d142c5  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Dec 17, 2021)
03696b5  Merge remote-tracking branch 'upstream/branch-22.02' into arena-super… (rongou, Jan 11, 2022)
c42a4d4  review feedback (rongou, Jan 12, 2022)
96c976b  clang format (rongou, Jan 12, 2022)
a97565d  increase test coverage (rongou, Jan 12, 2022)
5cf9360  clang format (rongou, Jan 12, 2022)
36 changes: 36 additions & 0 deletions include/rmm/detail/aligned.hpp
@@ -62,6 +62,18 @@ constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
return (value + (alignment - 1)) & ~(alignment - 1);
}
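
// Illustrative only, not part of this diff: a worked example of the bit trick
// above with a power-of-two alignment.
static_assert(align_up(1000, 256) == 1024);  // 1000 + 255 = 1255; 1255 & ~255 = 1024
static_assert(align_up(1024, 256) == 1024);  // already-aligned values are unchanged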

/**
* @brief Align up to nearest multiple of the CUDA allocation alignment
*
* @param[in] value value to align
*
* @return The aligned value
*/
constexpr std::size_t align_up_cuda(std::size_t value) noexcept
{
return align_up(value, CUDA_ALLOCATION_ALIGNMENT);
}

/**
* @brief Align down to the nearest multiple of specified power of 2
*
@@ -76,6 +88,18 @@ constexpr std::size_t align_down(std::size_t value, std::size_t alignment) noexcept
return value & ~(alignment - 1);
}

/**
* @brief Align down to the nearest multiple of the CUDA allocation alignment
*
* @param[in] value value to align
*
* @return The aligned value
*/
constexpr std::size_t align_down_cuda(std::size_t value) noexcept
{
return align_down(value, CUDA_ALLOCATION_ALIGNMENT);
}

/**
* @brief Checks whether a value is aligned to a multiple of a specified power of 2
*
@@ -90,6 +114,18 @@ constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
return value == align_down(value, alignment);
}

/**
* @brief Checks whether a value is aligned to a multiple of the CUDA allocation alignment
*
* @param[in] value value to check for alignment
*
* @return true if `value` is aligned to the CUDA allocation alignment
*/
constexpr bool is_cuda_aligned(std::size_t value) noexcept
{
return is_aligned(value, CUDA_ALLOCATION_ALIGNMENT);
}
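
// Illustrative only, not part of this diff: how the three new helpers relate,
// whatever the (power-of-two) value of CUDA_ALLOCATION_ALIGNMENT is.
static_assert(align_up_cuda(1000) == align_up(1000, CUDA_ALLOCATION_ALIGNMENT));
static_assert(is_cuda_aligned(align_up_cuda(1000)));  // align_up_cuda always yields an aligned value
static_assert(align_down_cuda(align_up_cuda(1000)) == align_up_cuda(1000));  // aligning an aligned value is a no-op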

inline bool is_pointer_aligned(void* ptr, std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT)
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
92 changes: 69 additions & 23 deletions include/rmm/mr/device/arena_memory_resource.hpp
@@ -77,29 +77,39 @@ class arena_memory_resource final : public device_memory_resource {
* @brief Construct an `arena_memory_resource`.
*
* @throws rmm::logic_error if `upstream_mr == nullptr`.
* @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of
* 256 bytes.
* @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of
* 256 bytes.
*
* @param upstream_mr The memory resource from which to allocate blocks for the pool
* @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of
* the available memory on the current device.
* @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all
* of the available memory on the current device.
* @param upstream_mr The memory resource from which to allocate blocks for the pool.
* @param arena_size Size in bytes of the global arena. Defaults to all the available memory on
* the current device.
*/
explicit arena_memory_resource(Upstream* upstream_mr,
std::size_t initial_size = global_arena::default_initial_size,
std::size_t maximum_size = global_arena::default_maximum_size,
bool dump_log_on_failure = false)
: global_arena_{upstream_mr, initial_size, maximum_size},
dump_log_on_failure_{dump_log_on_failure}
std::optional<std::size_t> arena_size = std::nullopt,
bool dump_log_on_failure = false)
: global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
{
if (dump_log_on_failure_) {
logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
}
}

/**
* @brief Construct an `arena_memory_resource`.
*
* @throws rmm::logic_error if `upstream_mr == nullptr`.
*
* @param upstream_mr The memory resource from which to allocate blocks for the pool.
* @param arena_size Size in bytes of the global arena. Defaults to all the available memory on
* the current device.
* @param max_size Unused.
* @deprecated Use the constructor without the maximum size parameter.
*/
arena_memory_resource(Upstream* upstream_mr,
std::optional<std::size_t> arena_size,
std::optional<std::size_t> max_size)
: arena_memory_resource{upstream_mr, arena_size, false}
{
}
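
For illustration only (not from this PR; the upstream resource choice, the 1 GiB cap, and the request size are arbitrary), constructing and using the resource with the new single optional arena size might look like this:

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <cstddef>

int main()
{
  rmm::mr::cuda_memory_resource upstream;

  // Default: the global arena claims all available device memory.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  // Alternatively, cap the global arena at an explicit size, e.g. 1 GiB.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> capped{&upstream, std::size_t{1} << 30};

  void* ptr = mr.allocate(1000);  // padded internally to the CUDA allocation alignment
  mr.deallocate(ptr, 1000);
  return 0;
}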

~arena_memory_resource() override = default;

// Disable copy (and move) semantics.
@@ -124,10 +134,10 @@
bool supports_get_mem_info() const noexcept override { return false; }

private:
using global_arena = detail::arena::global_arena<Upstream>;
using arena = detail::arena::arena<Upstream>;
using read_lock = std::shared_lock<std::shared_timed_mutex>;
using write_lock = std::lock_guard<std::shared_timed_mutex>;
using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
using arena = rmm::mr::detail::arena::arena<Upstream>;
using read_lock = std::shared_lock<std::shared_mutex>;
using write_lock = std::unique_lock<std::shared_mutex>;
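
// Illustrative only, not part of this diff: with std::shared_mutex, many
// threads may hold a read_lock concurrently (arena lookup on the hot path),
// while a write_lock is exclusive (inserting into the arena maps):
//
//   { read_lock lock(mtx_);  /* concurrent lookups proceed */ }
//   { write_lock lock(mtx_); /* sole owner; readers wait */ }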

/**
* @brief Allocates memory of size at least `bytes`.
@@ -144,7 +154,7 @@
{
if (bytes <= 0) { return nullptr; }

bytes = detail::arena::align_up(bytes);
bytes = rmm::detail::align_up_cuda(bytes);
auto& arena = get_arena(stream);
void* pointer = arena.allocate(bytes);

@@ -173,8 +183,44 @@
{
if (ptr == nullptr || bytes <= 0) { return; }

bytes = detail::arena::align_up(bytes);
get_arena(stream).deallocate(ptr, bytes, stream);
bytes = rmm::detail::align_up_cuda(bytes);
if (!get_arena(stream).deallocate(ptr, bytes, stream)) {
deallocate_from_other_arena(ptr, bytes, stream);
}
}

/**
* @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
*
* @param ptr Pointer to be deallocated.
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `ptr`.
* @param stream Stream on which to perform deallocation.
*/
void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
{
stream.synchronize_no_throw();

read_lock lock(mtx_);

if (use_per_thread_arena(stream)) {
auto const id = std::this_thread::get_id();
for (auto&& kv : thread_arenas_) {
// If the arena does not belong to the current thread, try to deallocate from it, and return
// if successful.
if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; }
}
} else {
for (auto&& kv : stream_arenas_) {
// If the arena does not belong to the current stream, try to deallocate from it, and return
// if successful.
if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; }
}
}

// The thread that originally allocated the block has terminated; deallocate
// directly in the global arena.
global_arena_.deallocate_from_other_arena(ptr, bytes);
}
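
To make the fallback concrete, here is a hypothetical scenario (illustrative, not from this PR): a block allocated on one thread is freed on another, so the freeing thread's own arena misses and the cross-arena search above takes over.

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <thread>

int main()
{
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};

  void* ptr = nullptr;
  std::thread producer([&] { ptr = mr.allocate(1024); });
  producer.join();

  // If built with per-thread default streams, each thread has its own arena;
  // this free then misses the calling thread's arena and falls back to
  // deallocate_from_other_arena().
  std::thread consumer([&] { mr.deallocate(ptr, 1024); });
  consumer.join();
  return 0;
}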

@@ -302,11 +348,11 @@
/// Implementation note: for small sizes, map is more efficient than unordered_map.
std::map<cudaStream_t, arena> stream_arenas_;
/// If true, dump memory information to log on allocation failure.
bool dump_log_on_failure_;
bool dump_log_on_failure_{};
/// The logger for memory dump.
std::shared_ptr<spdlog::logger> logger_{};
/// Mutex for read and write locks.
mutable std::shared_timed_mutex mtx_;
mutable std::shared_mutex mtx_;
};

} // namespace rmm::mr