Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GTEST/COMMON: Cache CUDA device BAR1 available size - v1.17.x #9883

Merged
merged 1 commit into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions contrib/lsan.supp
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
leak:libcuda
leak:nvmlInitWithFlags
1 change: 1 addition & 0 deletions test/gtest/common/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ int main(int argc, char **argv) {

/* set gpu context for tests that need it */
mem_buffer::set_device_context();
mem_buffer::get_bar1_free_size_nvml();

int ret;
ret = ucs::watchdog_start();
Expand Down
31 changes: 11 additions & 20 deletions test/gtest/common/mem_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,38 +196,29 @@ void mem_buffer::set_device_context()
device_set = true;
}

size_t mem_buffer::get_bar1_free_size()
{
/* All gtest CUDA tests explicitly assume that all memory allocations are
* done on device 0. The same assumption is followed here. */
size_t available_size = SIZE_MAX;
size_t mem_buffer::m_bar1_free_size = SIZE_MAX;

void mem_buffer::get_bar1_free_size_nvml()
{
#if HAVE_CUDA
nvmlDevice_t device;
nvmlBAR1Memory_t bar1mem;

if (NVML_CALL(nvmlInit_v2()) != UCS_OK) {
return available_size;
}

if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) != UCS_OK) {
/* For whatever reason we cannot open device handle.
* As a result let's assume there is no limit on the size
* and in the worst case scenario gtest will fail at runtime */
return available_size;
return;
}

if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) != UCS_OK) {
/* Similarly let's assume there is no limit on the size */
return available_size;
/* Assume no size limit in case of failure; in the worst case scenario
* gtest will fail at runtime */
if (NVML_CALL(nvmlDeviceGetHandleByIndex(0, &device)) == UCS_OK) {
if (NVML_CALL(nvmlDeviceGetBAR1MemoryInfo(device, &bar1mem)) ==
UCS_OK) {
mem_buffer::m_bar1_free_size = (size_t)bar1mem.bar1Free;
}
}

available_size = (size_t)bar1mem.bar1Free;

NVML_CALL(nvmlShutdown());
#endif

return available_size;
}

void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
Expand Down
10 changes: 9 additions & 1 deletion test/gtest/common/mem_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,15 @@ class mem_buffer {
/* returns whether ROCM device supports hipMallocPitch */
static bool is_rocm_malloc_pitch_supported();

/* Query the BAR1 free size from NVML and cache it */
static void get_bar1_free_size_nvml();

/* Return the free memory size on the GPU BAR1. If the GPU is not used,
* SIZE_MAX is returned */
static size_t get_bar1_free_size();
static size_t get_bar1_free_size()
{
/* Cached value: initialized to SIZE_MAX and overwritten only when the
 * NVML query in get_bar1_free_size_nvml() succeeds */
return m_bar1_free_size;
}

mem_buffer(size_t size, ucs_memory_type_t mem_type);
mem_buffer(size_t size, ucs_memory_type_t mem_type, uint64_t seed);
Expand Down Expand Up @@ -144,6 +150,8 @@ class mem_buffer {
ucs_memory_type_t src_mem_type,
const uint64_t mem_types);

static size_t m_bar1_free_size;

const ucs_memory_type_t m_mem_type;
void * const m_ptr;
const size_t m_size;
Expand Down
Loading