Skip to content

Commit

Permalink
Python bindings for cuda_async_memory_resource (#718)
Browse files Browse the repository at this point in the history
Closes #701.

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - @jakirkham
  - Keith Kraus (@kkraus14)
  - Mark Harris (@harrism)
  - Christopher Harris (@cwharris)

URL: #718
  • Loading branch information
shwina authored Mar 3, 2021
1 parent bbc3e97 commit 3b4a555
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 14 deletions.
3 changes: 2 additions & 1 deletion include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
RMM_EXPECTS(e == cudaSuccess && v == 1,
"cudaMallocAsync not supported with this CUDA driver/runtime version");
#else
RMM_FAIL("cudaMallocAsync not supported");
RMM_FAIL(
"cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build");
#endif
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# For CUDA 11.0
# For CUDA 11.0, 11.1 and 11.2

cdef extern from "cuda.h" nogil:
cpdef enum cudaDeviceAttr:
Expand Down
3 changes: 3 additions & 0 deletions python/rmm/_lib/memory_resource.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ cdef class CudaMemoryResource(DeviceMemoryResource):
cdef class ManagedMemoryResource(DeviceMemoryResource):
pass

# Declaration only: wraps rmm::mr::cuda_async_memory_resource
# (cudaMallocAsync-backed allocation); implementation is in
# memory_resource.pyx.
cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
    pass

cdef class PoolMemoryResource(UpstreamResourceAdaptor):
pass

Expand Down
17 changes: 17 additions & 0 deletions python/rmm/_lib/memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
cdef cppclass managed_memory_resource(device_memory_resource):
managed_memory_resource() except +

# Expose the C++ cuda_async_memory_resource to Cython. The constructor
# is declared `except +` because it can throw (e.g. RMM_FAIL /
# RMM_EXPECTS when cudaMallocAsync is unavailable in the driver,
# runtime, or the toolkit used for the build).
cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
        namespace "rmm::mr" nogil:
    cdef cppclass cuda_async_memory_resource(device_memory_resource):
        cuda_async_memory_resource() except +

cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
Expand Down Expand Up @@ -99,6 +104,7 @@ cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:
cdef device_memory_resource* _get_per_device_resource \
"rmm::mr::get_per_device_resource"(cuda_device_id id)


cdef class DeviceMemoryResource:

cdef device_memory_resource* get_mr(self):
Expand Down Expand Up @@ -135,6 +141,17 @@ cdef class CudaMemoryResource(DeviceMemoryResource):
pass


cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
    """
    Memory resource that uses cudaMallocAsync/Free for
    allocation/deallocation
    """
    def __cinit__(self, device=None):
        # NOTE(review): `device` is accepted but not used here —
        # presumably for signature compatibility with the other
        # memory-resource constructors; confirm against the callers
        # that construct per-device resources.
        # Construction raises if cudaMallocAsync is unsupported
        # (the C++ ctor is declared `except +`).
        self.c_obj.reset(
            new cuda_async_memory_resource()
        )


cdef class ManagedMemoryResource(DeviceMemoryResource):
def __cinit__(self):
self.c_obj.reset(
Expand Down
2 changes: 2 additions & 0 deletions python/rmm/mr.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
from rmm._lib.memory_resource import (
BinningMemoryResource,
CudaAsyncMemoryResource,
CudaMemoryResource,
DeviceMemoryResource,
FixedSizeMemoryResource,
Expand All @@ -22,6 +23,7 @@

__all__ = [
"BinningMemoryResource",
"CudaAsyncMemoryResource",
"CudaMemoryResource",
"DeviceMemoryResource",
"FixedSizeMemoryResource",
Expand Down
34 changes: 34 additions & 0 deletions python/rmm/tests/test_rmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

cuda.set_memory_manager(rmm.RMMNumbaManager)

_driver_version = rmm._cuda.gpu.driverGetVersion()
_runtime_version = rmm._cuda.gpu.runtimeGetVersion()


@pytest.fixture(scope="function", autouse=True)
def rmm_auto_reinitialize():
Expand Down Expand Up @@ -444,3 +447,34 @@ def test_mr_upstream_lifetime():
# Delete cuda_mr first. Should be kept alive by pool_mr
del cuda_mr
del pool_mr


@pytest.mark.skipif(
    # Both the driver AND the runtime must each be >= 11.2 (11020).
    # The previous tuple comparison was lexicographic, so e.g.
    # driver 11030 with runtime 11000 would wrongly run the test.
    _driver_version < 11020 or _runtime_version < 11020,
    reason="cudaMallocAsync not supported",
)
@pytest.mark.parametrize("dtype", _dtypes)
@pytest.mark.parametrize("nelem", _nelems)
@pytest.mark.parametrize("alloc", _allocs)
def test_cuda_async_memory_resource(dtype, nelem, alloc):
    """CudaAsyncMemoryResource can be installed as the current device
    resource and services allocations across dtypes/sizes/allocators."""
    mr = rmm.mr.CudaAsyncMemoryResource()
    rmm.mr.set_current_device_resource(mr)
    assert rmm.mr.get_current_device_resource_type() is type(mr)
    array_tester(dtype, nelem, alloc)


@pytest.mark.skipif(
    # Both the driver AND the runtime must each be >= 11.2 (11020).
    # The previous tuple comparison was lexicographic, so e.g.
    # driver 11030 with runtime 11000 would wrongly run the test.
    _driver_version < 11020 or _runtime_version < 11020,
    reason="cudaMallocAsync not supported",
)
@pytest.mark.parametrize("nelems", _nelems)
def test_cuda_async_memory_resource_stream(nelems):
    """Using CudaAsyncMemoryResource with a non-default stream works:
    a round-trip device copy on that stream preserves the data."""
    mr = rmm.mr.CudaAsyncMemoryResource()
    rmm.mr.set_current_device_resource(mr)
    stream = rmm._cuda.stream.Stream()
    expected = np.full(nelems, 5, dtype="u1")
    dbuf = rmm.DeviceBuffer.to_device(expected, stream=stream)
    result = np.asarray(dbuf.copy_to_host())
    np.testing.assert_equal(expected, result)
43 changes: 31 additions & 12 deletions python/setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
import filecmp
import glob
import os
import re
Expand Down Expand Up @@ -63,22 +64,40 @@ def get_cuda_version_from_header(cuda_include_dir):
# valid symbols for specific version of CUDA.

cwd = os.getcwd()
preprocess_files = ["gpu.pxd"]
supported_cuda_versions = {"10.1", "10.2", "11.0"}

for file_p in preprocess_files:
pxi_file = ".".join(file_p.split(".")[:-1])
pxi_file = pxi_file + ".pxi"

if CUDA_VERSION in supported_cuda_versions:
shutil.copyfile(
os.path.join(cwd, "rmm/_cuda", CUDA_VERSION, pxi_file),
os.path.join(cwd, "rmm/_cuda", file_p),
# Cython sources that must be specialized for the CUDA version
# detected at build time.
files_to_preprocess = ["gpu.pxd"]

# The .pxi file is unchanged between some CUDA versions
# (e.g., 11.0 & 11.1), so we keep only a single copy
# of it
cuda_version_to_pxi_dir = {
    "10.1": "10.1",
    "10.2": "10.2",
    "11.0": "11.x",
    "11.1": "11.x",
    "11.2": "11.x",
}

# Copy the version-specific .pxi template over the generic .pxd used by
# the Cython sources, skipping the copy when the destination already
# matches (avoids touching the file and triggering needless rebuilds).
for pxd_basename in files_to_preprocess:
    pxi_basename = os.path.splitext(pxd_basename)[0] + ".pxi"
    if CUDA_VERSION in cuda_version_to_pxi_dir:
        pxi_pathname = os.path.join(
            cwd,
            "rmm/_cuda",
            cuda_version_to_pxi_dir[CUDA_VERSION],
            pxi_basename,
        )
        pxd_pathname = os.path.join(cwd, "rmm/_cuda", pxd_basename)
        try:
            if filecmp.cmp(pxi_pathname, pxd_pathname):
                # files are the same, no need to copy
                continue
        except FileNotFoundError:
            # pxd_pathname doesn't exist yet
            pass
        shutil.copyfile(pxi_pathname, pxd_pathname)
    else:
        # NOTE(review): ValueError would be the conventional exception
        # for an unsupported version string; TypeError implies a wrong
        # argument type. Confirm nothing catches TypeError here.
        raise TypeError(f"{CUDA_VERSION} is not supported.")


try:
nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0")
except Exception:
Expand Down

0 comments on commit 3b4a555

Please sign in to comment.