From d5ee82b9b70bd6cf81f80cdecd7bb2896b6bbecc Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 6 Aug 2020 15:45:54 -0400 Subject: [PATCH 01/97] DOC v0.16 Updates --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index cf9b466cc..d1367c552 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ from ucp import __version__ as release # The short X.Y version. -version = ".".join(release.split(".")[:2]) +version = '0.16' project = "ucx-py" copyright = "2019, NVIDIA" From c9dfdeedcf10d5aca2e072e732fe671ed1275ef4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 15:55:09 -0700 Subject: [PATCH 02/97] Fix doc version This is needed to fix a lint error in the docs. --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d1367c552..cf9b466cc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ from ucp import __version__ as release # The short X.Y version. -version = '0.16' +version = ".".join(release.split(".")[:2]) project = "ucx-py" copyright = "2019, NVIDIA" From 054acb66301bcc98f64bd291ea272c07dc16c2ad Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 18:41:59 -0700 Subject: [PATCH 03/97] Drop version update lines These get autogenerated based on the git tag. So we don't need to use `sed` to update them. Hence we drop these version update lines. --- ci/release/update-version.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 8604f8061..444978e20 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -44,7 +44,3 @@ echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } - -# RTD update -sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py -sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py From f0a0551137efb744bdf38711c138a951f1f00899 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 19:11:54 -0700 Subject: [PATCH 04/97] Drop `sed_runner` as we don't use this any more --- ci/release/update-version.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 444978e20..c2f2725f8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -39,8 +39,3 @@ else fi echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" - -# Inplace sed replace; workaround for Linux and Mac -function sed_runner() { - sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak -} From d02b2ce4045eb55abf97d02724e7be9b12d101d4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:08:45 -0700 Subject: [PATCH 05/97] Add Cython header for `topological_distance` --- ucp/_libs/topological_distance.pxd | 12 ++++++++++++ ucp/_libs/topological_distance.pyx | 3 --- 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 ucp/_libs/topological_distance.pxd diff --git a/ucp/_libs/topological_distance.pxd b/ucp/_libs/topological_distance.pxd new file mode 100644 index 000000000..f814cc38d --- /dev/null +++ b/ucp/_libs/topological_distance.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# cython: language_level=3 + + +from .topological_distance_dep cimport * + + +cdef class TopologicalDistance: + cdef: + hwloc_topology_t topo diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index fa327473a..93fe15967 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -8,9 +8,6 @@ from .topological_distance_dep cimport * cdef class TopologicalDistance: - cdef: - hwloc_topology_t topo - def __init__(self): """ Find topological distance between devices From 8f1f68372445c3e6aba5152ff3727daa8f3ee5c1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:13:23 -0700 Subject: [PATCH 06/97] Drop unneeded casts --- ucp/_libs/topological_distance.pyx | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index 93fe15967..c70a80227 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -21,10 +21,10 @@ cdef class TopologicalDistance: (e.g., InfiniBand, aka. 'mlx5' or 'ib') interfaces closest to a NVIDIA GPU. """ - self.topo = initialize_topology() + self.topo = initialize_topology() def __dealloc__(self): - cdef hwloc_topology_t topo = self.topo + cdef hwloc_topology_t topo = self.topo hwloc_topology_destroy(topo) def get_cuda_distances_from_pci_info(self, domain, bus, device, @@ -80,8 +80,8 @@ cdef class TopologicalDistance: raise RuntimeError("Unknown device type: %s" % device_type) cdef hwloc_obj_t cuda_pcidev - cuda_pcidev = get_cuda_pcidev_from_pci_info( - self.topo, domain, bus, device + cuda_pcidev = get_cuda_pcidev_from_pci_info( + self.topo, domain, bus, device ) cdef topological_distance_objs_t *dev_dist @@ -90,12 +90,10 @@ cdef class TopologicalDistance: hwloc_osdev_type) cdef topological_distance_and_name_t *dist_name - dist_name = ( - get_topological_distance_and_name(dev_dist, dev_count) - ) + dist_name = get_topological_distance_and_name(dev_dist, dev_count) - ret = [{"distance": (&dist_name[i]).distance, - "name": ((&dist_name[i]).name).decode("utf-8")} + ret = [{"distance": (&dist_name[i]).distance, + "name": ((&dist_name[i]).name).decode("utf-8")} for i in range(dev_count)] free(dev_dist) From adb6262cfbef0e4af240e7f7a59aa0560fc486a1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:20:57 -0700 Subject: [PATCH 07/97] Simplify pointer handling --- ucp/_libs/topological_distance.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index c70a80227..a73a0162f 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -92,8 +92,8 @@ cdef class TopologicalDistance: cdef topological_distance_and_name_t *dist_name dist_name = get_topological_distance_and_name(dev_dist, dev_count) - ret = [{"distance": (&dist_name[i]).distance, - "name": ((&dist_name[i]).name).decode("utf-8")} + ret = [{"distance": dist_name[i].distance, + "name": dist_name[i].name.decode("utf-8")} for i in range(dev_count)] free(dev_dist) From 336ba3182038a99b0b574a6f2f50973e46d8e75c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:26:16 -0700 Subject: [PATCH 08/97] Type method arguments (where possible) --- ucp/_libs/topological_distance.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index a73a0162f..4d7b4a3d8 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -27,8 +27,8 @@ cdef class TopologicalDistance: cdef hwloc_topology_t topo = self.topo hwloc_topology_destroy(topo) - def get_cuda_distances_from_pci_info(self, domain, bus, device, - device_type="openfabrics"): + def get_cuda_distances_from_pci_info(self, int domain, int bus, int device, + str device_type="openfabrics"): """ Find network or openfabrics devices closest to CUDA device at domain:bus:device address. @@ -102,7 +102,7 @@ cdef class TopologicalDistance: return ret def get_cuda_distances_from_device_index(self, cuda_device_index, - device_type="openfabrics"): + str device_type="openfabrics"): """ Find network or openfabrics devices closest to CUDA device of given index. Parameters From 6cc724552a0736839f16bea6951232c9a51ebd0a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:27:22 -0700 Subject: [PATCH 09/97] Type iterated index --- ucp/_libs/topological_distance.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index 4d7b4a3d8..1161bff50 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -92,6 +92,7 @@ cdef class TopologicalDistance: cdef topological_distance_and_name_t *dist_name dist_name = get_topological_distance_and_name(dev_dist, dev_count) + cdef int i ret = [{"distance": dist_name[i].distance, "name": dist_name[i].name.decode("utf-8")} for i in range(dev_count)] From 36a0d98626167d6bd54937cbb8f14bcf49c26a6c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 12:34:58 -0700 Subject: [PATCH 10/97] Mark methods as `cpdef` --- ucp/_libs/topological_distance.pxd | 7 +++++++ ucp/_libs/topological_distance.pyx | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/topological_distance.pxd b/ucp/_libs/topological_distance.pxd index f814cc38d..75edcd76b 100644 --- a/ucp/_libs/topological_distance.pxd +++ b/ucp/_libs/topological_distance.pxd @@ -10,3 +10,10 @@ from .topological_distance_dep cimport * cdef class TopologicalDistance: cdef: hwloc_topology_t topo + + cpdef get_cuda_distances_from_pci_info(self, int domain, int bus, + int device, + str device_type=*) + + cpdef get_cuda_distances_from_device_index(self, cuda_device_index, + str device_type=*) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index 1161bff50..174f6584b 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -27,8 +27,9 @@ cdef class TopologicalDistance: cdef hwloc_topology_t topo = self.topo hwloc_topology_destroy(topo) - def get_cuda_distances_from_pci_info(self, int domain, int bus, int device, - str device_type="openfabrics"): + cpdef get_cuda_distances_from_pci_info(self, int domain, int bus, + int device, + str device_type="openfabrics"): """ Find network or openfabrics devices closest to CUDA device at domain:bus:device address. From a7acedb908dc6c61053473f50558806ec685075d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 13:03:57 -0700 Subject: [PATCH 11/97] Add missing `cpdef` --- ucp/_libs/topological_distance.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index 174f6584b..bbae370d7 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -103,8 +103,8 @@ cdef class TopologicalDistance: return ret - def get_cuda_distances_from_device_index(self, cuda_device_index, - str device_type="openfabrics"): + cpdef get_cuda_distances_from_device_index(self, cuda_device_index, + str device_type="openfabrics"): """ Find network or openfabrics devices closest to CUDA device of given index. Parameters From 6508ab4b732b3b958e2453862b21e3af474efb35 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 14:52:14 -0700 Subject: [PATCH 12/97] Type `handle_as_int` as `uintptr_t` in signature Go ahead and type `handle_as_int` as `uintptr_t` in the function signature, Cython already knows how to convert generic Python `int`s to `uintptr_t` without intervention. Plus it simplifies the code internally. --- ucp/_libs/ucx_api.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 518052add..52efc53a6 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -181,8 +181,8 @@ cdef class UCXObject: ) -def _ucx_context_handle_finalizer(handle_as_int): - cdef ucp_context_h handle = handle_as_int +def _ucx_context_handle_finalizer(uintptr_t handle_as_int): + cdef ucp_context_h handle = handle_as_int ucp_cleanup(handle) From a668cff5c8b539e33aa05df5842670c31cbbb847 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 14:52:15 -0700 Subject: [PATCH 13/97] Just cast `handle` when calling `ucp_cleanup` --- ucp/_libs/ucx_api.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 52efc53a6..f67f74c82 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -181,9 +181,8 @@ cdef class UCXObject: ) -def _ucx_context_handle_finalizer(uintptr_t handle_as_int): - cdef ucp_context_h handle = handle_as_int - ucp_cleanup(handle) +def _ucx_context_handle_finalizer(uintptr_t handle): + ucp_cleanup( handle) cdef class UCXContext(UCXObject): @@ -470,9 +469,8 @@ cdef void _listener_callback(ucp_conn_request_h conn_request, void *args): ) -def _ucx_listener_handle_finalizer(uintptr_t handle_as_int): - cdef ucp_listener_h handle = handle_as_int - ucp_listener_destroy(handle) +def _ucx_listener_handle_finalizer(uintptr_t handle): + ucp_listener_destroy( handle) cdef class UCXListener(UCXObject): From e4b7d8f71335939eefe496cc9bcdc69a52101df0 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 27 Aug 2020 11:10:04 -0400 Subject: [PATCH 14/97] update install instructions for cuda 11 and remove 9.2.10.00 --- docs/source/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index 9789ee0c3..53d3bc9d1 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -14,7 +14,7 @@ Conda ----- Some preliminary Conda packages can be installed as so. Replace -```` with either ``9.2``, ``10.0``, or ``10.1``. These are +```` with either ``10.1``, ``10.2``, or ``11.0``. These are available both on ``rapidsai`` and ``rapidsai-nightly``. With GPU support: From d8cbb3c068e8ba0bf4ca619ac4aa53cab913dd55 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 28 Aug 2020 15:03:22 -0700 Subject: [PATCH 15/97] Use Python C API in util functions w/host buffers Leverages the `Py_buffer` object held by `memoryview` objects to perform some computations and checks in C. --- ucp/_libs/utils.pyx | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 1ab6f1352..f1d100b04 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -4,7 +4,11 @@ # cython: language_level=3 -from cpython.memoryview cimport PyMemoryView_GET_BUFFER +from cpython.buffer cimport PyBuffer_IsContiguous +from cpython.memoryview cimport ( + PyMemoryView_FromObject, + PyMemoryView_GET_BUFFER, +) from cython cimport boundscheck, wraparound from libc.stdint cimport uintptr_t @@ -17,14 +21,16 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) + cdef const Py_buffer* pybuf cdef uintptr_t data_ptr cdef bint data_readonly if iface is not None: data_ptr, data_readonly = iface["data"] else: - mview = memoryview(buffer) - data_ptr = PyMemoryView_GET_BUFFER(mview).buf - data_readonly = PyMemoryView_GET_BUFFER(mview).readonly + mview = PyMemoryView_FromObject(buffer) + pybuf = PyMemoryView_GET_BUFFER(mview) + data_ptr = pybuf.buf + data_readonly = pybuf.readonly if data_ptr == 0: raise NotImplementedError("zero-sized buffers isn't supported") @@ -54,6 +60,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) ex "more information." ) + cdef const Py_buffer* pybuf cdef tuple shape, strides cdef Py_ssize_t i, s, itemsize, ndim, nbytes if iface is not None: @@ -80,9 +87,12 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) ex if iface.get("mask") is not None: raise NotImplementedError("mask attribute not supported") else: - mview = memoryview(buffer) - nbytes = mview.nbytes - if not mview.c_contiguous: + mview = PyMemoryView_FromObject(buffer) + pybuf = PyMemoryView_GET_BUFFER(mview) + nbytes = pybuf.itemsize + for i in range(pybuf.ndim): + nbytes *= pybuf.shape[i] + if not PyBuffer_IsContiguous(pybuf, b"C"): raise ValueError("buffer must be C-contiguous") cdef Py_ssize_t min_size From c222fdd8b3f9243515b3d05c716356b5342a3055 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 2 Sep 2020 19:51:40 -0700 Subject: [PATCH 16/97] Cast `pybuf.readonly` to `bint` Previously Cython would have generated C code that assigned `pybuf.readonly` to another variable before assigning it to `data_readonly`. By casting `pybuf.readonly` to `bint`, we are able to bypass this intermediate variable from being used and assign directly to `data_readonly` saving some unneeded C code in the process. --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index f1d100b04..195879c51 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -30,7 +30,7 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) data_ptr = pybuf.buf - data_readonly = pybuf.readonly + data_readonly = pybuf.readonly if data_ptr == 0: raise NotImplementedError("zero-sized buffers isn't supported") From 89554a164ddfaa5efc580e60542524e045f7ab3b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 2 Sep 2020 20:38:56 -0700 Subject: [PATCH 17/97] Cast `iface["data"]` as `tuple` By default Cython generates a bunch of code to handle `tuple`s, `list`s, and generally iterable objects in Python for doing this unpacking. As we know we have a `tuple` and can thus simplify the code here a bit, go ahead and cast `iface["data"]` as a `tuple`. This way it can focus on the code needed to unpack a `tuple` and drop everything else extra. --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 195879c51..2fa889527 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -25,7 +25,7 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: cdef uintptr_t data_ptr cdef bint data_readonly if iface is not None: - data_ptr, data_readonly = iface["data"] + data_ptr, data_readonly = iface["data"] else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) From 9720bc67ea83b6dd23a913a42bcb518ccf96fd6e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 2 Sep 2020 20:42:23 -0700 Subject: [PATCH 18/97] Use `struct` with data (and it's constituents) Since the `__cuda_array_interface__` (and for that matter the `__array_interface__`) treat the `data` value as a `tuple` containing both the pointer and whether the data is readonly or not, update our code to treat it similarly at the Cython level by using a `struct` for it. This also results in a bit more efficient code. --- ucp/_libs/utils.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 2fa889527..b2b0eba43 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -13,6 +13,11 @@ from cython cimport boundscheck, wraparound from libc.stdint cimport uintptr_t +cdef struct iface_data_t: + uintptr_t ptr + bint readonly + + cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: """ Returns data pointer of the buffer. Raising ValueError if the buffer @@ -22,23 +27,22 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) cdef const Py_buffer* pybuf - cdef uintptr_t data_ptr - cdef bint data_readonly + cdef iface_data_t data if iface is not None: - data_ptr, data_readonly = iface["data"] + data.ptr, data.readonly = iface["data"] else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) - data_ptr = pybuf.buf - data_readonly = pybuf.readonly + data.ptr = pybuf.buf + data.readonly = pybuf.readonly - if data_ptr == 0: + if data.ptr == 0: raise NotImplementedError("zero-sized buffers isn't supported") - if check_writable and data_readonly: + if check_writable and data.readonly: raise ValueError("writing to readonly buffer!") - return data_ptr + return data.ptr @boundscheck(False) From 3938376cde8c0b78c99c81611260beeebde1a082 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 8 Sep 2020 14:48:44 -0700 Subject: [PATCH 19/97] Implement `get_itemsize` This provides the function `get_itemsize` to fast path identification of the `itemsize` some common `typestr`s. Anything else that cannot be easily identified is passed off to NumPy for determination. --- ucp/_libs/utils.pxd | 1 + ucp/_libs/utils.pyx | 61 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/utils.pxd b/ucp/_libs/utils.pxd index f2a924420..36ed99e40 100644 --- a/ucp/_libs/utils.pxd +++ b/ucp/_libs/utils.pxd @@ -7,5 +7,6 @@ from libc.stdint cimport uintptr_t +cpdef Py_ssize_t get_itemsize(str fmt) except * cpdef uintptr_t get_buffer_data(buffer, bint check_writable=*) except * cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) except * diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index b2b0eba43..4430b22c0 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -12,12 +12,70 @@ from cpython.memoryview cimport ( from cython cimport boundscheck, wraparound from libc.stdint cimport uintptr_t +try: + from numpy import dtype as numpy_dtype +except ImportError: + numpy_dtype = None + cdef struct iface_data_t: uintptr_t ptr bint readonly +cdef dict itemsize_mapping = { + "|b1": 1, + "|i1": 1, + "|u1": 1, + "i2": 2, + "u2": 2, + "i4": 4, + "u4": 4, + "i8": 8, + "u8": 8, + "f2": 2, + "f4": 4, + "f8": 8, + "f16": 16, + "c8": 8, + "c16": 16, + "c32": 32, +} + +cpdef Py_ssize_t get_itemsize(str fmt) except *: + """ + Get the itemsize of the format provided. + """ + if fmt is None: + raise ValueError("Expected `str`, but got `None`") + elif fmt == "": + raise ValueError("Got unexpected empty `str`") + else: + itemsize = itemsize_mapping.get(fmt) + if itemsize is None: + if numpy_dtype is not None: + itemsize = numpy_dtype(fmt).itemsize + else: + raise ValueError( + f"Unexpected `fmt`, {fmt}." + " Please install NumPy to handle this format." + ) + return itemsize + + cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: """ Returns data pointer of the buffer. Raising ValueError if the buffer @@ -68,8 +126,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) ex cdef tuple shape, strides cdef Py_ssize_t i, s, itemsize, ndim, nbytes if iface is not None: - import numpy - itemsize = numpy.dtype(iface["typestr"]).itemsize + itemsize = get_itemsize(iface["typestr"]) # Making sure that the elements in shape is integers shape = iface["shape"] ndim = len(shape) From 02b6d6ce1c436c4651972fdc186c4f2bbf4b473f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 8 Sep 2020 17:43:54 -0700 Subject: [PATCH 20/97] Type `get_buffer_nbytes`'s `min_size` Also provide default values for other parameters. --- ucp/_libs/utils.pxd | 4 +++- ucp/_libs/utils.pyx | 14 ++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ucp/_libs/utils.pxd b/ucp/_libs/utils.pxd index f2a924420..750bad9a8 100644 --- a/ucp/_libs/utils.pxd +++ b/ucp/_libs/utils.pxd @@ -8,4 +8,6 @@ from libc.stdint cimport uintptr_t cpdef uintptr_t get_buffer_data(buffer, bint check_writable=*) except * -cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) except * +cpdef Py_ssize_t get_buffer_nbytes(buffer, + Py_ssize_t min_size=*, + bint cuda_support=*) except * diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index b2b0eba43..9f8e1c4ac 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -47,7 +47,9 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: @boundscheck(False) @wraparound(False) -cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) except *: +cpdef Py_ssize_t get_buffer_nbytes(buffer, + Py_ssize_t min_size=-1, + bint cuda_support=False) except *: """ Returns the size of the buffer in bytes. Returns ValueError if `check_min_size` is greater than the size of the buffer @@ -99,11 +101,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) ex if not PyBuffer_IsContiguous(pybuf, b"C"): raise ValueError("buffer must be C-contiguous") - cdef Py_ssize_t min_size - if check_min_size is not None: - min_size = check_min_size - if nbytes < min_size: - raise ValueError( - "the nbytes is greater than the size of the buffer!" - ) + if min_size > 0 and nbytes < min_size: + raise ValueError("the nbytes is greater than the size of the buffer!") + return nbytes From 0b3dcbf1029032a666d780cc75a9e3754cd8b0b5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 8 Sep 2020 18:12:12 -0700 Subject: [PATCH 21/97] Use `min_size` as needed (and default to `-1`) When `min_size` is not used, simply don't specify it. Update any default values associated with `min_size` to be `-1` (instead of `None`) as before. Semantically this has the same meaning as it did. Though the integer value can more easily be translated to a C typed variable. --- tests/test_libs_utils.py | 14 ++++++-------- ucp/core.py | 15 +++++---------- ucp/endpoint_reuse.py | 4 ++-- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/tests/test_libs_utils.py b/tests/test_libs_utils.py index 9381764b9..c162c485b 100644 --- a/tests/test_libs_utils.py +++ b/tests/test_libs_utils.py @@ -40,18 +40,16 @@ def test_get_buffer_data_builtins(buffer): @pytest.mark.parametrize("buffer", builtin_buffers) def test_get_buffer_nbytes_builtins(buffer): nbytes = memoryview(buffer).nbytes - result = get_buffer_nbytes(buffer, check_min_size=None, cuda_support=True) + result = get_buffer_nbytes(buffer, cuda_support=True) assert result == nbytes with pytest.raises(ValueError): - get_buffer_nbytes( - memoryview(buffer)[::2], check_min_size=None, cuda_support=True - ) + get_buffer_nbytes(memoryview(buffer)[::2], cuda_support=True) # Test exceptional cases with `check_min_size` - get_buffer_nbytes(buffer, check_min_size=nbytes, cuda_support=True) + get_buffer_nbytes(buffer, min_size=nbytes, cuda_support=True) with pytest.raises(ValueError): - get_buffer_nbytes(buffer, check_min_size=(nbytes + 1), cuda_support=True) + get_buffer_nbytes(buffer, min_size=(nbytes + 1), cuda_support=True) array_params = [ @@ -94,8 +92,8 @@ def test_get_buffer_nbytes_array(xp, shape, dtype, strides): xp, arr, iface = create_array(xp, shape, dtype, strides) if arr.flags.c_contiguous: - nbytes = get_buffer_nbytes(arr, check_min_size=None, cuda_support=True) + nbytes = get_buffer_nbytes(arr, cuda_support=True) assert nbytes == arr.nbytes else: with pytest.raises(ValueError): - get_buffer_nbytes(arr, check_min_size=None, cuda_support=True) + get_buffer_nbytes(arr, cuda_support=True) diff --git a/ucp/core.py b/ucp/core.py index e29c65c0a..ae6a90bdc 100644 --- a/ucp/core.py +++ b/ucp/core.py @@ -547,7 +547,7 @@ async def close(self): self.abort() @nvtx_annotate("UCXPY_SEND", color="green", domain="ucxpy") - async def send(self, buffer, nbytes=None, tag=None): + async def send(self, buffer, nbytes=-1, tag=None): """Send `buffer` to connected peer. Parameters @@ -563,7 +563,7 @@ async def send(self, buffer, nbytes=None, tag=None): if self.closed(): raise UCXCloseError("Endpoint closed") nbytes = get_buffer_nbytes( - buffer, check_min_size=nbytes, cuda_support=self._cuda_support + buffer, min_size=nbytes, cuda_support=self._cuda_support ) log = "[Send #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._send_count, @@ -583,7 +583,7 @@ async def send(self, buffer, nbytes=None, tag=None): return await comm.tag_send(self._ep, buffer, nbytes, tag, name=log) @nvtx_annotate("UCXPY_RECV", color="red", domain="ucxpy") - async def recv(self, buffer, nbytes=None, tag=None): + async def recv(self, buffer, nbytes=-1, tag=None): """Receive from connected peer into `buffer`. Parameters @@ -601,7 +601,7 @@ async def recv(self, buffer, nbytes=None, tag=None): if self.closed(): raise UCXCloseError("Endpoint closed") nbytes = get_buffer_nbytes( - buffer, check_min_size=nbytes, cuda_support=self._cuda_support + buffer, min_size=nbytes, cuda_support=self._cuda_support ) log = "[Recv #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._recv_count, @@ -694,12 +694,7 @@ async def send_obj(self, obj, tag=None): """ nbytes = array.array( - "Q", - [ - get_buffer_nbytes( - buffer=obj, check_min_size=None, cuda_support=self._cuda_support - ) - ], + "Q", [get_buffer_nbytes(buffer=obj, cuda_support=self._cuda_support)] ) await self.send(nbytes, tag=tag) await self.send(obj, tag=tag) diff --git a/ucp/endpoint_reuse.py b/ucp/endpoint_reuse.py index 1340f81d5..81692df06 100644 --- a/ucp/endpoint_reuse.py +++ b/ucp/endpoint_reuse.py @@ -98,10 +98,10 @@ async def _handle(ep_new): return core.create_listener(_handle, port=port) - async def send(self, buffer, nbytes=None): + async def send(self, buffer, nbytes=-1): await self.handle.ep.send(buffer, nbytes=nbytes, tag=self.tag) - async def recv(self, buffer, nbytes=None): + async def recv(self, buffer, nbytes=-1): await self.handle.ep.recv(buffer, nbytes=nbytes, tag=self.tag) async def close(self): From 91d1de0e28a60ea33b57a6914dc5a9a7877259df Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 8 Sep 2020 18:46:50 -0700 Subject: [PATCH 22/97] Move `cuda_support` check into CUDA case As we already have a branch later for the CUDA case, defer checking whether `cuda_support` is `True` until we are within the CUDA case. This behaves the same while simplifying the code a bit. --- ucp/_libs/utils.pyx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index b2b0eba43..7198108bb 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -54,20 +54,20 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, check_min_size, bint cuda_support) ex """ cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) - if not cuda_support and iface is not None: - raise ValueError( - "UCX is not configured with CUDA support, please add " - "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" - "variable and that the ucx-proc=*=gpu package is " - "installed. See " - "https://ucx-py.readthedocs.io/en/latest/install.html for " - "more information." - ) - cdef const Py_buffer* pybuf cdef tuple shape, strides cdef Py_ssize_t i, s, itemsize, ndim, nbytes if iface is not None: + if not cuda_support: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + import numpy itemsize = numpy.dtype(iface["typestr"]).itemsize # Making sure that the elements in shape is integers From f3685bf98749d27c7e767e5cfaf658d044fc40d1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 8 Sep 2020 18:55:16 -0700 Subject: [PATCH 23/97] Test `cuda_support` raises for CuPy arrays --- tests/test_libs_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_libs_utils.py b/tests/test_libs_utils.py index 9381764b9..3ab566ff0 100644 --- a/tests/test_libs_utils.py +++ b/tests/test_libs_utils.py @@ -93,6 +93,10 @@ def test_get_buffer_data_array(xp, shape, dtype, strides): def test_get_buffer_nbytes_array(xp, shape, dtype, strides): xp, arr, iface = create_array(xp, shape, dtype, strides) + if xp.__name__ == "cupy": + with pytest.raises(ValueError): + get_buffer_nbytes(arr, check_min_size=None, cuda_support=False) + if arr.flags.c_contiguous: nbytes = get_buffer_nbytes(arr, check_min_size=None, cuda_support=True) assert nbytes == arr.nbytes From 0bef328ce34c2fa7190c9d8bef04dcc73b7cf424 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 10:27:53 -0700 Subject: [PATCH 24/97] Fix-up CI Rename `check_min_size` to `min_size` anywhere it was missed. Also drop `min_size` when the default value is used. --- tests/test_libs_utils.py | 4 ++-- ucp/_libs/utils.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_libs_utils.py b/tests/test_libs_utils.py index 9792e9dd3..30dde1e98 100644 --- a/tests/test_libs_utils.py +++ b/tests/test_libs_utils.py @@ -46,7 +46,7 @@ def test_get_buffer_nbytes_builtins(buffer): with pytest.raises(ValueError): get_buffer_nbytes(memoryview(buffer)[::2], cuda_support=True) - # Test exceptional cases with `check_min_size` + # Test exceptional cases with `min_size` get_buffer_nbytes(buffer, min_size=nbytes, cuda_support=True) with pytest.raises(ValueError): get_buffer_nbytes(buffer, min_size=(nbytes + 1), cuda_support=True) @@ -93,7 +93,7 @@ def test_get_buffer_nbytes_array(xp, shape, dtype, strides): if xp.__name__ == "cupy": with pytest.raises(ValueError): - get_buffer_nbytes(arr, check_min_size=None, cuda_support=False) + get_buffer_nbytes(arr, cuda_support=False) if arr.flags.c_contiguous: nbytes = get_buffer_nbytes(arr, cuda_support=True) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 1f8bc78c3..b559c57f7 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -51,8 +51,8 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, Py_ssize_t min_size=-1, bint cuda_support=False) except *: """ - Returns the size of the buffer in bytes. Returns ValueError - if `check_min_size` is greater than the size of the buffer + Returns the size of the buffer in bytes. Raises `ValueError` + if `min_size` is greater than the size of the buffer """ cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) From 222170aaa39cd2e44062fbfe9cd73ba8499faf9d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 17:41:20 -0700 Subject: [PATCH 25/97] Make sure type `str`'s in `dict` are `intern`ed Lookups in Python `dict`s with `str` keys can be sped up if the keys are `intern`ed and the key accessed is `intern`ed. We use this to `intern` all of the keys of `itemsize_mapping` to speedup lookup times with this `dict`. This way if we encounter a type `str`, which is `intern`ed Python can speedup lookup time a bit. --- ucp/_libs/utils.pyx | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 6eee323c4..8f0bff298 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -24,35 +24,35 @@ cdef struct iface_data_t: cdef dict itemsize_mapping = { - "|b1": 1, - "|i1": 1, - "|u1": 1, - "i2": 2, - "u2": 2, - "i4": 4, - "u4": 4, - "i8": 8, - "u8": 8, - "f2": 2, - "f4": 4, - "f8": 8, - "f16": 16, - "c8": 8, - "c16": 16, - "c32": 32, + intern("|b1"): 1, + intern("|i1"): 1, + intern("|u1"): 1, + intern("i2"): 2, + intern("u2"): 2, + intern("i4"): 4, + intern("u4"): 4, + intern("i8"): 8, + intern("u8"): 8, + intern("f2"): 2, + intern("f4"): 4, + intern("f8"): 8, + intern("f16"): 16, + intern("c8"): 8, + intern("c16"): 16, + intern("c32"): 32, } cpdef Py_ssize_t get_itemsize(str fmt) except *: From 89bee3c7c82dcf86aec5dd56be8a34e2729a2a44 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 18:50:26 -0700 Subject: [PATCH 26/97] Fix spacing --- ucp/_libs/utils.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 8f0bff298..07a52714f 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -55,6 +55,7 @@ cdef dict itemsize_mapping = { intern(">c32"): 32, } + cpdef Py_ssize_t get_itemsize(str fmt) except *: """ Get the itemsize of the format provided. From 4d8499dc5c854d41f43464845f5e5e8fb0158129 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:26 -0700 Subject: [PATCH 27/97] Small grammar fix in comment --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 6eee323c4..7b9966454 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -129,7 +129,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ) itemsize = get_itemsize(iface["typestr"]) - # Making sure that the elements in shape is integers + # Make sure that the elements in shape are integers shape = iface["shape"] ndim = len(shape) nbytes = itemsize From 9031e42e85f1c96ca4a648978c5aa9d1bffb0431 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:27 -0700 Subject: [PATCH 28/97] Raise about `mask` attribute earlier --- ucp/_libs/utils.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 7b9966454..6859b1adf 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -127,6 +127,8 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, "https://ucx-py.readthedocs.io/en/latest/install.html for " "more information." ) + if iface.get("mask") is not None: + raise NotImplementedError("mask attribute not supported") itemsize = get_itemsize(iface["typestr"]) # Make sure that the elements in shape are integers @@ -147,8 +149,6 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, if s != strides[i]: raise ValueError("Array must be contiguous") s *= shape[i] - if iface.get("mask") is not None: - raise NotImplementedError("mask attribute not supported") else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) From d4703ad3e5b8e6b10401bc9c903c407c7b478066 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:29 -0700 Subject: [PATCH 29/97] Assign `strides` right after `shape` --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 6859b1adf..9234e9f10 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -133,12 +133,12 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, itemsize = get_itemsize(iface["typestr"]) # Make sure that the elements in shape are integers shape = iface["shape"] + strides = iface.get("strides") ndim = len(shape) nbytes = itemsize for i in range(ndim): nbytes *= shape[i] # Check that data is contiguous - strides = iface.get("strides") if strides is not None and ndim > 0: if len(strides) != ndim: raise ValueError( From 57997095a76e75c4df5467a3f7a29c5939ca7a22 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:30 -0700 Subject: [PATCH 30/97] Nest contiguous check under `ndim` check --- ucp/_libs/utils.pyx | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 9234e9f10..519cb71fc 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -139,16 +139,17 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, for i in range(ndim): nbytes *= shape[i] # Check that data is contiguous - if strides is not None and ndim > 0: - if len(strides) != ndim: - raise ValueError( - "The length of shape and strides must be equal" - ) - s = itemsize - for i from ndim > i >= 0 by 1: - if s != strides[i]: - raise ValueError("Array must be contiguous") - s *= shape[i] + if ndim > 0: + if strides is not None: + if len(strides) != ndim: + raise ValueError( + "The length of shape and strides must be equal" + ) + s = itemsize + for i from ndim > i >= 0 by 1: + if s != strides[i]: + raise ValueError("Array must be contiguous") + s *= shape[i] else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) From a30aacef2f1a56cfd3d585cca038eceb2680e978 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:32 -0700 Subject: [PATCH 31/97] Move comment about contiguous check --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 519cb71fc..8dc27f627 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -138,8 +138,8 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, nbytes = itemsize for i in range(ndim): nbytes *= shape[i] - # Check that data is contiguous if ndim > 0: + # Check that data is contiguous if strides is not None: if len(strides) != ndim: raise ValueError( From bcff99fc4ef4f23bbf2c69193a35971c0c73c068 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:34 -0700 Subject: [PATCH 32/97] Compute `nbytes` only for `ndim > 0` --- ucp/_libs/utils.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 8dc27f627..926d55cbd 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -136,9 +136,9 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, strides = iface.get("strides") ndim = len(shape) nbytes = itemsize - for i in range(ndim): - nbytes *= shape[i] if ndim > 0: + for i in range(ndim): + nbytes *= shape[i] # Check that data is contiguous if strides is not None: if len(strides) != ndim: From afcc20e6628b430332107ba655b5863984158c78 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:35 -0700 Subject: [PATCH 33/97] Move comment about `shape` containing integers --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 926d55cbd..048d1bca2 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -131,12 +131,12 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, raise NotImplementedError("mask attribute not supported") itemsize = get_itemsize(iface["typestr"]) - # Make sure that the elements in shape are integers shape = iface["shape"] strides = iface.get("strides") ndim = len(shape) nbytes = itemsize if ndim > 0: + # Make sure that the elements in shape are integers for i in range(ndim): nbytes *= shape[i] # Check that data is contiguous From 027e4b599ac323043fc3f2297edab7c20334ffe4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:37 -0700 Subject: [PATCH 34/97] Drop outdated comment about `shape` type checking --- ucp/_libs/utils.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 048d1bca2..1821db13b 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -136,7 +136,6 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim = len(shape) nbytes = itemsize if ndim > 0: - # Make sure that the elements in shape are integers for i in range(ndim): nbytes *= shape[i] # Check that data is contiguous From b905a8998f2b70710fc08294d2287b840a2cce8d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:38 -0700 Subject: [PATCH 35/97] Add comment about computing size --- ucp/_libs/utils.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 1821db13b..c805d44da 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -136,6 +136,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim = len(shape) nbytes = itemsize if ndim > 0: + # Compute size for i in range(ndim): nbytes *= shape[i] # Check that data is contiguous From 2b5760fbfa2dd44d8dc61197a8b4fe76d37ef37e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:40 -0700 Subject: [PATCH 36/97] Compute size after contiguity check As we will raise if the array is not contiguous, there is no point in computing its size before we have handled that. --- ucp/_libs/utils.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index c805d44da..eac2f7d35 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -136,9 +136,6 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim = len(shape) nbytes = itemsize if ndim > 0: - # Compute size - for i in range(ndim): - nbytes *= shape[i] # Check that data is contiguous if strides is not None: if len(strides) != ndim: @@ -150,6 +147,9 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, if s != strides[i]: raise ValueError("Array must be contiguous") s *= shape[i] + # Compute size + for i in range(ndim): + nbytes *= shape[i] else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) From a8aeb577ca503bd58bf6fa5a0a645c2b62cce17d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:42:41 -0700 Subject: [PATCH 37/97] Allocate dynamic arrays for `shape` and `strides` --- ucp/_libs/utils.pyx | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index eac2f7d35..1bf63c988 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -5,6 +5,7 @@ from cpython.buffer cimport PyBuffer_IsContiguous +from cpython.mem cimport PyMem_Malloc, PyMem_Free from cpython.memoryview cimport ( PyMemoryView_FromObject, PyMemoryView_GET_BUFFER, @@ -116,6 +117,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) cdef const Py_buffer* pybuf cdef tuple shape, strides + cdef Py_ssize_t *shape_p, *strides_p cdef Py_ssize_t i, s, itemsize, ndim, nbytes if iface is not None: if not cuda_support: @@ -136,20 +138,35 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim = len(shape) nbytes = itemsize if ndim > 0: - # Check that data is contiguous - if strides is not None: - if len(strides) != ndim: - raise ValueError( - "The length of shape and strides must be equal" + shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) + try: + # Make sure that the elements in shape are integers + for i in range(ndim): + shape_p[i] = shape[i] + # Check that data is contiguous + if strides is not None: + if len(strides) != ndim: + raise ValueError( + "The length of shape and strides must be equal" + ) + strides_p = PyMem_Malloc( + ndim * sizeof(Py_ssize_t) ) - s = itemsize - for i from ndim > i >= 0 by 1: - if s != strides[i]: - raise ValueError("Array must be contiguous") - s *= shape[i] - # Compute size - for i in range(ndim): - nbytes *= shape[i] + try: + for i in range(ndim): + strides_p[i] = strides[i] + s = itemsize + for i from ndim > i >= 0 by 1: + if s != strides_p[i]: + raise ValueError("Array must be contiguous") + s *= shape_p[i] + finally: + PyMem_Free(strides_p) + # Compute size + for i in range(ndim): + nbytes *= shape_p[i] + finally: + PyMem_Free(shape_p) else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) From 39c8f848d1efa457af6059d00db03471991e3325 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:48:50 -0700 Subject: [PATCH 38/97] Move contiguous comment --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 1bf63c988..797e1fbdf 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -143,7 +143,6 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, # Make sure that the elements in shape are integers for i in range(ndim): shape_p[i] = shape[i] - # Check that data is contiguous if strides is not None: if len(strides) != ndim: raise ValueError( @@ -155,6 +154,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, try: for i in range(ndim): strides_p[i] = strides[i] + # Check that data is contiguous s = itemsize for i from ndim > i >= 0 by 1: if s != strides_p[i]: From 1edb34c3a061d32a8d32bfbecf0538dd86239fc6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 19:49:05 -0700 Subject: [PATCH 39/97] Note we check strides contains integers too --- ucp/_libs/utils.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 797e1fbdf..a45a0ea8f 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -152,6 +152,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim * sizeof(Py_ssize_t) ) try: + # Make sure that the elements in strides are integers for i in range(ndim): strides_p[i] = strides[i] # Check that data is contiguous From 18cfed6589bfd2afd484360ccbbc0240b76b06ba Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 20:03:49 -0700 Subject: [PATCH 40/97] Use single memory allocation for shape and strides --- ucp/_libs/utils.pyx | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index a45a0ea8f..f24df5e90 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -138,31 +138,33 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, ndim = len(shape) nbytes = itemsize if ndim > 0: - shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) - try: - # Make sure that the elements in shape are integers - for i in range(ndim): - shape_p[i] = shape[i] - if strides is not None: - if len(strides) != ndim: - raise ValueError( - "The length of shape and strides must be equal" - ) - strides_p = PyMem_Malloc( - ndim * sizeof(Py_ssize_t) + if strides is not None: + if len(strides) != ndim: + raise ValueError( + "The length of shape and strides must be equal" ) - try: - # Make sure that the elements in strides are integers - for i in range(ndim): - strides_p[i] = strides[i] - # Check that data is contiguous - s = itemsize - for i from ndim > i >= 0 by 1: - if s != strides_p[i]: - raise ValueError("Array must be contiguous") - s *= shape_p[i] - finally: - PyMem_Free(strides_p) + shape_p = PyMem_Malloc( + 2 * ndim * sizeof(Py_ssize_t) + ) + strides_p = &shape_p[ndim] + else: + shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) + strides_p = NULL + try: + # Make sure that the elements are integers + if strides_p != NULL: + for i in range(ndim): + shape_p[i] = shape[i] + strides_p[i] = shape[i] + # Check that data is contiguous + s = itemsize + for i from ndim > i >= 0 by 1: + if s != strides_p[i]: + raise ValueError("Array must be contiguous") + s *= shape_p[i] + else: + for i in range(ndim): + shape_p[i] = shape[i] # Compute size for i in range(ndim): nbytes *= shape_p[i] From 25e3fb3b42e819149609ce56a1394f433227da4f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 20:32:41 -0700 Subject: [PATCH 41/97] Refactor out `nbytes` computation --- ucp/_libs/utils.pyx | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index f24df5e90..3e05391c9 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -104,6 +104,17 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: return data.ptr +@boundscheck(False) +@wraparound(False) +cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p) nogil: + cdef Py_ssize_t i, nbytes = itemsize + for i in range(ndim): + nbytes *= shape_p[i] + return nbytes + + @boundscheck(False) @wraparound(False) cpdef Py_ssize_t get_buffer_nbytes(buffer, @@ -166,16 +177,13 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, for i in range(ndim): shape_p[i] = shape[i] # Compute size - for i in range(ndim): - nbytes *= shape_p[i] + nbytes = _nbytes(itemsize, ndim, shape_p) finally: PyMem_Free(shape_p) else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) - nbytes = pybuf.itemsize - for i in range(pybuf.ndim): - nbytes *= pybuf.shape[i] + nbytes = _nbytes(pybuf.itemsize, pybuf.ndim, pybuf.shape) if not PyBuffer_IsContiguous(pybuf, b"C"): raise ValueError("buffer must be C-contiguous") From e8d742a1e5e389c5b5cc70ed3bc4a28a9ca68dc5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 20:44:54 -0700 Subject: [PATCH 42/97] Refactor out C-contiguous check --- ucp/_libs/utils.pyx | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 3e05391c9..9a364c45b 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -4,7 +4,6 @@ # cython: language_level=3 -from cpython.buffer cimport PyBuffer_IsContiguous from cpython.mem cimport PyMem_Malloc, PyMem_Free from cpython.memoryview cimport ( PyMemoryView_FromObject, @@ -104,6 +103,21 @@ cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: return data.ptr +@boundscheck(False) +@wraparound(False) +cdef inline bint _c_contiguous(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p, + Py_ssize_t* strides_p) nogil: + cdef Py_ssize_t i, s = itemsize + if strides_p != NULL: + for i from ndim > i >= 0 by 1: + if s != strides_p[i]: + return False + s *= shape_p[i] + return True + + @boundscheck(False) @wraparound(False) cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, @@ -130,6 +144,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, cdef tuple shape, strides cdef Py_ssize_t *shape_p, *strides_p cdef Py_ssize_t i, s, itemsize, ndim, nbytes + cdef bint c_contiguous if iface is not None: if not cuda_support: raise ValueError( @@ -168,11 +183,11 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, shape_p[i] = shape[i] strides_p[i] = shape[i] # Check that data is contiguous - s = itemsize - for i from ndim > i >= 0 by 1: - if s != strides_p[i]: - raise ValueError("Array must be contiguous") - s *= shape_p[i] + c_contiguous = _c_contiguous( + itemsize, ndim, shape_p, strides_p + ) + if not c_contiguous: + raise ValueError("Array must be contiguous") else: for i in range(ndim): shape_p[i] = shape[i] @@ -184,7 +199,10 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) nbytes = _nbytes(pybuf.itemsize, pybuf.ndim, pybuf.shape) - if not PyBuffer_IsContiguous(pybuf, b"C"): + c_contiguous = _c_contiguous( + pybuf.itemsize, pybuf.ndim, pybuf.shape, pybuf.strides + ) + if not c_contiguous: raise ValueError("buffer must be C-contiguous") if min_size > 0 and nbytes < min_size: From be05eb3d16a9497dd53511a43abfdd6d32ca577e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 20:58:53 -0700 Subject: [PATCH 43/97] Pull out C-contiguous check from array coercion --- ucp/_libs/utils.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 9a364c45b..62a024359 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -182,15 +182,15 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, for i in range(ndim): shape_p[i] = shape[i] strides_p[i] = shape[i] - # Check that data is contiguous - c_contiguous = _c_contiguous( - itemsize, ndim, shape_p, strides_p - ) - if not c_contiguous: - raise ValueError("Array must be contiguous") else: for i in range(ndim): shape_p[i] = shape[i] + # Check that data is contiguous + c_contiguous = _c_contiguous( + itemsize, ndim, shape_p, strides_p + ) + if not c_contiguous: + raise ValueError("Array must be contiguous") # Compute size nbytes = _nbytes(itemsize, ndim, shape_p) finally: From 77591700909ff06f86795742fae983112b2841d6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:00:27 -0700 Subject: [PATCH 44/97] Fix Cython warning --- ucp/_libs/utils.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 62a024359..cb1c14535 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -142,7 +142,8 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) cdef const Py_buffer* pybuf cdef tuple shape, strides - cdef Py_ssize_t *shape_p, *strides_p + cdef Py_ssize_t *shape_p + cdef Py_ssize_t *strides_p cdef Py_ssize_t i, s, itemsize, ndim, nbytes cdef bint c_contiguous if iface is not None: From d6f8fb013c27a569815ec71aa961e12ad6554ab0 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:14:51 -0700 Subject: [PATCH 45/97] Align non-C-contiguous error messages --- ucp/_libs/utils.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index cb1c14535..54c983a76 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -191,7 +191,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, itemsize, ndim, shape_p, strides_p ) if not c_contiguous: - raise ValueError("Array must be contiguous") + raise ValueError("Array must be C-contiguous") # Compute size nbytes = _nbytes(itemsize, ndim, shape_p) finally: @@ -204,7 +204,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, pybuf.itemsize, pybuf.ndim, pybuf.shape, pybuf.strides ) if not c_contiguous: - raise ValueError("buffer must be C-contiguous") + raise ValueError("Array must be C-contiguous") if min_size > 0 and nbytes < min_size: raise ValueError("the nbytes is greater than the size of the buffer!") From fb1b5af8767d3f93c7626d28d9950ffbb9b5bad3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:15:54 -0700 Subject: [PATCH 46/97] Run `isort` --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 54c983a76..80aea51e6 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -4,7 +4,7 @@ # cython: language_level=3 -from cpython.mem cimport PyMem_Malloc, PyMem_Free +from cpython.mem cimport PyMem_Free, PyMem_Malloc from cpython.memoryview cimport ( PyMemoryView_FromObject, PyMemoryView_GET_BUFFER, From 9f9d4ac9526f12c3e300b9ab85ce3cf144a42797 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:23:20 -0700 Subject: [PATCH 47/97] Group `itemsize` and `nbytes` assignments --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 80aea51e6..a68bc761e 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -159,10 +159,10 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, if iface.get("mask") is not None: raise NotImplementedError("mask attribute not supported") - itemsize = get_itemsize(iface["typestr"]) shape = iface["shape"] strides = iface.get("strides") ndim = len(shape) + itemsize = get_itemsize(iface["typestr"]) nbytes = itemsize if ndim > 0: if strides is not None: From ed1c2c4c51b957600cef653375c4277d364ed060 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:39:07 -0700 Subject: [PATCH 48/97] Strip comments from self-documenting code --- ucp/_libs/utils.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index a68bc761e..c2e0f1e0b 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -178,7 +178,6 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) strides_p = NULL try: - # Make sure that the elements are integers if strides_p != NULL: for i in range(ndim): shape_p[i] = shape[i] @@ -186,13 +185,11 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, else: for i in range(ndim): shape_p[i] = shape[i] - # Check that data is contiguous c_contiguous = _c_contiguous( itemsize, ndim, shape_p, strides_p ) if not c_contiguous: raise ValueError("Array must be C-contiguous") - # Compute size nbytes = _nbytes(itemsize, ndim, shape_p) finally: PyMem_Free(shape_p) From af9e790e7b9b53f5e67ec06ea96c3037449c614a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:42:17 -0700 Subject: [PATCH 49/97] Raise `MemoryError` if allocation fails --- ucp/_libs/utils.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index c2e0f1e0b..56627e2f0 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -177,6 +177,10 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, else: shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) strides_p = NULL + if shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape & strides" + ) try: if strides_p != NULL: for i in range(ndim): From 83c3ad85533903be4800267e36d817f90ef249ea Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:44:08 -0700 Subject: [PATCH 50/97] Use `strides_p` with `strides` --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index 56627e2f0..c8d3f555a 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -185,7 +185,7 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, if strides_p != NULL: for i in range(ndim): shape_p[i] = shape[i] - strides_p[i] = shape[i] + strides_p[i] = strides[i] else: for i in range(ndim): shape_p[i] = shape[i] From 6dd778e4ade3739816bdc0ef443b31f37f97f2a1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:50:28 -0700 Subject: [PATCH 51/97] Assign `s` only when it is used --- ucp/_libs/utils.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index c8d3f555a..a0e99778c 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -109,8 +109,9 @@ cdef inline bint _c_contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, Py_ssize_t* shape_p, Py_ssize_t* strides_p) nogil: - cdef Py_ssize_t i, s = itemsize + cdef Py_ssize_t i, s if strides_p != NULL: + s = itemsize for i from ndim > i >= 0 by 1: if s != strides_p[i]: return False From e0468e5657f1a5983f677b414df6333754969cd5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 21:54:24 -0700 Subject: [PATCH 52/97] Compute `nbytes` after `c_contiguous` in CPU case There's no point in computing `nbytes` if we are going to error first. In the case where we don't raise an error, the order doesn't matter. So prefer raising the error first (should it come up). --- ucp/_libs/utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx index a0e99778c..2edd36002 100644 --- a/ucp/_libs/utils.pyx +++ b/ucp/_libs/utils.pyx @@ -201,12 +201,12 @@ cpdef Py_ssize_t get_buffer_nbytes(buffer, else: mview = PyMemoryView_FromObject(buffer) pybuf = PyMemoryView_GET_BUFFER(mview) - nbytes = _nbytes(pybuf.itemsize, pybuf.ndim, pybuf.shape) c_contiguous = _c_contiguous( pybuf.itemsize, pybuf.ndim, pybuf.shape, pybuf.strides ) if not c_contiguous: raise ValueError("Array must be C-contiguous") + nbytes = _nbytes(pybuf.itemsize, pybuf.ndim, pybuf.shape) if min_size > 0 and nbytes < min_size: raise ValueError("the nbytes is greater than the size of the buffer!") From 63f716eb823c525864dc99967afaea8d2e4dbca9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 9 Sep 2020 22:15:06 -0700 Subject: [PATCH 53/97] Test some empty arrays --- tests/test_libs_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_libs_utils.py b/tests/test_libs_utils.py index 30dde1e98..01eff8089 100644 --- a/tests/test_libs_utils.py +++ b/tests/test_libs_utils.py @@ -9,9 +9,12 @@ from ucp._libs.utils import get_buffer_data, get_buffer_nbytes builtin_buffers = [ + b"", b"abcd", + array.array("i", []), array.array("i", [0, 1, 2, 3]), array.array("I", [0, 1, 2, 3]), + array.array("f", []), array.array("f", [0, 1, 2, 3]), array.array("d", [0, 1, 2, 3]), memoryview(array.array("B", [0, 1, 2, 3, 4, 5])).cast("B", (3, 2)), From 6a546faac5d42b3c2b8e7ce0de3bd225d376f86d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 10 Sep 2020 16:08:21 -0700 Subject: [PATCH 54/97] Type `msg_context` as `str` --- ucp/_libs/ucx_api.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index f67f74c82..a9815004f 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -50,7 +50,7 @@ cdef int _ucx_py_request_counter = 0 logger = logging.getLogger("ucx") -cdef assert_ucs_status(ucs_status_t status, msg_context=None): +cdef assert_ucs_status(ucs_status_t status, str msg_context=None): if status != UCS_OK: msg = "[%s] " % msg_context if msg_context is not None else "" msg += ucs_status_string(status).decode("utf-8") From 4c671142f46576b81f3559bfcbde956f91d86d57 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 10 Sep 2020 16:08:23 -0700 Subject: [PATCH 55/97] Type `msg` as `str` --- ucp/_libs/ucx_api.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index a9815004f..da9db89e8 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -51,6 +51,7 @@ logger = logging.getLogger("ucx") cdef assert_ucs_status(ucs_status_t status, str msg_context=None): + cdef str msg if status != UCS_OK: msg = "[%s] " % msg_context if msg_context is not None else "" msg += ucs_status_string(status).decode("utf-8") From ac1c36d60a42e7ca5bd0f5aaf7eaaab0643d0bb7 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 10 Sep 2020 16:08:24 -0700 Subject: [PATCH 56/97] Assign UCX status message to variable --- ucp/_libs/ucx_api.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index da9db89e8..7e92013c9 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -51,10 +51,11 @@ logger = logging.getLogger("ucx") cdef assert_ucs_status(ucs_status_t status, str msg_context=None): - cdef str msg + cdef str msg, ucs_status if status != UCS_OK: + ucs_status = ucs_status_string(status).decode("utf-8") msg = "[%s] " % msg_context if msg_context is not None else "" - msg += ucs_status_string(status).decode("utf-8") + msg += ucs_status raise UCXError(msg) From ba7e41a3b2e1038a7b7734fde9d14c877fc1134a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 10 Sep 2020 16:08:25 -0700 Subject: [PATCH 57/97] Construct `msg` in one operation Cython would turn this into a series of Python C API calls on `str`s. This turns it into a single operation. --- ucp/_libs/ucx_api.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 7e92013c9..a54b7f20b 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -54,8 +54,10 @@ cdef assert_ucs_status(ucs_status_t status, str msg_context=None): cdef str msg, ucs_status if status != UCS_OK: ucs_status = ucs_status_string(status).decode("utf-8") - msg = "[%s] " % msg_context if msg_context is not None else "" - msg += ucs_status + if msg_context is not None: + msg = "[%s] %s" % (msg_context, ucs_status) + else: + msg = ucs_status raise UCXError(msg) From 0fc440ead6abd8aaa2d4fd274e458ec7cddbc9e8 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 10 Sep 2020 16:19:52 -0700 Subject: [PATCH 58/97] Use f-string to create `msg` --- ucp/_libs/ucx_api.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index a54b7f20b..1bb90ad76 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -55,7 +55,7 @@ cdef assert_ucs_status(ucs_status_t status, str msg_context=None): if status != UCS_OK: ucs_status = ucs_status_string(status).decode("utf-8") if msg_context is not None: - msg = "[%s] %s" % (msg_context, ucs_status) + msg = f"[{msg_context}] {ucs_status}" else: msg = ucs_status raise UCXError(msg) From e326a79cd9f04496905676a61bdb69f965ddee60 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 14 Sep 2020 23:58:57 -0700 Subject: [PATCH 59/97] Add `Array` to hold array-like objects The `Array` object helps validate that a Python object meets our requirements of being array-like. Additionally it extracts metadata about the array-like object and stores it in an efficent C layout. This speeds up later queries about the array object and allows for efficient C-style access in lower-level UCX-Py code (without needing to pass through Python first). --- setup.py | 8 ++ ucp/_libs/arr.pxd | 26 +++++ ucp/_libs/arr.pyx | 287 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 ucp/_libs/arr.pxd create mode 100644 ucp/_libs/arr.pyx diff --git a/setup.py b/setup.py index 1ed4f2e65..2800daabf 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,14 @@ libraries=libraries, extra_compile_args=extra_compile_args, ), + Extension( + "ucp._libs.arr", + sources=["ucp/_libs/arr.pyx"], + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=libraries, + extra_compile_args=extra_compile_args, + ), Extension( "ucp._libs.utils", sources=["ucp/_libs/utils.pyx"], diff --git a/ucp/_libs/arr.pxd b/ucp/_libs/arr.pxd new file mode 100644 index 000000000..3d54e9452 --- /dev/null +++ b/ucp/_libs/arr.pxd @@ -0,0 +1,26 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# cython: language_level=3 + + +from libc.stdint cimport uintptr_t + + +cdef class Array: + cdef readonly uintptr_t ptr + cdef readonly bint readonly + cdef readonly object obj + + cdef readonly Py_ssize_t itemsize + + cdef readonly Py_ssize_t ndim + cdef Py_ssize_t* shape_p + cdef Py_ssize_t* strides_p + + cdef readonly bint cuda + + cpdef bint _c_contiguous(self) + cpdef bint _f_contiguous(self) + cpdef bint _contiguous(self) + cpdef Py_ssize_t _nbytes(self) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx new file mode 100644 index 000000000..0cc062f68 --- /dev/null +++ b/ucp/_libs/arr.pyx @@ -0,0 +1,287 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# cython: language_level=3 + + +from cpython.buffer cimport PyBuffer_IsContiguous +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.memoryview cimport ( + PyMemoryView_FromObject, + PyMemoryView_GET_BUFFER, +) +from cpython.ref cimport Py_INCREF +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM +from cython cimport auto_pickle, boundscheck, wraparound +from libc.stdint cimport uintptr_t +from libc.string cimport memcpy + +try: + from numpy import dtype as numpy_dtype +except ImportError: + numpy_dtype = None + + +cdef dict itemsize_mapping = { + intern("|b1"): 1, + intern("|i1"): 1, + intern("|u1"): 1, + intern("i2"): 2, + intern("u2"): 2, + intern("i4"): 4, + intern("u4"): 4, + intern("i8"): 8, + intern("u8"): 8, + intern("f2"): 2, + intern("f4"): 4, + intern("f8"): 8, + intern("f16"): 16, + intern("c8"): 8, + intern("c16"): 16, + intern("c32"): 32, +} + + +@auto_pickle(False) +cdef class Array: + def __cinit__(self, obj): + + cdef dict iface = getattr(obj, "__cuda_array_interface__", None) + self.cuda = (iface is not None) + + cdef const Py_buffer* pybuf + cdef str typestr + cdef tuple data, shape, strides + cdef Py_ssize_t *shape_p + cdef Py_ssize_t *strides_p + cdef Py_ssize_t i + if self.cuda: + if iface.get("mask") is not None: + raise NotImplementedError("mask attribute not supported") + + self.obj = obj + + data = iface["data"] + self.ptr, self.readonly = data + + typestr = iface["typestr"] + if typestr is None: + raise ValueError("Expected `str`, but got `None`") + elif typestr == "": + raise ValueError("Got unexpected empty `str`") + else: + try: + self.itemsize = itemsize_mapping[typestr] + except KeyError: + if numpy_dtype is not None: + self.itemsize = numpy_dtype(typestr).itemsize + else: + raise ValueError( + f"Unexpected data type, '{typestr}'." + " Please install NumPy to handle this format." + ) + + shape = iface["shape"] + strides = iface.get("strides") + self.ndim = len(shape) + if self.ndim > 0: + if strides is not None: + if len(strides) != self.ndim: + raise ValueError( + "The length of shape and strides must be equal" + ) + self.shape_p = PyMem_Malloc( + 2 * self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = self.shape_p + self.ndim + else: + self.shape_p = PyMem_Malloc( + self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = NULL + + if self.shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape & strides" + ) + + if self.strides_p != NULL: + for i in range(self.ndim): + self.shape_p[i] = shape[i] + self.strides_p[i] = strides[i] + else: + for i in range(self.ndim): + self.shape_p[i] = shape[i] + else: + mv = PyMemoryView_FromObject(obj) + pybuf = PyMemoryView_GET_BUFFER(mv) + + if pybuf.suboffsets != NULL: + raise NotImplementedError("Suboffsets are not supported") + + self.ptr = pybuf.buf + self.obj = pybuf.obj + self.readonly = pybuf.readonly + self.ndim = pybuf.ndim + self.itemsize = pybuf.itemsize + + if not PyBuffer_IsContiguous(pybuf, b"C"): + self.shape_p = PyMem_Malloc( + 2 * self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = self.shape_p + self.ndim + else: + self.shape_p = PyMem_Malloc( + self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = NULL + + if self.shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape & strides" + ) + + memcpy(self.shape_p, pybuf.shape, self.ndim * sizeof(Py_ssize_t)) + if self.strides_p != NULL: + memcpy( + self.strides_p, + pybuf.strides, + self.ndim * sizeof(Py_ssize_t) + ) + + def __dealloc__(self): + PyMem_Free(self.shape_p) + self.shape_p = NULL + self.strides_p = NULL + + cpdef bint _c_contiguous(self): + return _c_contiguous( + self.itemsize, self.ndim, self.shape_p, self.strides_p + ) + + @property + def c_contiguous(self): + return self._c_contiguous() + + cpdef bint _f_contiguous(self): + return _f_contiguous( + self.itemsize, self.ndim, self.shape_p, self.strides_p + ) + + @property + def f_contiguous(self): + return self._f_contiguous() + + cpdef bint _contiguous(self): + return _contiguous( + self.itemsize, self.ndim, self.shape_p, self.strides_p + ) + + @property + def contiguous(self): + return self._contiguous() + + cpdef Py_ssize_t _nbytes(self): + return _nbytes(self.itemsize, self.ndim, self.shape_p) + + @property + def nbytes(self): + return self._nbytes() + + @property + def shape(self): + cdef tuple shape = PyTuple_New(self.ndim) + cdef Py_ssize_t i + cdef object o + for i in range(self.ndim): + o = self.shape_p[i] + Py_INCREF(o) + PyTuple_SET_ITEM(shape, i, o) + return shape + + @property + def strides(self): + cdef tuple strides = PyTuple_New(self.ndim) + cdef Py_ssize_t i, s + cdef object o + if self.strides_p != NULL: + for i from self.ndim > i >= 0 by 1: + o = self.strides_p[i] + Py_INCREF(o) + PyTuple_SET_ITEM(strides, i, o) + else: + s = self.itemsize + for i from self.ndim > i >= 0 by 1: + o = s + Py_INCREF(o) + PyTuple_SET_ITEM(strides, i, o) + s *= self.shape_p[i] + return strides + + +@boundscheck(False) +@wraparound(False) +cdef inline bint _c_contiguous(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p, + Py_ssize_t* strides_p) nogil: + cdef Py_ssize_t i, s + if strides_p != NULL: + s = itemsize + for i from ndim > i >= 0 by 1: + if s != strides_p[i]: + return False + s *= shape_p[i] + return True + + +@boundscheck(False) +@wraparound(False) +cdef inline bint _f_contiguous(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p, + Py_ssize_t* strides_p) nogil: + cdef Py_ssize_t i, s + if strides_p != NULL: + s = itemsize + for i from 0 <= i < ndim by 1: + if s != strides_p[i]: + return False + s *= shape_p[i] + elif ndim > 1: + return False + return True + + +cdef inline bint _contiguous(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p, + Py_ssize_t* strides_p) nogil: + cdef bint r = _c_contiguous(itemsize, ndim, shape_p, strides_p) + if not r: + r = _f_contiguous(itemsize, ndim, shape_p, strides_p) + return r + + +@boundscheck(False) +@wraparound(False) +cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, + Py_ssize_t ndim, + Py_ssize_t* shape_p) nogil: + cdef Py_ssize_t i, nbytes = itemsize + for i in range(ndim): + nbytes *= shape_p[i] + return nbytes From 7d5e19701b288363d2215ecd896d7a6bf1b6c91b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 14 Sep 2020 23:58:59 -0700 Subject: [PATCH 60/97] Add tests for `Array` --- tests/test_libs_arr.py | 178 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 tests/test_libs_arr.py diff --git a/tests/test_libs_arr.py b/tests/test_libs_arr.py new file mode 100644 index 000000000..2f749dea3 --- /dev/null +++ b/tests/test_libs_arr.py @@ -0,0 +1,178 @@ +import array +import functools +import io +import mmap +import operator + +import pytest + +from ucp._libs.arr import Array + +builtin_buffers = [ + b"", + b"abcd", + array.array("i", []), + array.array("i", [0, 1, 2, 3]), + array.array("I", [0, 1, 2, 3]), + array.array("f", []), + array.array("f", [0, 1, 2, 3]), + array.array("d", [0, 1, 2, 3]), + memoryview(array.array("B", [0, 1, 2, 3, 4, 5])).cast("B", (3, 2)), + memoryview(b"abcd"), + memoryview(bytearray(b"abcd")), + io.BytesIO(b"abcd").getbuffer(), + mmap.mmap(-1, 5), +] + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_ptr_builtins(buffer): + arr = Array(buffer) + assert arr.ptr != 0 + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_readonly_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.readonly == mv.readonly + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_obj_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.obj is mv.obj + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_itemsize_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.itemsize == mv.itemsize + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_ndim_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.ndim == mv.ndim + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_shape_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.shape == mv.shape + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_strides_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.strides == mv.strides + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_nbytes_builtins(buffer): + arr = Array(buffer) + mv = memoryview(buffer) + assert arr.nbytes == mv.nbytes + + +@pytest.mark.parametrize("buffer", builtin_buffers) +def test_Array_contiguous_builtins(buffer): + mv = memoryview(buffer) + arr = Array(buffer) + assert arr.c_contiguous == mv.c_contiguous + assert arr.f_contiguous == mv.f_contiguous + assert arr.contiguous == mv.contiguous + + mv2 = memoryview(buffer)[::2] + if mv2: + arr2 = Array(mv2) + assert arr2.c_contiguous == mv2.c_contiguous + assert arr2.f_contiguous == mv2.f_contiguous + assert arr2.contiguous == mv2.contiguous + + +array_params = [ + ((2, 3), "i4", (12, 4)), + ((2, 3), "u4", (12, 4)), + ((2, 3), "f4", (12, 4)), + ((2, 3), "f8", (24, 8)), + ((2, 3), "f8", (8, 16)), +] + + +def create_array(xp, shape, dtype, strides): + if xp == "cupy": + iface_prop = "__cuda_array_interface__" + elif xp == "numpy": + iface_prop = "__array_interface__" + + xp = pytest.importorskip(xp) + + nelem = functools.reduce(operator.mul, shape, 1) + data = xp.arange(nelem, dtype=dtype) + arr = xp.ndarray(shape, dtype, data.data, strides=strides) + iface = getattr(arr, iface_prop) + + return xp, arr, iface + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_ptr(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + assert arr2.ptr == iface["data"][0] + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_is_cuda(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + is_cuda = xp.__name__ == "cupy" + assert arr2.cuda == is_cuda + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_nbytes(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + assert arr2.nbytes == arr.nbytes + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_shape(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + assert arr2.shape == arr.shape + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_strides(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + assert arr2.strides == arr.strides + + +@pytest.mark.parametrize("xp", ["cupy", "numpy"]) +@pytest.mark.parametrize("shape, dtype, strides", array_params) +def test_Array_ndarray_contiguous(xp, shape, dtype, strides): + xp, arr, iface = create_array(xp, shape, dtype, strides) + arr2 = Array(arr) + + assert arr2.c_contiguous == arr.flags.c_contiguous + assert arr2.f_contiguous == arr.flags.f_contiguous + assert arr2.contiguous == (arr.flags.c_contiguous or arr.flags.f_contiguous) From a52f0a7ca496127e8f449f64fb69f8397102dbda Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 14 Sep 2020 23:59:00 -0700 Subject: [PATCH 61/97] Wrap send/recv objects with `Array` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extracts metadata into an efficient C format with fast Python bindings. Also `Array` avoids repeated calls to `getattr`, which take close to `1µs` a piece, by performing this call once. In the Cython layer this can be used to efficiently extract pointers and compute relevant properties. --- ucp/_libs/ucx_api.pyx | 60 ++++++++++++---------------------- ucp/comm.py | 18 ++++++++--- ucp/core.py | 75 ++++++++++++++++++++++++++++++------------- 3 files changed, 86 insertions(+), 67 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index f67f74c82..51d0334d6 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -15,8 +15,8 @@ from libc.stdio cimport FILE, fclose, fflush from libc.stdlib cimport free from libc.string cimport memset +from .arr cimport Array from .ucx_api_dep cimport * -from .utils cimport get_buffer_data from ..exceptions import ( UCXCanceled, @@ -674,7 +674,7 @@ cdef void _send_callback(void *request, ucs_status_t status): def tag_send_nb( UCXEndpoint ep, - buffer, + Array buffer, size_t nbytes, ucp_tag_t tag, cb_func, @@ -703,12 +703,8 @@ def tag_send_nb( ---------- ep: UCXEndpoint The destination endpoint - buffer: object - The buffer object, which must support one of the following protocols and - is checked in order: - 1) Numba's CUDA Array Interface: `__cuda_array_interface__` - 2) Numpy's Array Interface: `__array_interface__` - 3) Python buffer protocol: `memoryview()` + buffer: Array + An ``Array`` wrapping a user-provided array-like object nbytes: int Size of the buffer to use. Must be equal or less than the size of buffer tag: int @@ -723,11 +719,10 @@ def tag_send_nb( name: str, optional Descriptive name of the operation """ - cdef void *data = get_buffer_data(buffer, check_writable=False) cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_tag_send_nb( ep._handle, - data, + buffer.ptr, nbytes, ucp_dt_make_contig(1), tag, @@ -777,7 +772,7 @@ cdef void _tag_recv_callback( def tag_recv_nb( UCXWorker worker, - buffer, + Array buffer, size_t nbytes, ucp_tag_t tag, cb_func, @@ -807,12 +802,8 @@ def tag_recv_nb( ---------- worker: UCXWorker The worker that is used for the receive operation - buffer: object - The buffer object, which must support one of the following protocols and - is checked in order: - 1) Numba's CUDA Array Interface: `__cuda_array_interface__` - 2) Numpy's Array Interface: `__array_interface__` - 3) Python buffer protocol: `memoryview()` + buffer: Array + An ``Array`` wrapping a user-provided array-like object nbytes: int Size of the buffer to use. Must be equal or less than the size of buffer tag: int @@ -834,14 +825,14 @@ def tag_recv_nb( guarantee that the message is cancelled when `ep` closes as opposed to when the `worker` closes. """ - - cdef void *data = get_buffer_data(buffer, check_writable=True) + if buffer.readonly: + raise ValueError("writing to readonly buffer!") cdef ucp_tag_recv_callback_t _tag_recv_cb = ( _tag_recv_callback ) cdef ucs_status_ptr_t status = ucp_tag_recv_nb( worker._handle, - data, + buffer.ptr, nbytes, ucp_dt_make_contig(1), tag, @@ -856,7 +847,7 @@ def tag_recv_nb( def stream_send_nb( UCXEndpoint ep, - buffer, + Array buffer, size_t nbytes, cb_func, cb_args=tuple(), @@ -883,12 +874,8 @@ def stream_send_nb( ---------- ep: UCXEndpoint The destination endpoint - buffer: object - The buffer object, which must support one of the following protocols and - is checked in order: - 1) Numba's CUDA Array Interface: `__cuda_array_interface__` - 2) Numpy's Array Interface: `__array_interface__` - 3) Python buffer protocol: `memoryview()` + buffer: Array + An ``Array`` wrapping a user-provided array-like object nbytes: int Size of the buffer to use. Must be equal or less than the size of buffer cb_func: callable @@ -901,11 +888,10 @@ def stream_send_nb( name: str, optional Descriptive name of the operation """ - cdef void *data = get_buffer_data(buffer, check_writable=False) cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_stream_send_nb( ep._handle, - data, + buffer.ptr, nbytes, ucp_dt_make_contig(1), _send_cb, @@ -955,7 +941,7 @@ cdef void _stream_recv_callback( def stream_recv_nb( UCXEndpoint ep, - buffer, + Array buffer, size_t nbytes, cb_func, cb_args=tuple(), @@ -976,12 +962,8 @@ def stream_recv_nb( ---------- ep: UCXEndpoint The destination endpoint - buffer: object - The buffer object, which must support one of the following protocols and - is checked in order: - 1) Numba's CUDA Array Interface: `__cuda_array_interface__` - 2) Numpy's Array Interface: `__array_interface__` - 3) Python buffer protocol: `memoryview()` + buffer: Array + An ``Array`` wrapping a user-provided array-like object nbytes: int Size of the buffer to use. Must be equal or less than the size of buffer cb_func: callable @@ -994,15 +976,15 @@ def stream_recv_nb( name: str, optional Descriptive name of the operation """ - - cdef void *data = get_buffer_data(buffer, check_writable=True) + if buffer.readonly: + raise ValueError("writing to readonly buffer!") cdef size_t length cdef ucp_stream_recv_callback_t _stream_recv_cb = ( _stream_recv_callback ) cdef ucs_status_ptr_t status = ucp_stream_recv_nb( ep._handle, - data, + buffer.ptr, nbytes, ucp_dt_make_contig(1), _stream_recv_cb, diff --git a/ucp/comm.py b/ucp/comm.py index cb60235ee..9035c9c5c 100644 --- a/ucp/comm.py +++ b/ucp/comm.py @@ -3,7 +3,7 @@ import asyncio -from ._libs import ucx_api +from ._libs import arr, ucx_api def _cb_func(request, exception, event_loop, future): @@ -33,7 +33,7 @@ def _call_ucx_api(event_loop, func, *args, **kwargs): def tag_send( ep: ucx_api.UCXEndpoint, - buffer, + buffer: arr.Array, nbytes: int, tag: int, name="tag_send", @@ -46,7 +46,11 @@ def tag_send( def stream_send( - ep: ucx_api.UCXEndpoint, buffer, nbytes: int, name="stream_send", event_loop=None + ep: ucx_api.UCXEndpoint, + buffer: arr.Array, + nbytes: int, + name="stream_send", + event_loop=None, ) -> asyncio.Future: return _call_ucx_api( @@ -56,7 +60,7 @@ def stream_send( def tag_recv( ep: ucx_api.UCXEndpoint, - buffer, + buffer: arr.Array, nbytes: int, tag: int, name="tag_recv", @@ -76,7 +80,11 @@ def tag_recv( def stream_recv( - ep: ucx_api.UCXEndpoint, buffer, nbytes: int, name="stream_recv", event_loop=None + ep: ucx_api.UCXEndpoint, + buffer: arr.Array, + nbytes: int, + name="stream_recv", + event_loop=None, ) -> asyncio.Future: return _call_ucx_api( diff --git a/ucp/core.py b/ucp/core.py index ae6a90bdc..83d295f64 100644 --- a/ucp/core.py +++ b/ucp/core.py @@ -16,7 +16,7 @@ from . import comm from ._libs import ucx_api -from ._libs.utils import get_buffer_nbytes +from ._libs.arr import Array from .continuous_ucx_progress import BlockingMode, NonBlockingMode from .exceptions import UCXCanceled, UCXCloseError, UCXError from .utils import hash64bits, nvtx_annotate @@ -52,15 +52,17 @@ async def exchange_peer_info( hash64bits(msg_tag, ctrl_tag, guarantee_msg_order, port), ) peer_info = bytearray(len(my_info)) + my_info_arr = Array(my_info) + peer_info_arr = Array(peer_info) # Send/recv peer information. Notice, we force an `await` between the two # streaming calls (see ) if listener is True: - await comm.stream_send(endpoint, my_info, len(my_info)) - await comm.stream_recv(endpoint, peer_info, len(peer_info)) + await comm.stream_send(endpoint, my_info_arr, my_info_arr.nbytes) + await comm.stream_recv(endpoint, peer_info_arr, peer_info_arr.nbytes) else: - await comm.stream_recv(endpoint, peer_info, len(peer_info)) - await comm.stream_send(endpoint, my_info, len(my_info)) + await comm.stream_recv(endpoint, peer_info_arr, peer_info_arr.nbytes) + await comm.stream_send(endpoint, my_info_arr, my_info_arr.nbytes) # Unpacking and sanity check of the peer information ret = {} @@ -132,8 +134,9 @@ def setup_ctrl_recv(ep): """Help function to setup the receive of the control message""" log = "[Recv shutdown] ep: %s, tag: %s" % (hex(ep.uid), hex(ep._ctrl_tag_recv),) msg = bytearray(CtrlMsg.nbytes) + msg_arr = Array(msg) shutdown_fut = comm.tag_recv( - ep._ep, msg, len(msg), ep._ctrl_tag_recv, name=log, + ep._ep, msg_arr, msg_arr.nbytes, ep._ctrl_tag_recv, name=log, ) shutdown_fut.add_done_callback( @@ -523,6 +526,7 @@ async def close(self): # Send a shutdown message to the peer msg = CtrlMsg.serialize(opcode=1, close_after_n_recv=self._send_count) + msg_arr = Array(msg) log = "[Send shutdown] ep: %s, tag: %s, close_after_n_recv: %d" % ( hex(self.uid), hex(self._ctrl_tag_send), @@ -531,7 +535,7 @@ async def close(self): logger.debug(log) try: await comm.tag_send( - self._ep, msg, len(msg), self._ctrl_tag_send, name=log, + self._ep, msg_arr, msg_arr.nbytes, self._ctrl_tag_send, name=log, ) # The peer might already be shutting down thus we can ignore any send errors except UCXError as e: @@ -562,15 +566,28 @@ async def send(self, buffer, nbytes=-1, tag=None): """ if self.closed(): raise UCXCloseError("Endpoint closed") - nbytes = get_buffer_nbytes( - buffer, min_size=nbytes, cuda_support=self._cuda_support - ) + if not isinstance(buffer, Array): + buffer = Array(buffer) + if not self._cuda_support and buffer.cuda: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer.c_contiguous: + raise ValueError("Array must be C-contiguous") + buffer_nbytes = buffer.nbytes + if nbytes > 0 and buffer_nbytes < nbytes: + raise ValueError("the nbytes is greater than the size of the buffer!") log = "[Send #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._send_count, hex(self.uid), hex(self._msg_tag_send), - nbytes, - type(buffer), + buffer_nbytes, + type(buffer.obj), ) logger.debug(log) self._send_count += 1 @@ -580,7 +597,7 @@ async def send(self, buffer, nbytes=-1, tag=None): tag = hash64bits(self._msg_tag_send, hash(tag)) if self._guarantee_msg_order: tag += self._send_count - return await comm.tag_send(self._ep, buffer, nbytes, tag, name=log) + return await comm.tag_send(self._ep, buffer, buffer_nbytes, tag, name=log) @nvtx_annotate("UCXPY_RECV", color="red", domain="ucxpy") async def recv(self, buffer, nbytes=-1, tag=None): @@ -600,15 +617,28 @@ async def recv(self, buffer, nbytes=-1, tag=None): """ if self.closed(): raise UCXCloseError("Endpoint closed") - nbytes = get_buffer_nbytes( - buffer, min_size=nbytes, cuda_support=self._cuda_support - ) + if not isinstance(buffer, Array): + buffer = Array(buffer) + if not self._cuda_support and buffer.cuda: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer.c_contiguous: + raise ValueError("Array must be C-contiguous") + buffer_nbytes = buffer.nbytes + if nbytes > 0 and buffer_nbytes < nbytes: + raise ValueError("the nbytes is greater than the size of the buffer!") log = "[Recv #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._recv_count, hex(self.uid), hex(self._msg_tag_recv), - nbytes, - type(buffer), + buffer_nbytes, + type(buffer.obj), ) logger.debug(log) self._recv_count += 1 @@ -618,7 +648,7 @@ async def recv(self, buffer, nbytes=-1, tag=None): tag = hash64bits(self._msg_tag_recv, hash(tag)) if self._guarantee_msg_order: tag += self._recv_count - ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log) + ret = await comm.tag_recv(self._ep, buffer, buffer_nbytes, tag, name=log) self._finished_recv_count += 1 if ( self._close_after_n_recv is not None @@ -692,10 +722,9 @@ async def send_obj(self, obj, tag=None): ------- >>> await ep.send_obj(pickle.dumps([1,2,3])) """ - - nbytes = array.array( - "Q", [get_buffer_nbytes(buffer=obj, cuda_support=self._cuda_support)] - ) + if not isinstance(obj, Array): + obj = Array(obj) + nbytes = Array(array.array("Q", [obj.nbytes])) await self.send(nbytes, tag=tag) await self.send(obj, tag=tag) From d05c678481d9a77c9d0305416dc5f86cd15d640f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 14 Sep 2020 23:59:01 -0700 Subject: [PATCH 62/97] Drop internal Cython `utils` As this has now been absorbed into `Array`, we can drop these utility functions. --- setup.py | 8 -- tests/test_libs_utils.py | 106 ------------------- ucp/_libs/utils.pxd | 14 --- ucp/_libs/utils.pyx | 215 --------------------------------------- 4 files changed, 343 deletions(-) delete mode 100644 tests/test_libs_utils.py delete mode 100644 ucp/_libs/utils.pxd delete mode 100644 ucp/_libs/utils.pyx diff --git a/setup.py b/setup.py index 2800daabf..9a4e21a80 100644 --- a/setup.py +++ b/setup.py @@ -46,14 +46,6 @@ libraries=libraries, extra_compile_args=extra_compile_args, ), - Extension( - "ucp._libs.utils", - sources=["ucp/_libs/utils.pyx"], - include_dirs=include_dirs, - library_dirs=library_dirs, - libraries=libraries, - extra_compile_args=extra_compile_args, - ), Extension( "ucp._libs.topological_distance", sources=[ diff --git a/tests/test_libs_utils.py b/tests/test_libs_utils.py deleted file mode 100644 index 01eff8089..000000000 --- a/tests/test_libs_utils.py +++ /dev/null @@ -1,106 +0,0 @@ -import array -import functools -import io -import mmap -import operator - -import pytest - -from ucp._libs.utils import get_buffer_data, get_buffer_nbytes - -builtin_buffers = [ - b"", - b"abcd", - array.array("i", []), - array.array("i", [0, 1, 2, 3]), - array.array("I", [0, 1, 2, 3]), - array.array("f", []), - array.array("f", [0, 1, 2, 3]), - array.array("d", [0, 1, 2, 3]), - memoryview(array.array("B", [0, 1, 2, 3, 4, 5])).cast("B", (3, 2)), - memoryview(b"abcd"), - memoryview(bytearray(b"abcd")), - io.BytesIO(b"abcd").getbuffer(), - mmap.mmap(-1, 5), -] - - -@pytest.mark.parametrize("buffer", builtin_buffers) -def test_get_buffer_data_builtins(buffer): - check_writable = False - ptr = get_buffer_data(buffer, check_writable=check_writable) - assert ptr != 0 - - check_writable = True - readonly = memoryview(buffer).readonly - if readonly: - with pytest.raises(ValueError): - get_buffer_data(buffer, check_writable=check_writable) - else: - get_buffer_data(buffer, check_writable=check_writable) - - -@pytest.mark.parametrize("buffer", builtin_buffers) -def test_get_buffer_nbytes_builtins(buffer): - nbytes = memoryview(buffer).nbytes - result = get_buffer_nbytes(buffer, cuda_support=True) - assert result == nbytes - - with pytest.raises(ValueError): - get_buffer_nbytes(memoryview(buffer)[::2], cuda_support=True) - - # Test exceptional cases with `min_size` - get_buffer_nbytes(buffer, min_size=nbytes, cuda_support=True) - with pytest.raises(ValueError): - get_buffer_nbytes(buffer, min_size=(nbytes + 1), cuda_support=True) - - -array_params = [ - ((2, 3), "i4", (12, 4)), - ((2, 3), "u4", (12, 4)), - ((2, 3), "f4", (12, 4)), - ((2, 3), "f8", (24, 8)), - ((2, 3), "f8", (8, 16)), -] - - -def create_array(xp, shape, dtype, strides): - if xp == "cupy": - iface_prop = "__cuda_array_interface__" - elif xp == "numpy": - iface_prop = "__array_interface__" - - xp = pytest.importorskip(xp) - - nelem = functools.reduce(operator.mul, shape, 1) - data = xp.arange(nelem, dtype=dtype) - arr = xp.ndarray(shape, dtype, data.data, strides=strides) - iface = getattr(arr, iface_prop) - - return xp, arr, iface - - -@pytest.mark.parametrize("xp", ["cupy", "numpy"]) -@pytest.mark.parametrize("shape, dtype, strides", array_params) -def test_get_buffer_data_array(xp, shape, dtype, strides): - xp, arr, iface = create_array(xp, shape, dtype, strides) - - ptr = get_buffer_data(arr, check_writable=False) - assert ptr == iface["data"][0] - - -@pytest.mark.parametrize("xp", ["cupy", "numpy"]) -@pytest.mark.parametrize("shape, dtype, strides", array_params) -def test_get_buffer_nbytes_array(xp, shape, dtype, strides): - xp, arr, iface = create_array(xp, shape, dtype, strides) - - if xp.__name__ == "cupy": - with pytest.raises(ValueError): - get_buffer_nbytes(arr, cuda_support=False) - - if arr.flags.c_contiguous: - nbytes = get_buffer_nbytes(arr, cuda_support=True) - assert nbytes == arr.nbytes - else: - with pytest.raises(ValueError): - get_buffer_nbytes(arr, cuda_support=True) diff --git a/ucp/_libs/utils.pxd b/ucp/_libs/utils.pxd deleted file mode 100644 index 9e3e61e39..000000000 --- a/ucp/_libs/utils.pxd +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -# cython: language_level=3 - - -from libc.stdint cimport uintptr_t - - -cpdef Py_ssize_t get_itemsize(str fmt) except * -cpdef uintptr_t get_buffer_data(buffer, bint check_writable=*) except * -cpdef Py_ssize_t get_buffer_nbytes(buffer, - Py_ssize_t min_size=*, - bint cuda_support=*) except * diff --git a/ucp/_libs/utils.pyx b/ucp/_libs/utils.pyx deleted file mode 100644 index 1e838dd2a..000000000 --- a/ucp/_libs/utils.pyx +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -# cython: language_level=3 - - -from cpython.mem cimport PyMem_Free, PyMem_Malloc -from cpython.memoryview cimport ( - PyMemoryView_FromObject, - PyMemoryView_GET_BUFFER, -) -from cython cimport boundscheck, wraparound -from libc.stdint cimport uintptr_t - -try: - from numpy import dtype as numpy_dtype -except ImportError: - numpy_dtype = None - - -cdef struct iface_data_t: - uintptr_t ptr - bint readonly - - -cdef dict itemsize_mapping = { - intern("|b1"): 1, - intern("|i1"): 1, - intern("|u1"): 1, - intern("i2"): 2, - intern("u2"): 2, - intern("i4"): 4, - intern("u4"): 4, - intern("i8"): 8, - intern("u8"): 8, - intern("f2"): 2, - intern("f4"): 4, - intern("f8"): 8, - intern("f16"): 16, - intern("c8"): 8, - intern("c16"): 16, - intern("c32"): 32, -} - - -cpdef Py_ssize_t get_itemsize(str fmt) except *: - """ - Get the itemsize of the format provided. - """ - if fmt is None: - raise ValueError("Expected `str`, but got `None`") - elif fmt == "": - raise ValueError("Got unexpected empty `str`") - else: - itemsize = itemsize_mapping.get(fmt) - if itemsize is None: - if numpy_dtype is not None: - itemsize = numpy_dtype(fmt).itemsize - else: - raise ValueError( - f"Unexpected `fmt`, {fmt}." - " Please install NumPy to handle this format." - ) - return itemsize - - -cpdef uintptr_t get_buffer_data(buffer, bint check_writable=False) except *: - """ - Returns data pointer of the buffer. Raising ValueError if the buffer - is read only and check_writable=True is set. - """ - - cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) - - cdef const Py_buffer* pybuf - cdef iface_data_t data - if iface is not None: - data.ptr, data.readonly = iface["data"] - else: - mview = PyMemoryView_FromObject(buffer) - pybuf = PyMemoryView_GET_BUFFER(mview) - data.ptr = pybuf.buf - data.readonly = pybuf.readonly - - if data.ptr == 0: - raise NotImplementedError("zero-sized buffers isn't supported") - - if check_writable and data.readonly: - raise ValueError("writing to readonly buffer!") - - return data.ptr - - -@boundscheck(False) -@wraparound(False) -cdef inline bint _c_contiguous(Py_ssize_t itemsize, - Py_ssize_t ndim, - Py_ssize_t* shape_p, - Py_ssize_t* strides_p) nogil: - cdef Py_ssize_t i, s - if strides_p != NULL: - s = itemsize - for i from ndim > i >= 0 by 1: - if s != strides_p[i]: - return False - s *= shape_p[i] - return True - - -@boundscheck(False) -@wraparound(False) -cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, - Py_ssize_t ndim, - Py_ssize_t* shape_p) nogil: - cdef Py_ssize_t i, nbytes = itemsize - for i in range(ndim): - nbytes *= shape_p[i] - return nbytes - - -@boundscheck(False) -@wraparound(False) -cpdef Py_ssize_t get_buffer_nbytes(buffer, - Py_ssize_t min_size=-1, - bint cuda_support=False) except *: - """ - Returns the size of the buffer in bytes. Raises `ValueError` - if `min_size` is greater than the size of the buffer - """ - - cdef dict iface = getattr(buffer, "__cuda_array_interface__", None) - cdef const Py_buffer* pybuf - cdef tuple shape, strides - cdef Py_ssize_t *shape_p - cdef Py_ssize_t *strides_p - cdef Py_ssize_t i, s, itemsize, ndim, nbytes - cdef bint c_contiguous - if iface is not None: - if not cuda_support: - raise ValueError( - "UCX is not configured with CUDA support, please add " - "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" - "variable and that the ucx-proc=*=gpu package is " - "installed. See " - "https://ucx-py.readthedocs.io/en/latest/install.html for " - "more information." - ) - if iface.get("mask") is not None: - raise NotImplementedError("mask attribute not supported") - - shape = iface["shape"] - strides = iface.get("strides") - ndim = len(shape) - itemsize = get_itemsize(iface["typestr"]) - nbytes = itemsize - if ndim > 0: - if strides is not None: - if len(strides) != ndim: - raise ValueError( - "The length of shape and strides must be equal" - ) - shape_p = PyMem_Malloc( - 2 * ndim * sizeof(Py_ssize_t) - ) - strides_p = &shape_p[ndim] - else: - shape_p = PyMem_Malloc(ndim * sizeof(Py_ssize_t)) - strides_p = NULL - if shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape & strides" - ) - try: - if strides_p != NULL: - for i in range(ndim): - shape_p[i] = shape[i] - strides_p[i] = strides[i] - else: - for i in range(ndim): - shape_p[i] = shape[i] - c_contiguous = _c_contiguous( - itemsize, ndim, shape_p, strides_p - ) - if not c_contiguous: - raise ValueError("Array must be C-contiguous") - nbytes = _nbytes(itemsize, ndim, shape_p) - finally: - PyMem_Free(shape_p) - else: - mview = PyMemoryView_FromObject(buffer) - pybuf = PyMemoryView_GET_BUFFER(mview) - c_contiguous = _c_contiguous( - pybuf.itemsize, pybuf.ndim, pybuf.shape, pybuf.strides - ) - if not c_contiguous: - raise ValueError("Array must be C-contiguous") - nbytes = _nbytes(pybuf.itemsize, pybuf.ndim, pybuf.shape) - - if min_size > 0 and nbytes < min_size: - raise ValueError("the nbytes is greater than the size of the buffer!") - - return nbytes From db00a98124b31d26b69cd5593b8013d109056ba9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 14:46:49 -0700 Subject: [PATCH 63/97] Don't allocate `shape` & `strides` if `ndim == 0` --- ucp/_libs/arr.pyx | 49 +++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index 0cc062f68..fc6b3e5bc 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -125,6 +125,9 @@ cdef class Array: else: for i in range(self.ndim): self.shape_p[i] = shape[i] + else: + self.shape_p = NULL + self.strides_p = NULL else: mv = PyMemoryView_FromObject(obj) pybuf = PyMemoryView_GET_BUFFER(mv) @@ -138,30 +141,34 @@ cdef class Array: self.ndim = pybuf.ndim self.itemsize = pybuf.itemsize - if not PyBuffer_IsContiguous(pybuf, b"C"): - self.shape_p = PyMem_Malloc( - 2 * self.ndim * sizeof(Py_ssize_t) - ) - self.strides_p = self.shape_p + self.ndim + if self.ndim > 0: + if not PyBuffer_IsContiguous(pybuf, b"C"): + self.shape_p = PyMem_Malloc( + 2 * self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = self.shape_p + self.ndim + else: + self.shape_p = PyMem_Malloc( + self.ndim * sizeof(Py_ssize_t) + ) + self.strides_p = NULL + + if self.shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape & strides" + ) + + memcpy(self.shape_p, pybuf.shape, self.ndim * sizeof(Py_ssize_t)) + if self.strides_p != NULL: + memcpy( + self.strides_p, + pybuf.strides, + self.ndim * sizeof(Py_ssize_t) + ) else: - self.shape_p = PyMem_Malloc( - self.ndim * sizeof(Py_ssize_t) - ) + self.shape_p = NULL self.strides_p = NULL - if self.shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape & strides" - ) - - memcpy(self.shape_p, pybuf.shape, self.ndim * sizeof(Py_ssize_t)) - if self.strides_p != NULL: - memcpy( - self.strides_p, - pybuf.strides, - self.ndim * sizeof(Py_ssize_t) - ) - def __dealloc__(self): PyMem_Free(self.shape_p) self.shape_p = NULL From 0402bacb789663a4d7d169b23a3b1917f2cd240e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 14:46:51 -0700 Subject: [PATCH 64/97] Disable some checks for `shape` & `strides` --- ucp/_libs/arr.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index fc6b3e5bc..d3cf36bb1 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -209,6 +209,8 @@ cdef class Array: return self._nbytes() @property + @boundscheck(False) + @wraparound(False) def shape(self): cdef tuple shape = PyTuple_New(self.ndim) cdef Py_ssize_t i @@ -220,6 +222,8 @@ cdef class Array: return shape @property + @boundscheck(False) + @wraparound(False) def strides(self): cdef tuple strides = PyTuple_New(self.ndim) cdef Py_ssize_t i, s From 64c672c8089454228173777704cc7c63fd1a9dac Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 14:46:53 -0700 Subject: [PATCH 65/97] Raise `MemoryError` immediately after alloc fails --- ucp/_libs/arr.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index d3cf36bb1..7314d51eb 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -106,23 +106,23 @@ cdef class Array: self.shape_p = PyMem_Malloc( 2 * self.ndim * sizeof(Py_ssize_t) ) + if self.shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape & strides" + ) self.strides_p = self.shape_p + self.ndim + for i in range(self.ndim): + self.shape_p[i] = shape[i] + self.strides_p[i] = strides[i] else: self.shape_p = PyMem_Malloc( self.ndim * sizeof(Py_ssize_t) ) + if self.shape_p == NULL: + raise MemoryError( + "Unable to allocate memory for shape" + ) self.strides_p = NULL - - if self.shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape & strides" - ) - - if self.strides_p != NULL: - for i in range(self.ndim): - self.shape_p[i] = shape[i] - self.strides_p[i] = strides[i] - else: for i in range(self.ndim): self.shape_p[i] = shape[i] else: From 8cc9d6379346f719b62a186dae90ff0bf9ed1a20 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 16:47:57 -0700 Subject: [PATCH 66/97] Drop unused `shape_p` and `strides_p` --- ucp/_libs/arr.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index 7314d51eb..a33a8a0f3 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -65,8 +65,6 @@ cdef class Array: cdef const Py_buffer* pybuf cdef str typestr cdef tuple data, shape, strides - cdef Py_ssize_t *shape_p - cdef Py_ssize_t *strides_p cdef Py_ssize_t i if self.cuda: if iface.get("mask") is not None: From 094d6c38fedddb9cee04ebbe5a20f47ec02dd440 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 16:48:30 -0700 Subject: [PATCH 67/97] Add a docstring for the `Array` class --- ucp/_libs/arr.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index a33a8a0f3..b6f808bc7 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -57,6 +57,10 @@ cdef dict itemsize_mapping = { @auto_pickle(False) cdef class Array: + """ + An efficient wrapper for host and device array-like objects + """ + def __cinit__(self, obj): cdef dict iface = getattr(obj, "__cuda_array_interface__", None) From 52df6c0a9f2dbb142100dcbbde3d296225f1de97 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 16:48:31 -0700 Subject: [PATCH 68/97] Store `shape` and `strides` in `memoryview`s --- ucp/_libs/arr.pxd | 4 +- ucp/_libs/arr.pyx | 144 +++++++++++++++++++++++----------------------- 2 files changed, 74 insertions(+), 74 deletions(-) diff --git a/ucp/_libs/arr.pxd b/ucp/_libs/arr.pxd index 3d54e9452..f05e9b979 100644 --- a/ucp/_libs/arr.pxd +++ b/ucp/_libs/arr.pxd @@ -15,8 +15,8 @@ cdef class Array: cdef readonly Py_ssize_t itemsize cdef readonly Py_ssize_t ndim - cdef Py_ssize_t* shape_p - cdef Py_ssize_t* strides_p + cdef Py_ssize_t[::1] shape_mv + cdef Py_ssize_t[::1] strides_mv cdef readonly bint cuda diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index b6f808bc7..f705f4053 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -4,15 +4,22 @@ # cython: language_level=3 +from cpython.array cimport array, newarrayobject from cpython.buffer cimport PyBuffer_IsContiguous -from cpython.mem cimport PyMem_Free, PyMem_Malloc from cpython.memoryview cimport ( PyMemoryView_FromObject, PyMemoryView_GET_BUFFER, ) +from cpython.object cimport PyObject from cpython.ref cimport Py_INCREF from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM -from cython cimport auto_pickle, boundscheck, wraparound +from cython cimport ( + auto_pickle, + boundscheck, + initializedcheck, + nonecheck, + wraparound, +) from libc.stdint cimport uintptr_t from libc.string cimport memcpy @@ -55,6 +62,15 @@ cdef dict itemsize_mapping = { } +cdef array array_Py_ssize_t = array("q") + + +cdef inline Py_ssize_t[::1] new_Py_ssize_t_array(Py_ssize_t n): + return newarrayobject( + (array_Py_ssize_t).ob_type, n, array_Py_ssize_t.ob_descr + ) + + @auto_pickle(False) cdef class Array: """ @@ -100,36 +116,23 @@ cdef class Array: strides = iface.get("strides") self.ndim = len(shape) if self.ndim > 0: + self.shape_mv = new_Py_ssize_t_array(self.ndim) if strides is not None: if len(strides) != self.ndim: raise ValueError( "The length of shape and strides must be equal" ) - self.shape_p = PyMem_Malloc( - 2 * self.ndim * sizeof(Py_ssize_t) - ) - if self.shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape & strides" - ) - self.strides_p = self.shape_p + self.ndim + self.strides_mv = new_Py_ssize_t_array(self.ndim) for i in range(self.ndim): - self.shape_p[i] = shape[i] - self.strides_p[i] = strides[i] + self.shape_mv[i] = shape[i] + self.strides_mv[i] = strides[i] else: - self.shape_p = PyMem_Malloc( - self.ndim * sizeof(Py_ssize_t) - ) - if self.shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape" - ) - self.strides_p = NULL + self.strides_mv = None for i in range(self.ndim): - self.shape_p[i] = shape[i] + self.shape_mv[i] = shape[i] else: - self.shape_p = NULL - self.strides_p = NULL + self.shape_mv = None + self.strides_mv = None else: mv = PyMemoryView_FromObject(obj) pybuf = PyMemoryView_GET_BUFFER(mv) @@ -144,41 +147,28 @@ cdef class Array: self.itemsize = pybuf.itemsize if self.ndim > 0: + self.shape_mv = new_Py_ssize_t_array(self.ndim) + memcpy( + &self.shape_mv[0], + pybuf.shape, + self.ndim * sizeof(Py_ssize_t) + ) if not PyBuffer_IsContiguous(pybuf, b"C"): - self.shape_p = PyMem_Malloc( - 2 * self.ndim * sizeof(Py_ssize_t) - ) - self.strides_p = self.shape_p + self.ndim - else: - self.shape_p = PyMem_Malloc( - self.ndim * sizeof(Py_ssize_t) - ) - self.strides_p = NULL - - if self.shape_p == NULL: - raise MemoryError( - "Unable to allocate memory for shape & strides" - ) - - memcpy(self.shape_p, pybuf.shape, self.ndim * sizeof(Py_ssize_t)) - if self.strides_p != NULL: + self.strides_mv = new_Py_ssize_t_array(self.ndim) memcpy( - self.strides_p, + &self.strides_mv[0], pybuf.strides, self.ndim * sizeof(Py_ssize_t) ) + else: + self.strides_mv = None else: - self.shape_p = NULL - self.strides_p = NULL - - def __dealloc__(self): - PyMem_Free(self.shape_p) - self.shape_p = NULL - self.strides_p = NULL + self.shape_mv = None + self.strides_mv = None cpdef bint _c_contiguous(self): return _c_contiguous( - self.itemsize, self.ndim, self.shape_p, self.strides_p + self.itemsize, self.ndim, self.shape_mv, self.strides_mv ) @property @@ -187,7 +177,7 @@ cdef class Array: cpdef bint _f_contiguous(self): return _f_contiguous( - self.itemsize, self.ndim, self.shape_p, self.strides_p + self.itemsize, self.ndim, self.shape_mv, self.strides_mv ) @property @@ -196,7 +186,7 @@ cdef class Array: cpdef bint _contiguous(self): return _contiguous( - self.itemsize, self.ndim, self.shape_p, self.strides_p + self.itemsize, self.ndim, self.shape_mv, self.strides_mv ) @property @@ -204,7 +194,7 @@ cdef class Array: return self._contiguous() cpdef Py_ssize_t _nbytes(self): - return _nbytes(self.itemsize, self.ndim, self.shape_p) + return _nbytes(self.itemsize, self.ndim, self.shape_mv) @property def nbytes(self): @@ -212,27 +202,31 @@ cdef class Array: @property @boundscheck(False) + @initializedcheck(False) + @nonecheck(False) @wraparound(False) def shape(self): cdef tuple shape = PyTuple_New(self.ndim) cdef Py_ssize_t i cdef object o for i in range(self.ndim): - o = self.shape_p[i] + o = self.shape_mv[i] Py_INCREF(o) PyTuple_SET_ITEM(shape, i, o) return shape @property @boundscheck(False) + @initializedcheck(False) + @nonecheck(False) @wraparound(False) def strides(self): cdef tuple strides = PyTuple_New(self.ndim) cdef Py_ssize_t i, s cdef object o - if self.strides_p != NULL: + if self.strides_mv is not None: for i from self.ndim > i >= 0 by 1: - o = self.strides_p[i] + o = self.strides_mv[i] Py_INCREF(o) PyTuple_SET_ITEM(strides, i, o) else: @@ -241,39 +235,43 @@ cdef class Array: o = s Py_INCREF(o) PyTuple_SET_ITEM(strides, i, o) - s *= self.shape_p[i] + s *= self.shape_mv[i] return strides @boundscheck(False) +@initializedcheck(False) +@nonecheck(False) @wraparound(False) cdef inline bint _c_contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, - Py_ssize_t* shape_p, - Py_ssize_t* strides_p) nogil: + Py_ssize_t[::1] shape_mv, + Py_ssize_t[::1] strides_mv) nogil: cdef Py_ssize_t i, s - if strides_p != NULL: + if strides_mv is not None: s = itemsize for i from ndim > i >= 0 by 1: - if s != strides_p[i]: + if s != strides_mv[i]: return False - s *= shape_p[i] + s *= shape_mv[i] return True @boundscheck(False) +@initializedcheck(False) +@nonecheck(False) @wraparound(False) cdef inline bint _f_contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, - Py_ssize_t* shape_p, - Py_ssize_t* strides_p) nogil: + Py_ssize_t[::1] shape_mv, + Py_ssize_t[::1] strides_mv) nogil: cdef Py_ssize_t i, s - if strides_p != NULL: + if strides_mv is not None: s = itemsize for i from 0 <= i < ndim by 1: - if s != strides_p[i]: + if s != strides_mv[i]: return False - s *= shape_p[i] + s *= shape_mv[i] elif ndim > 1: return False return True @@ -281,20 +279,22 @@ cdef inline bint _f_contiguous(Py_ssize_t itemsize, cdef inline bint _contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, - Py_ssize_t* shape_p, - Py_ssize_t* strides_p) nogil: - cdef bint r = _c_contiguous(itemsize, ndim, shape_p, strides_p) + Py_ssize_t[::1] shape_mv, + Py_ssize_t[::1] strides_mv) nogil: + cdef bint r = _c_contiguous(itemsize, ndim, shape_mv, strides_mv) if not r: - r = _f_contiguous(itemsize, ndim, shape_p, strides_p) + r = _f_contiguous(itemsize, ndim, shape_mv, strides_mv) return r @boundscheck(False) +@initializedcheck(False) +@nonecheck(False) @wraparound(False) cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, Py_ssize_t ndim, - Py_ssize_t* shape_p) nogil: + Py_ssize_t[::1] shape_mv) nogil: cdef Py_ssize_t i, nbytes = itemsize for i in range(ndim): - nbytes *= shape_p[i] + nbytes *= shape_mv[i] return nbytes From 1904329352e6a1291b010d1166aece795eefd56b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 16:48:32 -0700 Subject: [PATCH 69/97] Factor out filling of `self.shape_mv` --- ucp/_libs/arr.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ucp/_libs/arr.pyx b/ucp/_libs/arr.pyx index f705f4053..c24ba74a9 100644 --- a/ucp/_libs/arr.pyx +++ b/ucp/_libs/arr.pyx @@ -117,6 +117,8 @@ cdef class Array: self.ndim = len(shape) if self.ndim > 0: self.shape_mv = new_Py_ssize_t_array(self.ndim) + for i in range(self.ndim): + self.shape_mv[i] = shape[i] if strides is not None: if len(strides) != self.ndim: raise ValueError( @@ -124,12 +126,9 @@ cdef class Array: ) self.strides_mv = new_Py_ssize_t_array(self.ndim) for i in range(self.ndim): - self.shape_mv[i] = shape[i] self.strides_mv[i] = strides[i] else: self.strides_mv = None - for i in range(self.ndim): - self.shape_mv[i] = shape[i] else: self.shape_mv = None self.strides_mv = None From 36ba416b1c03d050ca9851f1cd25cd98de0f4263 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:10:56 -0700 Subject: [PATCH 70/97] Type `k` and `v` as `str`s in `_read_ucx_config` --- ucp/_libs/ucx_api.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 1bb90ad76..e2344d934 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -76,6 +76,7 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: ) # Modify the UCX configuration options based on `config_dict` + cdef str k, v for k, v in user_options.items(): status = ucp_config_modify(config, k.encode(), v.encode()) if status == UCS_ERR_NO_ELEM: From 5d6ba6b50fbddee241c82299e126281aea12a009 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:15:32 -0700 Subject: [PATCH 71/97] Store UCX status as a `str` --- ucp/_libs/ucx_api.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index e2344d934..47cdab254 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -68,11 +68,13 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: """ cdef ucp_config_t *config cdef ucs_status_t status + cdef str status_msg status = ucp_config_read(NULL, NULL, &config) if status != UCS_OK: + status_msg = ucs_status_string(status).decode("utf-8") raise UCXConfigError( "Couldn't read the UCX options: %s" % - ucs_status_string(status).decode("utf-8") + status_msg ) # Modify the UCX configuration options based on `config_dict` @@ -82,8 +84,9 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: if status == UCS_ERR_NO_ELEM: raise UCXConfigError("Option %s doesn't exist" % k) elif status != UCS_OK: + status_msg = ucs_status_string(status).decode("utf-8") msg = "Couldn't set option %s to %s: %s" % \ - (k, v, ucs_status_string(status).decode("utf-8")) + (k, v, status_msg) raise UCXConfigError(msg) return config From ea2f25e6ace91d93c55d19c73730c7d3b06f3b1a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:16:03 -0700 Subject: [PATCH 72/97] Unwrap lines --- ucp/_libs/ucx_api.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 47cdab254..dfa94dde8 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -73,8 +73,7 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: if status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") raise UCXConfigError( - "Couldn't read the UCX options: %s" % - status_msg + "Couldn't read the UCX options: %s" % status_msg ) # Modify the UCX configuration options based on `config_dict` @@ -85,8 +84,7 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: raise UCXConfigError("Option %s doesn't exist" % k) elif status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") - msg = "Couldn't set option %s to %s: %s" % \ - (k, v, status_msg) + msg = "Couldn't set option %s to %s: %s" % (k, v, status_msg) raise UCXConfigError(msg) return config From 85dec6a55c99b4fe6c64167d63b2ab6d4fd33dbf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:17:22 -0700 Subject: [PATCH 73/97] Use f-strings for the error messages --- ucp/_libs/ucx_api.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index dfa94dde8..c318905de 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -73,7 +73,7 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: if status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") raise UCXConfigError( - "Couldn't read the UCX options: %s" % status_msg + f"Couldn't read the UCX options: {status_msg}" ) # Modify the UCX configuration options based on `config_dict` @@ -81,10 +81,10 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: for k, v in user_options.items(): status = ucp_config_modify(config, k.encode(), v.encode()) if status == UCS_ERR_NO_ELEM: - raise UCXConfigError("Option %s doesn't exist" % k) + raise UCXConfigError(f"Option {k} doesn't exist") elif status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") - msg = "Couldn't set option %s to %s: %s" % (k, v, status_msg) + msg = f"Couldn't set option {k} to {v}: {status_msg}" raise UCXConfigError(msg) return config From 83651ee6a366b087791a080df75ac48db80c57a2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:18:19 -0700 Subject: [PATCH 74/97] Inline error message --- ucp/_libs/ucx_api.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index c318905de..67bcc4b67 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -84,8 +84,9 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: raise UCXConfigError(f"Option {k} doesn't exist") elif status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") - msg = f"Couldn't set option {k} to {v}: {status_msg}" - raise UCXConfigError(msg) + raise UCXConfigError( + f"Couldn't set option {k} to {v}: {status_msg}" + ) return config From a6b1924942ca86cf6a42732921cf3473a0956013 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 16 Sep 2020 18:19:27 -0700 Subject: [PATCH 75/97] Unwrap shortened line --- ucp/_libs/ucx_api.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 67bcc4b67..195aee683 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -72,9 +72,7 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: status = ucp_config_read(NULL, NULL, &config) if status != UCS_OK: status_msg = ucs_status_string(status).decode("utf-8") - raise UCXConfigError( - f"Couldn't read the UCX options: {status_msg}" - ) + raise UCXConfigError(f"Couldn't read the UCX options: {status_msg}") # Modify the UCX configuration options based on `config_dict` cdef str k, v From 21d6d5cca9fb6b826a87d46cb231cc5a9c43ff5f Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 21 Sep 2020 09:47:41 -0700 Subject: [PATCH 76/97] Type more variables in Cython (#606) * Type variables iterated over with `_config` * Clarify log message with f-string * Store `encoded` `str`s in `bytes` typed variables * Cast `bytes` to `const char*` for C function * Type `ret` as `dict` to match function return type * Type `status_str` as `str` * Type `msg` as `str` * Type iterated variable `t` * Type arguments to `_get_error_callback` * Assign transports to a `list` typed variable * Transform `any` into a `for`-loop This should get rid of some extra unneeded content that Cython creates for generator expressions. Also we can delay construction of the `list` or doing any of this work until the simple `bint` check is cleared. * Type `endpoint_error_handling` as a `bint` * Type `msg`s as `str` and `req`s as `UCXRequest`s --- ucp/_libs/ucx_api.pyx | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index e64053c40..fb5afa8f1 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -76,8 +76,11 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: # Modify the UCX configuration options based on `config_dict` cdef str k, v + cdef bytes kb, vb for k, v in user_options.items(): - status = ucp_config_modify(config, k.encode(), v.encode()) + kb = k.encode() + vb = v.encode() + status = ucp_config_modify(config, kb, vb) if status == UCS_ERR_NO_ELEM: raise UCXConfigError(f"Option {k} doesn't exist") elif status != UCS_OK: @@ -117,7 +120,7 @@ def get_current_options(): if UCX were to be initialized now. """ cdef ucp_config_t *config = _read_ucx_config({}) - ret = ucx_config_to_dict(config) + cdef dict ret = ucx_config_to_dict(config) ucp_config_release(config) return ret @@ -230,8 +233,9 @@ cdef class UCXContext(UCXObject): ucp_config_release(config) logger.info("UCP initiated using config: ") + cdef str k, v for k, v in self._config.items(): - logger.info(" %s: %s" % (k, v)) + logger.info(f" {k}: {v}") def get_config(self): return self._config @@ -243,8 +247,8 @@ cdef class UCXContext(UCXObject): cdef void _ib_err_cb(void *arg, ucp_ep_h ep, ucs_status_t status): - status_str = ucs_status_string(status).decode("utf-8") - msg = ( + cdef str status_str = ucs_status_string(status).decode("utf-8") + cdef str msg = ( "Endpoint %s failed with status %d: %s" % ( hex(int(ep)), status, status_str ) @@ -252,10 +256,17 @@ cdef void _ib_err_cb(void *arg, ucp_ep_h ep, ucs_status_t status): logger.error(msg) -cdef ucp_err_handler_cb_t _get_error_callback(tls, endpoint_error_handling): +cdef ucp_err_handler_cb_t _get_error_callback(str tls, + bint endpoint_error_handling): cdef ucp_err_handler_cb_t err_cb = NULL - if endpoint_error_handling and any(t in tls for t in ["dc", "ib", "rc"]): - err_cb = _ib_err_cb + cdef str t + cdef list transports + if endpoint_error_handling: + transports = ["dc", "ib", "rc"] + for t in transports: + if t in tls: + err_cb = _ib_err_cb + break return err_cb @@ -354,7 +365,7 @@ cdef class UCXWorker(UCXObject): # which will handle the request cleanup. ucp_request_cancel(self._handle, req._handle) - def ep_create(self, str ip_address, port, endpoint_error_handling): + def ep_create(self, str ip_address, port, bint endpoint_error_handling): assert self.initialized cdef ucp_ep_params_t params ip_address = socket.gethostbyname(ip_address) @@ -373,7 +384,7 @@ cdef class UCXWorker(UCXObject): return UCXEndpoint(self, ucp_ep) def ep_create_from_conn_request( - self, uintptr_t conn_request, endpoint_error_handling + self, uintptr_t conn_request, bint endpoint_error_handling ): assert self.initialized @@ -405,6 +416,7 @@ def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, inflight_msgs): # Close the endpoint # TODO: Support UCP_EP_CLOSE_MODE_FORCE + cdef str msg status = ucp_ep_close_nb(handle, UCP_EP_CLOSE_MODE_FLUSH) if UCS_PTR_IS_PTR(status): ucp_request_free(status) @@ -617,11 +629,11 @@ cdef _handle_status( ): if UCS_PTR_STATUS(status) == UCS_OK: return - msg = "<%s>: " % name + cdef str msg = "<%s>: " % name if UCS_PTR_IS_ERR(status): msg += ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") raise UCXError(msg) - req = UCXRequest( status) + cdef UCXRequest req = UCXRequest( status) if req.info["status"] == "finished": try: # The callback function has already handle the request @@ -648,6 +660,8 @@ cdef _handle_status( cdef void _send_callback(void *request, ucs_status_t status): + cdef UCXRequest req + cdef str msg with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" @@ -741,6 +755,8 @@ def tag_send_nb( cdef void _tag_recv_callback( void *request, ucs_status_t status, ucp_tag_recv_info_t *info ): + cdef UCXRequest req + cdef str msg with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" @@ -910,6 +926,8 @@ def stream_send_nb( cdef void _stream_recv_callback( void *request, ucs_status_t status, size_t length ): + cdef UCXRequest req + cdef str msg with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" From 0b5f098ee1396043f8011f533da7d2f5b68d99dd Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 21 Sep 2020 16:55:50 -0700 Subject: [PATCH 77/97] Cleanup `ucx_config_to_dict` (#609) * Drop parentheses around condition in `if` * Type `line` as `unicode` * Type `k` and `v` as `unicode` * Move `fflush` into `try...except...` block * Raise an `IOError` if `fflush` fails * Clear error before raising * Simplify slicing of `k` * Raise if `fclose` fails --- ucp/_libs/ucx_api.pyx | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index fb5afa8f1..86e321c96 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -11,7 +11,7 @@ from posix.stdio cimport open_memstream from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdint cimport uintptr_t -from libc.stdio cimport FILE, fclose, fflush +from libc.stdio cimport FILE, clearerr, fclose, fflush from libc.stdlib cimport free from libc.string cimport memset @@ -95,22 +95,27 @@ cdef dict ucx_config_to_dict(ucp_config_t *config): """Returns a dict of a UCX config""" cdef char *text cdef size_t text_len - cdef unicode py_text + cdef unicode py_text, line, k, v cdef FILE *text_fd = open_memstream(&text, &text_len) - if(text_fd == NULL): + if text_fd == NULL: raise IOError("open_memstream() returned NULL") cdef dict ret = {} ucp_config_print(config, text_fd, NULL, UCS_CONFIG_PRINT_CONFIG) - fflush(text_fd) try: + if fflush(text_fd) != 0: + clearerr(text_fd) + raise IOError("fflush() failed on memory stream") py_text = text.decode() for line in py_text.splitlines(): k, v = line.split("=") - k = k[len("UCX_"):] + k = k[4:] # Strip "UCX_" prefix ret[k] = v finally: - fclose(text_fd) - free(text) + if fclose(text_fd) != 0: + free(text) + raise IOError("fclose() failed to close memory stream") + else: + free(text) return ret From ccef02cfb15e01857e9dad057e71febea80de149 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 21 Sep 2020 16:56:12 -0700 Subject: [PATCH 78/97] Cleanup `UCXEndpoint.info` (#610) * Drop parentheses around condition in `if` * Move `fflush` into `try...except...` block * Raise an `IOError` if `fflush` fails * Clear error before raising * Raise if `fclose` fails --- ucp/_libs/ucx_api.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 86e321c96..5b0a412ae 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -462,15 +462,20 @@ cdef class UCXEndpoint(UCXObject): cdef size_t text_len cdef unicode py_text cdef FILE *text_fd = open_memstream(&text, &text_len) - if(text_fd == NULL): + if text_fd == NULL: raise IOError("open_memstream() returned NULL") ucp_ep_print_info(self._handle, text_fd) - fflush(text_fd) try: + if fflush(text_fd) != 0: + clearerr(text_fd) + raise IOError("fflush() failed on memory stream") py_text = text.decode() finally: - fclose(text_fd) - free(text) + if fclose(text_fd) != 0: + free(text) + raise IOError("fclose() failed to close memory stream") + else: + free(text) return py_text @property From 88bc6f1a72f8da1819573e36d12a7979b232839c Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 21 Sep 2020 17:30:56 -0700 Subject: [PATCH 79/97] Specify `cdef` return types (#608) * Specify `_handle_status` return as `UCXRequest` * Specify `void` return type for `assert_ucs_status` * Type `name` & `inflight_msgs` of `_handle_status` --- ucp/_libs/ucx_api.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 5b0a412ae..7c8c2895b 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -50,7 +50,7 @@ cdef int _ucx_py_request_counter = 0 logger = logging.getLogger("ucx") -cdef assert_ucs_status(ucs_status_t status, str msg_context=None): +cdef void assert_ucs_status(ucs_status_t status, str msg_context=None) except *: cdef str msg, ucs_status if status != UCS_OK: ucs_status = ucs_status_string(status).decode("utf-8") @@ -628,14 +628,14 @@ cdef class UCXRequest: ) -cdef _handle_status( +cdef UCXRequest _handle_status( ucs_status_ptr_t status, int64_t expected_receive, cb_func, cb_args, cb_kwargs, - name, - inflight_msgs + unicode name, + set inflight_msgs ): if UCS_PTR_STATUS(status) == UCS_OK: return From 67b1b7fbcdc7b6eb28a5d570c4c3b035272c6757 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 22 Sep 2020 08:56:57 -0700 Subject: [PATCH 80/97] Release `config` on error (#611) * Release `config` on error As we are responsible for cleaning up the `config` object when we are done with it, make sure that we clean it up ourselves when an exception occurs as no one else will have the opportunity to do so then. * Split up variable definitions and assignments * Swap variable definition order * Use `try...finally` to ensure `config` is released This way if an error is raised by `ucx_config_to_dict`, we are able to handle cleanup of `config` before propagating the error further. * Return result of `ucx_config_to_dict` directly * Combine definition and assignment of `config` * Extract `dict` from `config` after `ucp_init` * Use `try...finally...` to cleanup `config` --- ucp/_libs/ucx_api.pyx | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 7c8c2895b..b6ddbfb43 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -77,17 +77,21 @@ cdef ucp_config_t * _read_ucx_config(dict user_options) except *: # Modify the UCX configuration options based on `config_dict` cdef str k, v cdef bytes kb, vb - for k, v in user_options.items(): - kb = k.encode() - vb = v.encode() - status = ucp_config_modify(config, kb, vb) - if status == UCS_ERR_NO_ELEM: - raise UCXConfigError(f"Option {k} doesn't exist") - elif status != UCS_OK: - status_msg = ucs_status_string(status).decode("utf-8") - raise UCXConfigError( - f"Couldn't set option {k} to {v}: {status_msg}" - ) + try: + for k, v in user_options.items(): + kb = k.encode() + vb = v.encode() + status = ucp_config_modify(config, kb, vb) + if status == UCS_ERR_NO_ELEM: + raise UCXConfigError(f"Option {k} doesn't exist") + elif status != UCS_OK: + status_msg = ucs_status_string(status).decode("utf-8") + raise UCXConfigError( + f"Couldn't set option {k} to {v}: {status_msg}" + ) + except Exception: + ucp_config_release(config) + raise return config @@ -125,9 +129,10 @@ def get_current_options(): if UCX were to be initialized now. """ cdef ucp_config_t *config = _read_ucx_config({}) - cdef dict ret = ucx_config_to_dict(config) - ucp_config_release(config) - return ret + try: + return ucx_config_to_dict(config) + finally: + ucp_config_release(config) def get_ucx_version(): @@ -226,17 +231,18 @@ cdef class UCXContext(UCXObject): ) cdef ucp_config_t *config = _read_ucx_config(config_dict) - status = ucp_init(&ucp_params, config, &self._handle) - assert_ucs_status(status) + try: + status = ucp_init(&ucp_params, config, &self._handle) + assert_ucs_status(status) + self._config = ucx_config_to_dict(config) + finally: + ucp_config_release(config) self.add_handle_finalizer( _ucx_context_handle_finalizer, int(self._handle) ) - self._config = ucx_config_to_dict(config) - ucp_config_release(config) - logger.info("UCP initiated using config: ") cdef str k, v for k, v in self._config.items(): From ae8ae0cb87d325ec2354225c56d634f450b79a1f Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 22 Sep 2020 19:26:31 -0700 Subject: [PATCH 81/97] Type `inflight_msgs` as `set` in a few more places (#613) --- ucp/_libs/ucx_api.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index b6ddbfb43..121145521 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -282,7 +282,7 @@ cdef ucp_err_handler_cb_t _get_error_callback(str tls, def _ucx_worker_handle_finalizer( - uintptr_t handle_as_int, UCXContext ctx, inflight_msgs + uintptr_t handle_as_int, UCXContext ctx, set inflight_msgs ): assert ctx.initialized cdef ucp_worker_h handle = handle_as_int @@ -414,7 +414,7 @@ cdef class UCXWorker(UCXObject): return UCXEndpoint(self, ucp_ep) -def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, inflight_msgs): +def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs): assert worker.initialized cdef ucp_ep_h handle = handle_as_int cdef ucs_status_ptr_t status @@ -876,7 +876,9 @@ def tag_recv_nb( -1, _tag_recv_cb ) - inflight_msgs = worker._inflight_msgs if ep is None else ep._inflight_msgs + cdef set inflight_msgs = ( + worker._inflight_msgs if ep is None else ep._inflight_msgs + ) return _handle_status( status, nbytes, cb_func, cb_args, cb_kwargs, name, inflight_msgs ) From 92dbb60d3da60feef0b902a6cbae09a5853c3298 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 23 Sep 2020 08:11:10 -0700 Subject: [PATCH 82/97] Type `send`, `recv` arguments and `port` (#612) * Type `port` as `uint16_t` As C libraries we interface with use `uint16_t` to type port variables, make sure we do the same. * Type `*args` and `**kwargs` as `tuple` and `dict` * Type `name` as `str` * Use `None` for default arguments As some objects like `dict()` are mutable, these are generally not used as default arguments and are set within functions. To fix this, we set these to `None` instead and then replace those values internally if they are `None`. * Use `()` and `{}` for empty `tuple` and `dict` Cython is able to pick up on this particular syntax a bit better and as result optimize it a bit better. Should avoid some overhead compared to `tuple()` and `dict()`, which may turn into Python function calls instead. * Type `cb_args` & `cb_kwargs` as `tuple` & `dict` --- ucp/_libs/ucx_api.pyx | 88 +++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 121145521..05019bbd0 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -10,7 +10,7 @@ import weakref from posix.stdio cimport open_memstream from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject -from libc.stdint cimport uintptr_t +from libc.stdint cimport uint16_t, uintptr_t from libc.stdio cimport FILE, clearerr, fclose, fflush from libc.stdlib cimport free from libc.string cimport memset @@ -376,7 +376,7 @@ cdef class UCXWorker(UCXObject): # which will handle the request cleanup. ucp_request_cancel(self._handle, req._handle) - def ep_create(self, str ip_address, port, bint endpoint_error_handling): + def ep_create(self, str ip_address, uint16_t port, bint endpoint_error_handling): assert self.initialized cdef ucp_ep_params_t params ip_address = socket.gethostbyname(ip_address) @@ -513,16 +513,20 @@ cdef class UCXListener(UCXObject): dict cb_data cdef public: - int port + uint16_t port def __init__( self, UCXWorker worker, - port, + uint16_t port, cb_func, - cb_args=tuple(), - cb_kwargs=dict() + tuple cb_args=None, + dict cb_kwargs=None ): + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} cdef ucp_listener_params_t params cdef ucp_listener_conn_callback_t _listener_cb = ( _listener_callback @@ -678,6 +682,8 @@ cdef UCXRequest _handle_status( cdef void _send_callback(void *request, ucs_status_t status): cdef UCXRequest req cdef str msg + cdef tuple cb_args + cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" @@ -699,9 +705,11 @@ cdef void _send_callback(void *request, ucs_status_t status): cb_func = req.info["cb_func"] if cb_func is not None: cb_args = req.info["cb_args"] - cb_args = cb_args if cb_args else tuple() + if cb_args is None: + cb_args = () cb_kwargs = req.info["cb_kwargs"] - cb_kwargs = cb_kwargs if cb_kwargs else dict() + if cb_kwargs is None: + cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) finally: req.close() @@ -713,9 +721,9 @@ def tag_send_nb( size_t nbytes, ucp_tag_t tag, cb_func, - cb_args=tuple(), - cb_kwargs=dict(), - name="tag_send_nb" + tuple cb_args=None, + dict cb_kwargs=None, + str name=None ): """ This routine sends a message to a destination endpoint @@ -754,6 +762,12 @@ def tag_send_nb( name: str, optional Descriptive name of the operation """ + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + if name is None: + name = "tag_send_nb" cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_tag_send_nb( ep._handle, @@ -773,6 +787,8 @@ cdef void _tag_recv_callback( ): cdef UCXRequest req cdef str msg + cdef tuple cb_args + cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" @@ -799,9 +815,11 @@ cdef void _tag_recv_callback( cb_func = req.info["cb_func"] if cb_func is not None: cb_args = req.info["cb_args"] - cb_args = cb_args if cb_args else tuple() + if cb_args is None: + cb_args = () cb_kwargs = req.info["cb_kwargs"] - cb_kwargs = cb_kwargs if cb_kwargs else dict() + if cb_kwargs is None: + cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) finally: req.close() @@ -814,9 +832,9 @@ def tag_recv_nb( ucp_tag_t tag, cb_func, ucp_tag_t tag_mask=-1, - cb_args=tuple(), - cb_kwargs=dict(), - name="tag_recv_nb", + tuple cb_args=None, + dict cb_kwargs=None, + str name=None, UCXEndpoint ep=None ): """ This routine receives a message on a worker @@ -862,6 +880,12 @@ def tag_recv_nb( guarantee that the message is cancelled when `ep` closes as opposed to when the `worker` closes. """ + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + if name is None: + name = "tag_recv_nb" if buffer.readonly: raise ValueError("writing to readonly buffer!") cdef ucp_tag_recv_callback_t _tag_recv_cb = ( @@ -889,9 +913,9 @@ def stream_send_nb( Array buffer, size_t nbytes, cb_func, - cb_args=tuple(), - cb_kwargs=dict(), - name="stream_send_nb" + tuple cb_args=None, + dict cb_kwargs=None, + str name=None ): """ This routine sends data to a destination endpoint @@ -927,6 +951,12 @@ def stream_send_nb( name: str, optional Descriptive name of the operation """ + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + if name is None: + name = "stream_send_nb" cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_stream_send_nb( ep._handle, @@ -946,6 +976,8 @@ cdef void _stream_recv_callback( ): cdef UCXRequest req cdef str msg + cdef tuple cb_args + cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) req.info["status"] = "finished" @@ -972,9 +1004,11 @@ cdef void _stream_recv_callback( cb_func = req.info["cb_func"] if cb_func is not None: cb_args = req.info["cb_args"] - cb_args = cb_args if cb_args else tuple() + if cb_args is None: + cb_args = () cb_kwargs = req.info["cb_kwargs"] - cb_kwargs = cb_kwargs if cb_kwargs else dict() + if cb_kwargs is None: + cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) finally: req.close() @@ -985,9 +1019,9 @@ def stream_recv_nb( Array buffer, size_t nbytes, cb_func, - cb_args=tuple(), - cb_kwargs=dict(), - name="stream_recv_nb" + tuple cb_args=None, + dict cb_kwargs=None, + str name=None ): """ This routine receives data on the endpoint. @@ -1017,6 +1051,12 @@ def stream_recv_nb( name: str, optional Descriptive name of the operation """ + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + if name is None: + name = "stream_recv_nb" if buffer.readonly: raise ValueError("writing to readonly buffer!") cdef size_t length From 5e649abdfb4d339b8c312db382330e11bdefda0a Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 23 Sep 2020 13:12:23 -0700 Subject: [PATCH 83/97] Type `UCXRequest` `info` as `dict` when used (#614) * Store `req.info` in a local `dict`-typed variable * Hold `inflight_msgs` in a local `set` variable * Store `name` in `str`-typed local variable * Store UCX status string in local variable * Construct `msg` only when needed * Only extract `name` when we use it * Assign `exception` `None` outside of `else` * Define type of `req` as `UCXRequest` * Hold `req.info` in a `dict` typed local variable --- ucp/_libs/ucx_api.pyx | 130 ++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 05019bbd0..298dd468a 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -288,9 +288,14 @@ def _ucx_worker_handle_finalizer( cdef ucp_worker_h handle = handle_as_int # Cancel all inflight messages + cdef UCXRequest req + cdef dict req_info + cdef str name for req in list(inflight_msgs): assert not req.closed() - logger.debug("Future cancelling: %s" % req.info["name"]) + req_info = req.info + name = req_info["name"] + logger.debug("Future cancelling: %s" % name) ucp_request_cancel(handle, req.handle) ucp_worker_destroy(handle) @@ -420,8 +425,13 @@ def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs): cdef ucs_status_ptr_t status # Cancel all inflight messages + cdef UCXRequest req + cdef dict req_info + cdef str name for req in list(inflight_msgs): - logger.debug("Future cancelling: %s" % req.info["name"]) + req_info = req.info + name = req_info["name"] + logger.debug("Future cancelling: %s" % name) # Notice, `request_cancel()` evoke the send/recv callback functions worker.request_cancel(req) @@ -654,10 +664,12 @@ cdef UCXRequest _handle_status( msg += ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") raise UCXError(msg) cdef UCXRequest req = UCXRequest( status) - if req.info["status"] == "finished": + cdef dict req_info + req_info = req.info + if req_info["status"] == "finished": try: # The callback function has already handle the request - received = req.info.get("received", None) + received = req_info.get("received", None) if received is not None and received != expected_receive: msg += "length mismatch: %d (got) != %d (expected)" % ( received, expected_receive @@ -669,45 +681,51 @@ cdef UCXRequest _handle_status( finally: req.close() else: - req.info["cb_func"] = cb_func - req.info["cb_args"] = cb_args - req.info["cb_kwargs"] = cb_kwargs - req.info["expected_receive"] = expected_receive - req.info["name"] = name + req_info["cb_func"] = cb_func + req_info["cb_args"] = cb_args + req_info["cb_kwargs"] = cb_kwargs + req_info["expected_receive"] = expected_receive + req_info["name"] = name inflight_msgs.add(req) - req.info["inflight_msgs"] = inflight_msgs + req_info["inflight_msgs"] = inflight_msgs return req cdef void _send_callback(void *request, ucs_status_t status): cdef UCXRequest req - cdef str msg + cdef dict req_info + cdef str name, ucx_status_msg, msg + cdef set inflight_msgs cdef tuple cb_args cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req.info["status"] = "finished" + req_info = req.info + req_info["status"] = "finished" - if "cb_func" not in req.info: + if "cb_func" not in req_info: # This callback function was called before ucp_tag_send_nb() returned return - msg = "<%s>: " % req.info["name"] + exception = None if status == UCS_ERR_CANCELED: + name = req_info["name"] + msg = "<%s>: " % name exception = UCXCanceled(msg) elif status != UCS_OK: - msg += ucs_status_string(status).decode("utf-8") + name = req_info["name"] + ucx_status_msg = ucs_status_string(status).decode("utf-8") + msg = "<%s>: %s" % (name, ucx_status_msg) exception = UCXError(msg) - else: - exception = None try: - req.info["inflight_msgs"].discard(req) - cb_func = req.info["cb_func"] + inflight_msgs = req_info["inflight_msgs"] + inflight_msgs.discard(req) + cb_func = req_info["cb_func"] if cb_func is not None: - cb_args = req.info["cb_args"] + cb_args = req_info["cb_args"] if cb_args is None: cb_args = () - cb_kwargs = req.info["cb_kwargs"] + cb_kwargs = req_info["cb_kwargs"] if cb_kwargs is None: cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) @@ -786,38 +804,45 @@ cdef void _tag_recv_callback( void *request, ucs_status_t status, ucp_tag_recv_info_t *info ): cdef UCXRequest req - cdef str msg + cdef dict req_info + cdef str name, ucx_status_msg, msg + cdef set inflight_msgs cdef tuple cb_args cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req.info["status"] = "finished" + req_info = req.info + req_info["status"] = "finished" - if "cb_func" not in req.info: + if "cb_func" not in req_info: # This callback function was called before ucp_tag_recv_nb() returned return - msg = "<%s>: " % req.info["name"] + exception = None if status == UCS_ERR_CANCELED: + name = req_info["name"] + msg = "<%s>: " % name exception = UCXCanceled(msg) elif status != UCS_OK: - msg += ucs_status_string(status).decode("utf-8") + name = req_info["name"] + ucx_status_msg = ucs_status_string(status).decode("utf-8") + msg = "<%s>: %s" % (name, ucx_status_msg) exception = UCXError(msg) - elif info.length != req.info["expected_receive"]: - msg += "length mismatch: %d (got) != %d (expected)" % ( - info.length, req.info["expected_receive"] + elif info.length != req_info["expected_receive"]: + name = req_info["name"] + msg = "<%s>: length mismatch: %d (got) != %d (expected)" % ( + name, info.length, req_info["expected_receive"] ) exception = UCXMsgTruncated(msg) - else: - exception = None try: - req.info["inflight_msgs"].discard(req) - cb_func = req.info["cb_func"] + inflight_msgs = req_info["inflight_msgs"] + inflight_msgs.discard(req) + cb_func = req_info["cb_func"] if cb_func is not None: - cb_args = req.info["cb_args"] + cb_args = req_info["cb_args"] if cb_args is None: cb_args = () - cb_kwargs = req.info["cb_kwargs"] + cb_kwargs = req_info["cb_kwargs"] if cb_kwargs is None: cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) @@ -975,38 +1000,45 @@ cdef void _stream_recv_callback( void *request, ucs_status_t status, size_t length ): cdef UCXRequest req - cdef str msg + cdef dict req_info + cdef str name, ucx_status_msg, msg + cdef set inflight_msgs cdef tuple cb_args cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req.info["status"] = "finished" + req_info = req.info + req_info["status"] = "finished" - if "cb_func" not in req.info: + if "cb_func" not in req_info: # This callback function was called before ucp_tag_recv_nb() returned return - msg = "<%s>: " % req.info["name"] + exception = None if status == UCS_ERR_CANCELED: + name = req_info["name"] + msg = "<%s>: " % name exception = UCXCanceled(msg) elif status != UCS_OK: - msg += ucs_status_string(status).decode("utf-8") + name = req_info["name"] + ucx_status_msg = ucs_status_string(status).decode("utf-8") + msg = "<%s>: %s" % (name, ucx_status_msg) exception = UCXError(msg) - elif length != req.info["expected_receive"]: - msg += "length mismatch: %d (got) != %d (expected)" % ( - length, req.info["expected_receive"] + elif length != req_info["expected_receive"]: + name = req_info["name"] + msg = "<%s>: length mismatch: %d (got) != %d (expected)" % ( + name, length, req_info["expected_receive"] ) exception = UCXMsgTruncated(msg) - else: - exception = None try: - req.info["inflight_msgs"].discard(req) - cb_func = req.info["cb_func"] + inflight_msgs = req_info["inflight_msgs"] + inflight_msgs.discard(req) + cb_func = req_info["cb_func"] if cb_func is not None: - cb_args = req.info["cb_args"] + cb_args = req_info["cb_args"] if cb_args is None: cb_args = () - cb_kwargs = req.info["cb_kwargs"] + cb_kwargs = req_info["cb_kwargs"] if cb_kwargs is None: cb_kwargs = {} cb_func(req, exception, *cb_args, **cb_kwargs) From 4af17ad0d69cc77583adc45c8ad921ed90caabd2 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 29 Sep 2020 00:08:15 -0700 Subject: [PATCH 84/97] Fix minor typo in `local-send-recv` (#620) --- benchmarks/local-send-recv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/local-send-recv.py b/benchmarks/local-send-recv.py index 8d42342b5..d81f94d50 100644 --- a/benchmarks/local-send-recv.py +++ b/benchmarks/local-send-recv.py @@ -198,7 +198,7 @@ def parse_args(): metavar="N", default=10, type=int, - help="Numer of send / recv iterations (default 10).", + help="Number of send / recv iterations (default 10).", ) parser.add_argument( "-b", From f21c7c4b68621baae6a25fcb38d7763ac31eaa0b Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 29 Sep 2020 00:20:55 -0700 Subject: [PATCH 85/97] Type request count as `unsigned` (#618) As this is a request counter, we won't use the negative values anyways. So go ahead and type this as `unsigned`. This gives us a bit larger range to work with as well. --- ucp/_libs/ucx_api.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 298dd468a..dd780b140 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -44,7 +44,7 @@ cdef void ucx_py_request_reset(void* request): req.info = NULL # Counter used as UCXRequest UIDs -cdef int _ucx_py_request_counter = 0 +cdef unsigned int _ucx_py_request_counter = 0 logger = logging.getLogger("ucx") @@ -584,7 +584,7 @@ cdef class UCXRequest: """ cdef: ucx_py_request *_handle - int _uid + unsigned int _uid def __init__(self, uintptr_t req_as_int): global _ucx_py_request_counter From c3bcf75f174c12a41f322e86d526130c3cec2c7e Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 29 Sep 2020 01:42:32 -0700 Subject: [PATCH 86/97] Type `cuda_device_index` as `unsigned int` (#617) --- ucp/_libs/topological_distance.pxd | 3 ++- ucp/_libs/topological_distance.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ucp/_libs/topological_distance.pxd b/ucp/_libs/topological_distance.pxd index 75edcd76b..326a240fd 100644 --- a/ucp/_libs/topological_distance.pxd +++ b/ucp/_libs/topological_distance.pxd @@ -15,5 +15,6 @@ cdef class TopologicalDistance: int device, str device_type=*) - cpdef get_cuda_distances_from_device_index(self, cuda_device_index, + cpdef get_cuda_distances_from_device_index(self, + unsigned int cuda_device_index, str device_type=*) diff --git a/ucp/_libs/topological_distance.pyx b/ucp/_libs/topological_distance.pyx index bbae370d7..a973b522d 100644 --- a/ucp/_libs/topological_distance.pyx +++ b/ucp/_libs/topological_distance.pyx @@ -103,7 +103,8 @@ cdef class TopologicalDistance: return ret - cpdef get_cuda_distances_from_device_index(self, cuda_device_index, + cpdef get_cuda_distances_from_device_index(self, + unsigned int cuda_device_index, str device_type="openfabrics"): """ Find network or openfabrics devices closest to CUDA device of given index. From 1de1344842d097518df8b247229991fbab1ba018 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 29 Sep 2020 08:41:07 -0700 Subject: [PATCH 87/97] Update Array checks (#605) * Perform C-contiguous `Array` checks in Cython * Drop C-contiguous checks from Python * Allow F-contiguous arrays as well * Drop `nbytes` parameter and check * Track `cuda_support` in `UCXContext` * Use `cuda_support` from `UCXContext` * Move CUDA checks into Cython * Drop CUDA checks from Python * Check equality before checking for contained value This is a little faster in the case `"all"` is set. * Use `UCXEndpoint` if provided * Cast to `bint` --- ucp/_libs/ucx_api.pyx | 55 +++++++++++++++++++++++++++++++++++++++++++ ucp/core.py | 53 ++++++++--------------------------------- ucp/endpoint_reuse.py | 8 +++---- 3 files changed, 69 insertions(+), 47 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index dd780b140..8d11262d8 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -208,6 +208,7 @@ cdef class UCXContext(UCXObject): cdef: ucp_context_h _handle dict _config + readonly bint cuda_support def __init__(self, config_dict): cdef ucp_params_t ucp_params @@ -238,6 +239,10 @@ cdef class UCXContext(UCXObject): finally: ucp_config_release(config) + # UCX supports CUDA if "cuda" is part of the TLS or TLS is "all" + cdef str tls = self._config["TLS"] + self.cuda_support = tls == "all" or "cuda" in tls + self.add_handle_finalizer( _ucx_context_handle_finalizer, int(self._handle) @@ -786,6 +791,17 @@ def tag_send_nb( cb_kwargs = {} if name is None: name = "tag_send_nb" + if buffer.cuda and not ep.worker._context.cuda_support: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer._contiguous(): + raise ValueError("Array must be C or F contiguous") cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_tag_send_nb( ep._handle, @@ -913,6 +929,23 @@ def tag_recv_nb( name = "tag_recv_nb" if buffer.readonly: raise ValueError("writing to readonly buffer!") + cdef bint cuda_support + if buffer.cuda: + if ep is None: + cuda_support = worker._context.cuda_support + else: + cuda_support = ep.worker._context.cuda_support + if not cuda_support: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer._contiguous(): + raise ValueError("Array must be C or F contiguous") cdef ucp_tag_recv_callback_t _tag_recv_cb = ( _tag_recv_callback ) @@ -982,6 +1015,17 @@ def stream_send_nb( cb_kwargs = {} if name is None: name = "stream_send_nb" + if buffer.cuda and not ep.worker._context.cuda_support: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer._contiguous(): + raise ValueError("Array must be C or F contiguous") cdef ucp_send_callback_t _send_cb = _send_callback cdef ucs_status_ptr_t status = ucp_stream_send_nb( ep._handle, @@ -1091,6 +1135,17 @@ def stream_recv_nb( name = "stream_recv_nb" if buffer.readonly: raise ValueError("writing to readonly buffer!") + if buffer.cuda and not ep.worker._context.cuda_support: + raise ValueError( + "UCX is not configured with CUDA support, please add " + "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" + "variable and that the ucx-proc=*=gpu package is " + "installed. See " + "https://ucx-py.readthedocs.io/en/latest/install.html for " + "more information." + ) + if not buffer._contiguous(): + raise ValueError("Array must be C or F contiguous") cdef size_t length cdef ucp_stream_recv_callback_t _stream_recv_cb = ( _stream_recv_callback diff --git a/ucp/core.py b/ucp/core.py index 83d295f64..936af0e90 100644 --- a/ucp/core.py +++ b/ucp/core.py @@ -483,9 +483,6 @@ def __init__( self._recv_count = 0 # Number of calls to self.recv() self._finished_recv_count = 0 # Number of returned (finished) self.recv() calls self._shutting_down_peer = False # Told peer to shutdown - # UCX supports CUDA if "cuda" is part of the TLS or TLS is "all" - tls = ctx.get_config()["TLS"] - self._cuda_support = "cuda" in tls or tls == "all" self._close_after_n_recv = None @property @@ -551,7 +548,7 @@ async def close(self): self.abort() @nvtx_annotate("UCXPY_SEND", color="green", domain="ucxpy") - async def send(self, buffer, nbytes=-1, tag=None): + async def send(self, buffer, tag=None): """Send `buffer` to connected peer. Parameters @@ -559,8 +556,6 @@ async def send(self, buffer, nbytes=-1, tag=None): buffer: exposing the buffer protocol or array/cuda interface The buffer to send. Raise ValueError if buffer is smaller than nbytes. - nbytes: int, optional - Number of bytes to send. Default is the whole buffer. tag: hashable, optional Set a tag that the receiver must match. """ @@ -568,25 +563,12 @@ async def send(self, buffer, nbytes=-1, tag=None): raise UCXCloseError("Endpoint closed") if not isinstance(buffer, Array): buffer = Array(buffer) - if not self._cuda_support and buffer.cuda: - raise ValueError( - "UCX is not configured with CUDA support, please add " - "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" - "variable and that the ucx-proc=*=gpu package is " - "installed. See " - "https://ucx-py.readthedocs.io/en/latest/install.html for " - "more information." - ) - if not buffer.c_contiguous: - raise ValueError("Array must be C-contiguous") - buffer_nbytes = buffer.nbytes - if nbytes > 0 and buffer_nbytes < nbytes: - raise ValueError("the nbytes is greater than the size of the buffer!") + nbytes = buffer.nbytes log = "[Send #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._send_count, hex(self.uid), hex(self._msg_tag_send), - buffer_nbytes, + nbytes, type(buffer.obj), ) logger.debug(log) @@ -597,10 +579,10 @@ async def send(self, buffer, nbytes=-1, tag=None): tag = hash64bits(self._msg_tag_send, hash(tag)) if self._guarantee_msg_order: tag += self._send_count - return await comm.tag_send(self._ep, buffer, buffer_nbytes, tag, name=log) + return await comm.tag_send(self._ep, buffer, nbytes, tag, name=log) @nvtx_annotate("UCXPY_RECV", color="red", domain="ucxpy") - async def recv(self, buffer, nbytes=-1, tag=None): + async def recv(self, buffer, tag=None): """Receive from connected peer into `buffer`. Parameters @@ -608,8 +590,6 @@ async def recv(self, buffer, nbytes=-1, tag=None): buffer: exposing the buffer protocol or array/cuda interface The buffer to receive into. Raise ValueError if buffer is smaller than nbytes or read-only. - nbytes: int, optional - Number of bytes to receive. Default is the whole buffer. tag: hashable, optional Set a tag that must match the received message. Notice, currently UCX-Py doesn't support a "any tag" thus `tag=None` only matches a @@ -619,25 +599,12 @@ async def recv(self, buffer, nbytes=-1, tag=None): raise UCXCloseError("Endpoint closed") if not isinstance(buffer, Array): buffer = Array(buffer) - if not self._cuda_support and buffer.cuda: - raise ValueError( - "UCX is not configured with CUDA support, please add " - "`cuda_copy` and/or `cuda_ipc` to the UCX_TLS environment" - "variable and that the ucx-proc=*=gpu package is " - "installed. See " - "https://ucx-py.readthedocs.io/en/latest/install.html for " - "more information." - ) - if not buffer.c_contiguous: - raise ValueError("Array must be C-contiguous") - buffer_nbytes = buffer.nbytes - if nbytes > 0 and buffer_nbytes < nbytes: - raise ValueError("the nbytes is greater than the size of the buffer!") + nbytes = buffer.nbytes log = "[Recv #%03d] ep: %s, tag: %s, nbytes: %d, type: %s" % ( self._recv_count, hex(self.uid), hex(self._msg_tag_recv), - buffer_nbytes, + nbytes, type(buffer.obj), ) logger.debug(log) @@ -648,7 +615,7 @@ async def recv(self, buffer, nbytes=-1, tag=None): tag = hash64bits(self._msg_tag_recv, hash(tag)) if self._guarantee_msg_order: tag += self._recv_count - ret = await comm.tag_recv(self._ep, buffer, buffer_nbytes, tag, name=log) + ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log) self._finished_recv_count += 1 if ( self._close_after_n_recv is not None @@ -659,7 +626,7 @@ async def recv(self, buffer, nbytes=-1, tag=None): def cuda_support(self): """Return whether UCX is configured with CUDA support or not""" - return self._cuda_support + return self._ctx.context.cuda_support def get_ucp_worker(self): """Returns the underlying UCP worker handle (ucp_worker_h) @@ -756,7 +723,7 @@ async def recv_obj(self, tag=None, allocator=bytearray): await self.recv(nbytes, tag=tag) nbytes = nbytes[0] ret = allocator(nbytes) - await self.recv(ret, nbytes=nbytes, tag=tag) + await self.recv(ret, tag=tag) return ret diff --git a/ucp/endpoint_reuse.py b/ucp/endpoint_reuse.py index 81692df06..bdb3d85a6 100644 --- a/ucp/endpoint_reuse.py +++ b/ucp/endpoint_reuse.py @@ -98,11 +98,11 @@ async def _handle(ep_new): return core.create_listener(_handle, port=port) - async def send(self, buffer, nbytes=-1): - await self.handle.ep.send(buffer, nbytes=nbytes, tag=self.tag) + async def send(self, buffer): + await self.handle.ep.send(buffer, tag=self.tag) - async def recv(self, buffer, nbytes=-1): - await self.handle.ep.recv(buffer, nbytes=nbytes, tag=self.tag) + async def recv(self, buffer): + await self.handle.ep.recv(buffer, tag=self.tag) async def close(self): if self.closed(): From 5208b4538a86e2d1e22b596a5050943aeae70bcd Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 1 Oct 2020 14:31:40 -0700 Subject: [PATCH 88/97] Make `ucx_py_request`'s `uid` `unsigned` (#627) As other UID variables are `unsigned`, make sure this one is `unsigned` as well. --- ucp/_libs/ucx_api.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 8d11262d8..376f46a9a 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -31,7 +31,7 @@ from ..utils import nvtx_annotate # Struct used as requests by UCX cdef struct ucx_py_request: bint finished # Used by downstream projects such as cuML - int uid + unsigned int uid PyObject *info From bba97a87f433acd19f36733cd07276fca59b75b4 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 2 Oct 2020 07:21:28 -0700 Subject: [PATCH 89/97] Use C attributes, methods, & minor error message cleanup (#629) * Use `UCXRequest`'s `_handle` directly * Grab `UCXRequest`'s `info` via `_handle` This is able to proceed efficiently directly through C without needing to go through Python. Also behaves equivalent to how the `info` property would behave. Include the `.closed()` check that we would otherwise use in the property for good measure. * Combine definition and assignment on one line * Provide efficient C-level access to `closed()` In addition to exposing `UCXRequest.closed()` to Python, expose an efficient version to C as well. Also type its `return` value as `bint` to provide an efficient C type we can use. As no exceptions can be raised from this function skip marking it as `except *`. * Provide C implementation of `.close()` too Adds a C-level implementation of `.close()` for `UCXRequest` as well. * Fix tense * Construct error message only when needed * Add more methods to C API As we are calling these methods from Cython, make sure we expose a C API for faster calling. This done in addition to the Python API that was already available. --- ucp/_libs/ucx_api.pyx | 47 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 376f46a9a..b630c5a7f 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -169,7 +169,7 @@ cdef class UCXObject: # List of weak references of UCX objects that make use of this object self._children = [] - def close(self): + cpdef void close(self) except *: """Close the object and free the underlying UCX handle. Does nothing if the object is already closed """ @@ -181,7 +181,7 @@ cdef class UCXObject: """Is the underlying UCX handle initialized""" return self._finalizer and self._finalizer.alive - def add_child(self, child): + cpdef void add_child(self, child) except *: """Add a UCX object to this object's children. The underlying UCX handle will be freed when this obejct is freed. """ @@ -253,7 +253,7 @@ cdef class UCXContext(UCXObject): for k, v in self._config.items(): logger.info(f" {k}: {v}") - def get_config(self): + cpdef dict get_config(self): return self._config @property @@ -298,10 +298,10 @@ def _ucx_worker_handle_finalizer( cdef str name for req in list(inflight_msgs): assert not req.closed() - req_info = req.info + req_info = req._handle.info name = req_info["name"] logger.debug("Future cancelling: %s" % name) - ucp_request_cancel(handle, req.handle) + ucp_request_cancel(handle, req._handle) ucp_worker_destroy(handle) @@ -358,7 +358,7 @@ cdef class UCXWorker(UCXObject): raise IOError("epoll_ctl() returned %d" % err) return epoll_fd - def arm(self): + cpdef bint arm(self) except *: assert self.initialized cdef ucs_status_t status status = ucp_worker_arm(self._handle) @@ -378,7 +378,7 @@ cdef class UCXWorker(UCXObject): assert self.initialized return int(self._handle) - def request_cancel(self, UCXRequest req): + cpdef void request_cancel(self, UCXRequest req) except *: assert self.initialized assert not req.closed() @@ -434,7 +434,8 @@ def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs): cdef dict req_info cdef str name for req in list(inflight_msgs): - req_info = req.info + assert not req.closed() + req_info = req._handle.info name = req_info["name"] logger.debug("Future cancelling: %s" % name) # Notice, `request_cancel()` evoke the send/recv callback functions @@ -608,10 +609,10 @@ cdef class UCXRequest: else: self._uid = self._handle.uid - def closed(self): + cpdef bint closed(self): return self._handle == NULL or self._uid != self._handle.uid - def close(self): + cpdef void close(self) except *: """This routine releases the non-blocking request back to UCX, regardless of its current state. Communications operations associated with this request will make progress internally, however no further notifications or @@ -664,20 +665,23 @@ cdef UCXRequest _handle_status( ): if UCS_PTR_STATUS(status) == UCS_OK: return - cdef str msg = "<%s>: " % name + cdef str ucx_status_msg, msg if UCS_PTR_IS_ERR(status): - msg += ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") + ucx_status_msg = ( + ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") + ) + msg = "<%s>: %s" % (name, ucx_status_msg) raise UCXError(msg) cdef UCXRequest req = UCXRequest( status) - cdef dict req_info - req_info = req.info + assert not req.closed() + cdef dict req_info = req._handle.info if req_info["status"] == "finished": try: - # The callback function has already handle the request + # The callback function has already handled the request received = req_info.get("received", None) if received is not None and received != expected_receive: - msg += "length mismatch: %d (got) != %d (expected)" % ( - received, expected_receive + msg = "<%s>: length mismatch: %d (got) != %d (expected)" % ( + name, received, expected_receive ) raise UCXMsgTruncated(msg) else: @@ -705,7 +709,8 @@ cdef void _send_callback(void *request, ucs_status_t status): cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req_info = req.info + assert not req.closed() + req_info = req._handle.info req_info["status"] = "finished" if "cb_func" not in req_info: @@ -827,7 +832,8 @@ cdef void _tag_recv_callback( cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req_info = req.info + assert not req.closed() + req_info = req._handle.info req_info["status"] = "finished" if "cb_func" not in req_info: @@ -1051,7 +1057,8 @@ cdef void _stream_recv_callback( cdef dict cb_kwargs with log_errors(): req = UCXRequest( request) - req_info = req.info + assert not req.closed() + req_info = req._handle.info req_info["status"] = "finished" if "cb_func" not in req_info: From c55a079428ccaf9919038ae802ef719cf3e37dec Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 2 Oct 2020 16:58:26 +0200 Subject: [PATCH 90/97] Remove remaining `nbytes` references in send/recv calls (#631) * Remove remaining nbytes references in send/recv calls * Remove msg_size from quickstart page --- benchmarks/local-send-recv.py | 8 ++++---- docs/source/quickstart.rst | 10 ++++------ tests/test_send_recv.py | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/benchmarks/local-send-recv.py b/benchmarks/local-send-recv.py index d81f94d50..832e81279 100644 --- a/benchmarks/local-send-recv.py +++ b/benchmarks/local-send-recv.py @@ -71,8 +71,8 @@ async def server_handler(ep): assert msg_recv_list[0].nbytes == args.n_bytes for i in range(args.n_iter): - await ep.recv(msg_recv_list[i], args.n_bytes) - await ep.send(msg_recv_list[i], args.n_bytes) + await ep.recv(msg_recv_list[i]) + await ep.send(msg_recv_list[i]) await ep.close() lf.close() @@ -135,8 +135,8 @@ async def run(): times = [] for i in range(args.n_iter): start = clock() - await ep.send(msg_send_list[i], args.n_bytes) - await ep.recv(msg_recv_list[i], args.n_bytes) + await ep.send(msg_send_list[i]) + await ep.recv(msg_recv_list[i]) stop = clock() times.append(stop - start) if args.cuda_profile: diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index d29337414..6e5a0d936 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -79,16 +79,15 @@ Process 2 - Client host = ucp.get_address(ifname='eth0') # ethernet device name ep = await ucp.create_endpoint(host, port) msg = np.zeros(n_bytes, dtype='u1') # create some data to send - msg_size = np.array([msg.nbytes], dtype=np.uint64) # send message print("Send Original NumPy array") - await ep.send(msg, msg_size) # send the real message + await ep.send(msg) # send the real message # recv response print("Receive Incremented NumPy arrays") resp = np.empty_like(msg) - await ep.recv(resp, msg_size) # receive the echo + await ep.recv(resp) # receive the echo await ep.close() np.testing.assert_array_equal(msg + 1, resp) @@ -159,16 +158,15 @@ Process 2 - Client host = ucp.get_address(ifname='eth0') # ethernet device name ep = await ucp.create_endpoint(host, port) msg = cp.zeros(n_bytes, dtype='u1') # create some data to send - msg_size = np.array([msg.nbytes], dtype=np.uint64) # send message print("Send Original CuPy array") - await ep.send(msg, msg_size) # send the real message + await ep.send(msg) # send the real message # recv response print("Receive Incremented CuPy arrays") resp = cp.empty_like(msg) - await ep.recv(resp, msg_size) # receive the echo + await ep.recv(resp) # receive the echo await ep.close() cp.testing.assert_array_equal(msg + 1, resp) diff --git a/tests/test_send_recv.py b/tests/test_send_recv.py index 009b3e050..cee3e16f9 100644 --- a/tests/test_send_recv.py +++ b/tests/test_send_recv.py @@ -40,9 +40,9 @@ async def echo_server(ep): Basic echo server for sized messages. We expect the other endpoint to follow the pattern:: # size of the real message (in bytes) - >>> await ep.send(msg_size, np.uint64().nbytes) - >>> await ep.send(msg, msg_size) # send the real message - >>> await ep.recv(responds, msg_size) # receive the echo + >>> await ep.send(msg_size) + >>> await ep.send(msg) # send the real message + >>> await ep.recv(responds) # receive the echo """ msg_size = np.empty(1, dtype=np.uint64) await ep.recv(msg_size) From efc4be8afcfd9585a3639992ff867de2f9cbf8e8 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 2 Oct 2020 10:59:22 -0400 Subject: [PATCH 91/97] doc update for only applying the patches when using cuda (#633) * doc update for only applying the patches when using cuda * Update docs/source/install.rst Co-authored-by: Peter Andreas Entschev Co-authored-by: Peter Andreas Entschev --- docs/source/install.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index 53d3bc9d1..ac889292a 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -74,7 +74,7 @@ UCX cd ucx git checkout v1.8.x # apply UCX IB registration cache patches, improves overall - # IB performance when using a memory pool + # CUDA IB performance when using a memory pool curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/add-page-alignment.patch curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/ib_registration_cache.patch git apply ib_registration_cache.patch && git apply add-page-alignment.patch @@ -87,6 +87,9 @@ UCX ../contrib/configure-devel --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I/$CUDA_HOME/include" make -j install +.. note:: + If you're running on a machine without CUDA then you _must NOT_ apply the patches. + UCX + OFED ~~~~~~~~~~ From 84aae404f07c53ca2d7112db060918a3798ee06e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 2 Oct 2020 17:02:38 +0200 Subject: [PATCH 92/97] Clean up of custom send/recv test, which fixes #621 (#632) --- tests/test_custom_send_recv.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_custom_send_recv.py b/tests/test_custom_send_recv.py index e27666eb6..ee2ac81d1 100644 --- a/tests/test_custom_send_recv.py +++ b/tests/test_custom_send_recv.py @@ -4,8 +4,7 @@ import numpy as np import pytest -from distributed.comm.utils import to_frames # noqa -from distributed.utils import nbytes # noqa +from distributed.utils import nbytes import ucp @@ -43,8 +42,6 @@ async def test_send_recv_cudf(event_loop, g): class UCX: def __init__(self, ep): self.ep = ep - loop = asyncio.get_event_loop() - self.queue = asyncio.Queue(loop=loop) async def write(self, cdf): header, _frames = cdf.serialize() From 1b4b580d53a613a4f80bfa2be9c477ec6187f319 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 2 Oct 2020 13:02:21 -0700 Subject: [PATCH 93/97] Cast `req_info["expected_receive"]` as `size_t` (#635) Otherwise Cython tries to convert the `size_t` variable to a Python object, which is a bit laborious. This bypasses that. --- ucp/_libs/ucx_api.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index b630c5a7f..587e70522 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -850,7 +850,7 @@ cdef void _tag_recv_callback( ucx_status_msg = ucs_status_string(status).decode("utf-8") msg = "<%s>: %s" % (name, ucx_status_msg) exception = UCXError(msg) - elif info.length != req_info["expected_receive"]: + elif info.length != req_info["expected_receive"]: name = req_info["name"] msg = "<%s>: length mismatch: %d (got) != %d (expected)" % ( name, info.length, req_info["expected_receive"] From e9db628a56c051f70db6621bac24979197453461 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 5 Oct 2020 16:22:21 +0200 Subject: [PATCH 94/97] Try to import NVTX from its own module (#639) --- ucp/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ucp/utils.py b/ucp/utils.py index 36ccc5206..f5fefe286 100644 --- a/ucp/utils.py +++ b/ucp/utils.py @@ -228,10 +228,9 @@ def run_on_local_network( try: - from cudf._lib.nvtx import annotate as nvtx_annotate + from nvtx import annotate as nvtx_annotate except ImportError: - # NVTX annotations functionality currently exists in cuDF, if cuDF isn't - # installed, `annotate` yields only. + # If nvtx module is not installed, `annotate` yields only. from contextlib import contextmanager @contextmanager From c3dd8f95017bd2c407cd9a4e29e0ee23974b47ef Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 5 Oct 2020 10:34:41 -0700 Subject: [PATCH 95/97] Fix compile warnings (#641) * Cast `handle` to `void*` first This should make sure Cython recognizes this is a pointer and thus keep the casting as specified when generating C code. As a result the C code should still have the cast and thus avoid the warning regarding pass an integer as a pointer. * Wrap `_get_error_callback` arguments on next line * Note `_get_error_callback` can raise exceptions * Fix typing of `ucp_err_handler_cb_t` As this is a function pointer and not a `struct`, correct its typing based on its upstream definition. https://github.com/openucx/ucx/blob/8b6641b02ef5d217323f719d54dfb2e9142dad59/src/ucp/api/ucp_def.h#L335-L348 * Fix typing of `ucp_listener_h` As `ucp_listener_h` is actually a pointer and not a `struct`, correct the typing of `ucp_listener_h` to reflect this. Also as the underlying type is `ucp_listener`, make sure this is also added to our definitions. This ensures that this part of the casting in `_ucx_listener_handle_finalizer` is not dropped by Cython. https://github.com/openucx/ucx/blob/8b6641b02ef5d217323f719d54dfb2e9142dad59/src/ucp/api/ucp_def.h#L168-L175 * Space out `enum` declaration * Drop unneeded `void*` cast Now that `ucp_listener_h` is typed correctly, we no longer need to cast through a `void*` first. So go ahead and drop that. --- ucp/_libs/ucx_api.pyx | 5 +++-- ucp/_libs/ucx_api_dep.pxd | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 587e70522..89e3c2f8d 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -272,8 +272,9 @@ cdef void _ib_err_cb(void *arg, ucp_ep_h ep, ucs_status_t status): logger.error(msg) -cdef ucp_err_handler_cb_t _get_error_callback(str tls, - bint endpoint_error_handling): +cdef ucp_err_handler_cb_t _get_error_callback( + str tls, bint endpoint_error_handling +) except *: cdef ucp_err_handler_cb_t err_cb = NULL cdef str t cdef list transports diff --git a/ucp/_libs/ucx_api_dep.pxd b/ucp/_libs/ucx_api_dep.pxd index e40b7b133..73a4dea19 100644 --- a/ucp/_libs/ucx_api_dep.pxd +++ b/ucp/_libs/ucx_api_dep.pxd @@ -28,8 +28,7 @@ cdef extern from "src/c_util.h": ctypedef ucp_conn_request* ucp_conn_request_h - ctypedef struct ucp_err_handler_cb_t: - pass + ctypedef void(*ucp_err_handler_cb_t)(void *arg, ucp_ep_h ep, ucs_status_t status) ctypedef void(*ucp_listener_conn_callback_t)(ucp_conn_request_h request, void *arg) @@ -127,9 +126,11 @@ cdef extern from "ucp/api/ucp.h": ucp_worker_h *worker_p) void ucp_worker_destroy(ucp_worker_h worker) - ctypedef struct ucp_listener_h: + ctypedef struct ucp_listener: pass + ctypedef ucp_listener* ucp_listener_h + ucs_status_t ucp_listener_create(ucp_worker_h worker, const ucp_listener_params_t *params, ucp_listener_h *listener_p) @@ -207,6 +208,7 @@ cdef extern from "ucp/api/ucp.h": ctypedef enum ucs_config_print_flags_t: pass + ucs_config_print_flags_t UCS_CONFIG_PRINT_CONFIG void ucp_config_print(const ucp_config_t *config, FILE *stream, From 116f978a70b54234271e359b04e6028d6bc6c5c1 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 6 Oct 2020 16:57:27 -0400 Subject: [PATCH 96/97] [REVIEW] UCX/UCX-Py Debugging/Performance checks (#624) * starting paging on debugging ucx * Update docs/source/ucx-debug.rst Co-authored-by: Peter Andreas Entschev * more doc updates * update text based on suggestions from peter * Update docs/source/ucx-debug.rst Co-authored-by: Peter Andreas Entschev Co-authored-by: Peter Andreas Entschev --- docs/source/index.rst | 2 + docs/source/ucx-debug.rst | 120 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 docs/source/ucx-debug.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 36e6fec25..69421e39b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -18,6 +18,8 @@ UCX-Py is the Python interface for `UCX `_, a lo configuration dask deployment + ucx-debug + .. toctree:: :maxdepth: 1 diff --git a/docs/source/ucx-debug.rst b/docs/source/ucx-debug.rst new file mode 100644 index 000000000..7d9b6b89b --- /dev/null +++ b/docs/source/ucx-debug.rst @@ -0,0 +1,120 @@ +UCX Debugging +============= + +InfiniBand +---------- + +System Configuration +~~~~~~~~~~~~~~~~~~~~ + + +``ibdev2netdev`` -- check to ensure at least one IB controller is configured for IPoIB + +:: + + user@mlnx:~$ ibdev2netdev + mlx5_0 port 1 ==> ib0 (Up) + mlx5_1 port 1 ==> ib1 (Up) + mlx5_2 port 1 ==> ib2 (Up) + mlx5_3 port 1 ==> ib3 (Up) + +``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands to display what UCX understands about the underlying hardware + + +InfiniBand Performance +~~~~~~~~~~~~~~~~~~~~~~ + +``ucx_perftest`` should confirm InfiniBand bandwidth to be in the 10+ GB/s range + +:: + + CUDA_VISIBLE_DEVICES=0 UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=rc,cuda_copy ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 & \ + CUDA_VISIBLE_DEVICES=1 UCX_NET_DEVICES=mlx5_1:1 UCX_TLS=rc,cuda_copy ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999 + + +--------------+-----------------------------+---------------------+-----------------------+ + | | latency (usec) | bandwidth (MB/s) | message rate (msg/s) | + +--------------+---------+---------+---------+----------+----------+-----------+-----------+ + | # iterations | typical | average | overall | average | overall | average | overall | + +--------------+---------+---------+---------+----------+----------+-----------+-----------+ + +------------------------------------------------------------------------------------------+ + | API: protocol layer | + | Test: tag match bandwidth | + | Data layout: (automatic) | + | Send memory: cuda | + | Recv memory: cuda | + | Message size: 100000000 | + +------------------------------------------------------------------------------------------+ + 10 0.000 9104.800 9104.800 10474.41 10474.41 110 110 + + +``-c`` option is NUMA dependent and sets the CPU Affinity of process for a particular GPU. CPU Affinity information can be found in ``nvidia-smi topo -m`` +:: + + user@mlnx:~$ nvidia-smi topo -m + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 mlx5_2 mlx5_3 CPU Affinity + GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX PHB SYS SYS 0-19,40-59 + GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX PHB SYS SYS 0-19,40-59 + GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB PIX SYS SYS 0-19,40-59 + GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB PIX SYS SYS 0-19,40-59 + GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS SYS PIX PHB 20-39,60-79 + GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS SYS PIX PHB 20-39,60-79 + GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS SYS PHB PIX 20-39,60-79 + GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS SYS PHB PIX 20-39,60-79 + mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X PHB SYS SYS + mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB X SYS SYS + mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS SYS X PHB + mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS SYS PHB X + + Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks + +NVLink +------ + +System Configuration +~~~~~~~~~~~~~~~~~~~~ + + +The NVLink connectivity on the system above (DGX-1) is not homogenous, +some GPUs are connected by a single NVLink connection (NV1, e.g., GPUs 0 and +1), others with two NVLink connections (NV2, e.g., GPUs 1 and 2), and some not +connected at all via NVLink (SYS, e.g., GPUs 3 and 4)." + +NVLink Performance +~~~~~~~~~~~~~~~~~~ + +``ucx_perftest`` should confirm NVLink bandwidth to be in the 20+ GB/s range + +:: + + CUDA_VISIBLE_DEVICES=0 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 -c 0 & \ + CUDA_VISIBLE_DEVICES=1 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999 -c 1 + +--------------+-----------------------------+---------------------+-----------------------+ + | | latency (usec) | bandwidth (MB/s) | message rate (msg/s) | + +--------------+---------+---------+---------+----------+----------+-----------+-----------+ + | # iterations | typical | average | overall | average | overall | average | overall | + +--------------+---------+---------+---------+----------+----------+-----------+-----------+ + +------------------------------------------------------------------------------------------+ + | API: protocol layer | + | Test: tag match bandwidth | + | Data layout: (automatic) | + | Send memory: cuda | + | Recv memory: cuda | + | Message size: 100000000 | + +------------------------------------------------------------------------------------------+ + 10 0.000 4163.694 4163.694 22904.52 22904.52 240 240 + + +Experimental Debugging +---------------------- + +A list of problems we have run into along the way while trying to understand performance issues with UCX/UCX-Py: + +- System-wide settings environment variables. For example, we saw a system with ``UCX_MEM_MMAP_HOOK_MODE`` set to ``none``. Unsetting this env var resolved problems: https://github.com/rapidsai/ucx-py/issues/616 . One can quickly check system wide variables with ``env|grep ^UCX_``. From 3bc60af5f116d284b8ca7990ceef954bc15426f4 Mon Sep 17 00:00:00 2001 From: MattBBaker Date: Tue, 13 Oct 2020 15:03:05 -0400 Subject: [PATCH 97/97] Add fence/flush/worker address APIs (#594) * Flush and fence APIs Worker address APIs * White space fix ups * White space fix ups * Add tests * Improve tests. * Black formatting * Black formatting * Update ucp/_libs/ucx_api.pyx make a cpdef and add except * Co-authored-by: jakirkham * Rework to use native python buffer protocol * Style fixups * isort fix * Switch size_t to Py_ssize_t * Apply suggestions from code review Add suggestions for checking initialization and buffer changes Co-authored-by: Mads R. B. Kristensen Co-authored-by: jakirkham * Update UCXAddress destruction and expand test * Update ucp/_libs/ucx_api.pyx Include suggested flush update Co-authored-by: jakirkham * Flush update for the ep_flush as well, plus tyding up the test. * Misordered import * Update ucp/_libs/ucx_api.pyx Use worker handle directly instead of casting Co-authored-by: jakirkham Co-authored-by: jakirkham Co-authored-by: Benjamin Zaitlen Co-authored-by: Mads R. B. Kristensen --- tests/test_rma.py | 91 ++++++++++++++++++++++++++++++++++ ucp/_libs/ucx_api.pyx | 100 ++++++++++++++++++++++++++++++++++++++ ucp/_libs/ucx_api_dep.pxd | 17 ++++++- ucp/comm.py | 9 ++++ ucp/core.py | 38 +++++++++++++++ 5 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 tests/test_rma.py diff --git a/tests/test_rma.py b/tests/test_rma.py new file mode 100644 index 000000000..26948db58 --- /dev/null +++ b/tests/test_rma.py @@ -0,0 +1,91 @@ +import asyncio + +import numpy as np +import pytest + +import ucp + + +def handle_exception(loop, context): + msg = context.get("exception", context["message"]) + print(msg) + + +# Let's make sure that UCX gets time to cancel +# progress tasks before closing the event loop. +@pytest.yield_fixture() +def event_loop(scope="function"): + loop = asyncio.new_event_loop() + loop.set_exception_handler(handle_exception) + ucp.reset() + yield loop + ucp.reset() + loop.run_until_complete(asyncio.sleep(0)) + loop.close() + + +def make_echo_server(create_empty_data): + """ + Returns an echo server that calls the function `create_empty_data(nbytes)` + to create the data container.` + """ + + async def echo_server(ep): + """ + Basic echo server for sized messages. + We expect the other endpoint to follow the pattern:: + # size of the real message (in bytes) + >>> await ep.send(msg_size, np.uint64().nbytes) + >>> await ep.send(msg, msg_size) # send the real message + >>> await ep.recv(responds, msg_size) # receive the echo + """ + msg_size = np.empty(1, dtype=np.uint64) + await ep.recv(msg_size) + msg = create_empty_data(msg_size[0]) + await ep.recv(msg) + await ep.send(msg) + + return echo_server + + +@pytest.mark.parametrize("blocking_progress_mode", [True, False]) +def test_fence(blocking_progress_mode): + ucp.init(blocking_progress_mode=blocking_progress_mode) + # this should always succeed + ucp.fence() + ucp.reset() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("blocking_progress_mode", [True, False]) +async def test_flush(blocking_progress_mode): + ucp.init(blocking_progress_mode=blocking_progress_mode) + + await ucp.flush() + ucp.reset() + + +@pytest.mark.parametrize("blocking_progress_mode", [True, False]) +def test_worker_address(blocking_progress_mode): + ucp.init(blocking_progress_mode=blocking_progress_mode) + + addr = ucp.get_worker_address() + assert addr is not None + ucp.reset() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("blocking_progress_mode", [True, False]) +async def test_send_recv_addr(blocking_progress_mode): + ucp.init(blocking_progress_mode=blocking_progress_mode) + + msg = ucp.get_worker_address() + msg_size = np.array([len(bytes(msg))], dtype=np.uint64) + listener = ucp.create_listener(make_echo_server(lambda n: bytearray(n))) + client = await ucp.create_endpoint(ucp.get_address(), listener.port) + + await client.send(msg_size) + await client.send(msg) + resp = bytearray(msg_size[0]) + await client.recv(resp) + assert resp == bytes(msg) diff --git a/ucp/_libs/ucx_api.pyx b/ucp/_libs/ucx_api.pyx index 89e3c2f8d..f722342bb 100644 --- a/ucp/_libs/ucx_api.pyx +++ b/ucp/_libs/ucx_api.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 UT-Battelle, LLC. All rights reserved. # See file LICENSE for terms. # cython: language_level=3 @@ -9,6 +10,7 @@ import weakref from posix.stdio cimport open_memstream +from cpython.buffer cimport PyBUF_FORMAT, PyBUF_ND, PyBUF_WRITABLE from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdint cimport uint16_t, uintptr_t from libc.stdio cimport FILE, clearerr, fclose, fflush @@ -424,6 +426,91 @@ cdef class UCXWorker(UCXObject): assert_ucs_status(status) return UCXEndpoint(self, ucp_ep) + cpdef ucs_status_t fence(self) except *: + cdef ucs_status_t status = ucp_worker_fence(self._handle) + assert_ucs_status(status) + return status + + def flush(self, cb_func, tuple cb_args=None, dict cb_kwargs=None): + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + cdef ucs_status_ptr_t req + cdef ucp_send_callback_t _send_cb = _send_callback + + cdef ucs_status_ptr_t status = ucp_worker_flush_nb(self._handle, 0, _send_cb) + return _handle_status( + status, 0, cb_func, cb_args, cb_kwargs, 'flush', self._inflight_msgs + ) + + def get_address(self): + return UCXAddress(self) + + +def _ucx_address_finalizer(uintptr_t handle_as_int, uintptr_t worker_as_int): + cdef ucp_address_t *address = handle_as_int + cdef ucp_worker_h ucp_worker = worker_as_int + ucp_worker_release_address(ucp_worker, address) + + +cdef class UCXAddress(UCXObject): + """Python representation of ucp_address_t""" + cdef ucp_address_t *_handle + cdef Py_ssize_t _length + cdef worker + + def __init__(self, UCXWorker worker): + cdef ucs_status_t status + cdef ucp_worker_h ucp_worker = worker._handle + cdef size_t length + + status = ucp_worker_get_address(ucp_worker, &self._handle, &length) + assert_ucs_status(status) + self.worker = worker + self._length = length + self.add_handle_finalizer( + _ucx_address_finalizer, + int(self._handle), + int(worker._handle) + ) + worker.add_child(self) + + @property + def address(self): + assert self.initialized + return self._handle + + @property + def length(self): + assert self.initialized + return int(self._length) + + def __getbuffer__(self, Py_buffer *buffer, int flags): + assert self.initialized + if (flags & PyBUF_WRITABLE) == PyBUF_WRITABLE: + raise BufferError("Requested writable view on readonly data") + buffer.buf = self._handle + buffer.obj = self + buffer.len = self._length + buffer.readonly = True + buffer.itemsize = 1 + if (flags & PyBUF_FORMAT) == PyBUF_FORMAT: + buffer.format = b"B" + else: + buffer.format = NULL + buffer.ndim = 1 + if (flags & PyBUF_ND) == PyBUF_ND: + buffer.shape = &self._length + else: + buffer.shape = NULL + buffer.strides = NULL + buffer.suboffsets = NULL + buffer.internal = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + def _ucx_endpoint_finalizer(uintptr_t handle_as_int, worker, set inflight_msgs): assert worker.initialized @@ -506,6 +593,19 @@ cdef class UCXEndpoint(UCXObject): assert self.initialized return int(self._handle) + def flush(self, cb_func, cb_args=None, cb_kwargs=None): + if cb_args is None: + cb_args = () + if cb_kwargs is None: + cb_kwargs = {} + cdef ucs_status_ptr_t req + cdef ucp_send_callback_t _send_cb = _send_callback + + cdef ucs_status_ptr_t status = ucp_ep_flush_nb(self._handle, 0, _send_cb) + return _handle_status( + status, 0, cb_func, cb_args, cb_kwargs, 'flush', self._inflight_msgs + ) + cdef void _listener_callback(ucp_conn_request_h conn_request, void *args): """Callback function used by UCXListener""" diff --git a/ucp/_libs/ucx_api_dep.pxd b/ucp/_libs/ucx_api_dep.pxd index 73a4dea19..a76abae2c 100644 --- a/ucp/_libs/ucx_api_dep.pxd +++ b/ucp/_libs/ucx_api_dep.pxd @@ -1,4 +1,5 @@ # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 UT-Battelle, LLC. All rights reserved. # See file LICENSE for terms. # cython: language_level=3 @@ -65,6 +66,9 @@ cdef extern from "ucp/api/ucp.h": ctypedef struct ucp_config_t: pass + ctypedef struct ucp_address_t: + pass + ctypedef void(* ucp_request_init_callback_t)(void *request) ctypedef struct ucp_params_t: @@ -217,7 +221,18 @@ cdef extern from "ucp/api/ucp.h": ucs_status_t ucp_config_modify(ucp_config_t *config, const char *name, const char *value) - + ucs_status_t ucp_worker_get_address(ucp_worker_h worker, + ucp_address_t **address, + size_t *len) + void ucp_worker_release_address(ucp_worker_h worker, + ucp_address_t *address) + ucs_status_t ucp_worker_fence(ucp_worker_h worker) + ucs_status_ptr_t ucp_worker_flush_nb(ucp_worker_h worker, + unsigned flags, + ucp_send_callback_t cb) + ucs_status_ptr_t ucp_ep_flush_nb(ucp_ep_h ep, + unsigned flags, + ucp_send_callback_t cb) cdef extern from "sys/epoll.h": diff --git a/ucp/comm.py b/ucp/comm.py index 9035c9c5c..eec307ab3 100644 --- a/ucp/comm.py +++ b/ucp/comm.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 UT-Battelle, LLC. All rights reserved. # See file LICENSE for terms. import asyncio @@ -90,3 +91,11 @@ def stream_recv( return _call_ucx_api( event_loop, ucx_api.stream_recv_nb, ep, buffer, nbytes, name=name ) + + +def flush_worker(worker: ucx_api.UCXWorker, event_loop=None) -> asyncio.Future: + return _call_ucx_api(event_loop, worker.flush) + + +def flush_ep(ep: ucx_api.UCXEndpoint, event_loop=None) -> asyncio.Future: + return _call_ucx_api(event_loop, ep.flush) diff --git a/ucp/core.py b/ucp/core.py index 936af0e90..d8e2c0ab6 100644 --- a/ucp/core.py +++ b/ucp/core.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 UT-Battelle, LLC. All rights reserved. # See file LICENSE for terms. import array @@ -429,6 +430,15 @@ def get_config(self): """ return self.context.get_config() + def fence(self): + return self.worker.fence() + + async def flush(self): + return await comm.flush_worker(self.worker) + + def get_worker_address(self): + return self.worker.get_address() + class Listener: """A handle to the listening service started by `create_listener()` @@ -726,6 +736,9 @@ async def recv_obj(self, tag=None, allocator=bytearray): await self.recv(ret, tag=tag) return ret + async def flush(self): + return await comm.flush_ep(self) + # The following functions initialize and use a single ApplicationContext instance @@ -857,6 +870,31 @@ def get_ucp_worker(): return _get_ctx().get_ucp_worker() +def get_worker_address(): + return _get_ctx().get_worker_address() + + +async def flush(): + """Flushes outstanding AMO and RMA operations. This ensures that the + operations issued on this worker have completed both locally and remotely. + This function does not guarantee ordering. + """ + if _ctx is not None: + return await _get_ctx().flush() + else: + # If ctx is not initialized we still want to do the right thing by asyncio + return await asyncio.sleep(0) + + +def fence(): + """Ensures ordering of non-blocking communication operations on the UCP worker. + This function returns nothing, but will raise an error if it cannot make + this guarantee. This function does not ensure any operations have completed. + """ + if _ctx is not None: + _get_ctx().fence() + + # Setting the __doc__ create_listener.__doc__ = ApplicationContext.create_listener.__doc__ create_endpoint.__doc__ = ApplicationContext.create_endpoint.__doc__