
Merge pull request #649 from rapidsai/branch-0.16
[RELEASE] ucx-py v0.16
raydouglass authored Oct 21, 2020
2 parents fc2d6b9 + 3bc60af commit b804a78
Showing 24 changed files with 1,318 additions and 484 deletions.
10 changes: 5 additions & 5 deletions benchmarks/local-send-recv.py
@@ -71,8 +71,8 @@ async def server_handler(ep):

 assert msg_recv_list[0].nbytes == args.n_bytes
 for i in range(args.n_iter):
-    await ep.recv(msg_recv_list[i], args.n_bytes)
-    await ep.send(msg_recv_list[i], args.n_bytes)
+    await ep.recv(msg_recv_list[i])
+    await ep.send(msg_recv_list[i])
 await ep.close()
 lf.close()

@@ -135,8 +135,8 @@ async def run():
 times = []
 for i in range(args.n_iter):
     start = clock()
-    await ep.send(msg_send_list[i], args.n_bytes)
-    await ep.recv(msg_recv_list[i], args.n_bytes)
+    await ep.send(msg_send_list[i])
+    await ep.recv(msg_recv_list[i])
     stop = clock()
     times.append(stop - start)
 if args.cuda_profile:
@@ -198,7 +198,7 @@ def parse_args():
         metavar="N",
         default=10,
         type=int,
-        help="Numer of send / recv iterations (default 10).",
+        help="Number of send / recv iterations (default 10).",
     )
     parser.add_argument(
         "-b",
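
These hunks reflect the v0.16 API change where ``Endpoint.send`` and ``Endpoint.recv`` infer the transfer size from the buffer, so the explicit ``args.n_bytes`` argument is dropped. A minimal loopback sketch of that pattern against the new signatures (the message size, listener setup, and names below are illustrative, not taken from the benchmark):

    import asyncio
    import numpy as np
    import ucp

    N_BYTES = 2 ** 20  # illustrative message size, not the benchmark default
    N_ITER = 10

    async def handler(ep):
        # Mirrors the updated server_handler: recv/send infer the size from the buffer.
        buf = np.empty(N_BYTES, dtype="u1")
        for _ in range(N_ITER):
            await ep.recv(buf)
            await ep.send(buf)
        await ep.close()

    async def main():
        lf = ucp.create_listener(handler)  # listen on a random port
        ep = await ucp.create_endpoint(ucp.get_address(), lf.port)
        msg = np.zeros(N_BYTES, dtype="u1")
        resp = np.empty_like(msg)
        for _ in range(N_ITER):
            await ep.send(msg)   # no explicit n_bytes argument anymore
            await ep.recv(resp)
        await ep.close()
        lf.close()

    asyncio.get_event_loop().run_until_complete(main())
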
9 changes: 0 additions & 9 deletions ci/release/update-version.sh
@@ -39,12 +39,3 @@ else
 fi
 
 echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]"
-
-# Inplace sed replace; workaround for Linux and Mac
-function sed_runner() {
-  sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak
-}
-
-# RTD update
-sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
-sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -18,6 +18,8 @@ UCX-Py is the Python interface for `UCX <https://github.com/openucx/ucx>`_, a lo
    configuration
    dask
    deployment
+   ucx-debug
 
 
 .. toctree::
    :maxdepth: 1
7 changes: 5 additions & 2 deletions docs/source/install.rst
@@ -14,7 +14,7 @@ Conda
 -----
 
 Some preliminary Conda packages can be installed as so. Replace
-``<CUDA version>`` with either ``9.2``, ``10.0``, or ``10.1``. These are
+``<CUDA version>`` with either ``10.1``, ``10.2``, or ``11.0``. These are
 available both on ``rapidsai`` and ``rapidsai-nightly``.
 
 With GPU support:
@@ -74,7 +74,7 @@ UCX
   cd ucx
   git checkout v1.8.x
   # apply UCX IB registration cache patches, improves overall
-  # IB performance when using a memory pool
+  # CUDA IB performance when using a memory pool
   curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/add-page-alignment.patch
   curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/ib_registration_cache.patch
   git apply ib_registration_cache.patch && git apply add-page-alignment.patch
@@ -87,6 +87,9 @@ UCX
   ../contrib/configure-devel --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I/$CUDA_HOME/include"
   make -j install
 
+.. note::
+    If you're running on a machine without CUDA then you _must NOT_ apply the patches.
+
 UCX + OFED
 ~~~~~~~~~~
 
10 changes: 4 additions & 6 deletions docs/source/quickstart.rst
@@ -79,16 +79,15 @@ Process 2 - Client
 host = ucp.get_address(ifname='eth0') # ethernet device name
 ep = await ucp.create_endpoint(host, port)
 msg = np.zeros(n_bytes, dtype='u1') # create some data to send
-msg_size = np.array([msg.nbytes], dtype=np.uint64)
 # send message
 print("Send Original NumPy array")
-await ep.send(msg, msg_size) # send the real message
+await ep.send(msg) # send the real message
 # recv response
 print("Receive Incremented NumPy arrays")
 resp = np.empty_like(msg)
-await ep.recv(resp, msg_size) # receive the echo
+await ep.recv(resp) # receive the echo
 await ep.close()
 np.testing.assert_array_equal(msg + 1, resp)
@@ -159,16 +158,15 @@ Process 2 - Client
 host = ucp.get_address(ifname='eth0') # ethernet device name
 ep = await ucp.create_endpoint(host, port)
 msg = cp.zeros(n_bytes, dtype='u1') # create some data to send
-msg_size = np.array([msg.nbytes], dtype=np.uint64)
 # send message
 print("Send Original CuPy array")
-await ep.send(msg, msg_size) # send the real message
+await ep.send(msg) # send the real message
 # recv response
 print("Receive Incremented CuPy arrays")
 resp = cp.empty_like(msg)
-await ep.recv(resp, msg_size) # receive the echo
+await ep.recv(resp) # receive the echo
 await ep.close()
 cp.testing.assert_array_equal(msg + 1, resp)
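
For reference, a minimal sketch of the "Process 1 - Server" side that these updated client snippets pair with, also written against the size-free ``recv``/``send`` API (the buffer size, port, and handler name are assumptions rather than quotes from the quickstart):

    import asyncio
    import numpy as np
    import ucp

    n_bytes = 2 ** 30
    port = 13337

    async def echo_server(ep):
        # Receive the client's array, increment it, and echo it back.
        arr = np.empty(n_bytes, dtype="u1")
        await ep.recv(arr)      # size inferred from arr.nbytes
        await ep.send(arr + 1)
        await ep.close()

    async def main():
        lf = ucp.create_listener(echo_server, port)
        while not lf.closed():
            await asyncio.sleep(0.1)

    asyncio.get_event_loop().run_until_complete(main())
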
120 changes: 120 additions & 0 deletions docs/source/ucx-debug.rst
@@ -0,0 +1,120 @@
UCX Debugging
=============

InfiniBand
----------

System Configuration
~~~~~~~~~~~~~~~~~~~~


``ibdev2netdev`` -- check that at least one IB controller is configured for IPoIB

::

user@mlnx:~$ ibdev2netdev
mlx5_0 port 1 ==> ib0 (Up)
mlx5_1 port 1 ==> ib1 (Up)
mlx5_2 port 1 ==> ib2 (Up)
mlx5_3 port 1 ==> ib3 (Up)

``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands for displaying what UCX understands about the underlying hardware.


InfiniBand Performance
~~~~~~~~~~~~~~~~~~~~~~

``ucx_perftest`` should report InfiniBand bandwidth in the 10+ GB/s range

::

CUDA_VISIBLE_DEVICES=0 UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=rc,cuda_copy ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 & \
CUDA_VISIBLE_DEVICES=1 UCX_NET_DEVICES=mlx5_1:1 UCX_TLS=rc,cuda_copy ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999

+--------------+-----------------------------+---------------------+-----------------------+
| | latency (usec) | bandwidth (MB/s) | message rate (msg/s) |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
| # iterations | typical | average | overall | average | overall | average | overall |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
+------------------------------------------------------------------------------------------+
| API: protocol layer |
| Test: tag match bandwidth |
| Data layout: (automatic) |
| Send memory: cuda |
| Recv memory: cuda |
| Message size: 100000000 |
+------------------------------------------------------------------------------------------+
10 0.000 9104.800 9104.800 10474.41 10474.41 110 110


The ``-c`` option is NUMA-dependent and sets the CPU affinity of the process for a particular GPU. CPU affinity information can be found with ``nvidia-smi topo -m``; a small Python sketch of pinning a process to the listed cores follows the legend below.
::

user@mlnx:~$ nvidia-smi topo -m
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 mlx5_2 mlx5_3 CPU Affinity
GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX PHB SYS SYS 0-19,40-59
GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX PHB SYS SYS 0-19,40-59
GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB PIX SYS SYS 0-19,40-59
GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB PIX SYS SYS 0-19,40-59
GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS SYS PIX PHB 20-39,60-79
GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS SYS PIX PHB 20-39,60-79
GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS SYS PHB PIX 20-39,60-79
GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS SYS PHB PIX 20-39,60-79
mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X PHB SYS SYS
mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB X SYS SYS
mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS SYS X PHB
mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS SYS PHB X

Legend:

X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
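
As mentioned above, ``-c`` pins the benchmark to GPU-local cores. A hypothetical Python equivalent of that pinning, using the "CPU Affinity" column from the topology output (the core ranges below are the DGX-1 values shown above):

::

    import os

    def pin_to_cores(cores):
        # Restrict the current process (pid 0) to the given CPU set,
        # similar in spirit to ucx_perftest's -c option.
        os.sched_setaffinity(0, set(cores))
        print("Pinned to CPUs:", sorted(os.sched_getaffinity(0)))

    # GPU0 on the DGX-1 above is local to CPUs 0-19 and 40-59.
    pin_to_cores(list(range(0, 20)) + list(range(40, 60)))
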

NVLink
------

System Configuration
~~~~~~~~~~~~~~~~~~~~


The NVLink connectivity on the system above (a DGX-1) is not homogeneous:
some GPUs are connected by a single NVLink connection (NV1, e.g., GPUs 0 and
1), others by two NVLink connections (NV2, e.g., GPUs 1 and 2), and some are
not connected at all via NVLink (SYS, e.g., GPUs 3 and 4).

NVLink Performance
~~~~~~~~~~~~~~~~~~

``ucx_perftest`` should report NVLink bandwidth in the 20+ GB/s range

::

CUDA_VISIBLE_DEVICES=0 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 -c 0 & \
CUDA_VISIBLE_DEVICES=1 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999 -c 1
+--------------+-----------------------------+---------------------+-----------------------+
| | latency (usec) | bandwidth (MB/s) | message rate (msg/s) |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
| # iterations | typical | average | overall | average | overall | average | overall |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
+------------------------------------------------------------------------------------------+
| API: protocol layer |
| Test: tag match bandwidth |
| Data layout: (automatic) |
| Send memory: cuda |
| Recv memory: cuda |
| Message size: 100000000 |
+------------------------------------------------------------------------------------------+
10 0.000 4163.694 4163.694 22904.52 22904.52 240 240


Experimental Debugging
----------------------

A list of problems we have run into along the way while trying to understand performance issues with UCX/UCX-Py:

- System-wide UCX environment variable settings. For example, we saw a system with ``UCX_MEM_MMAP_HOOK_MODE`` set to ``none``; unsetting this env var resolved the problems: https://github.com/rapidsai/ucx-py/issues/616. One can quickly check system-wide variables with ``env | grep ^UCX_``.
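
The same check can be done from Python before importing ``ucp``; this is just a sketch using the standard library, not part of the UCX-Py API:

::

    import os

    # Print any UCX_* variables inherited from the environment; system-wide
    # settings such as UCX_MEM_MMAP_HOOK_MODE=none can silently change behavior.
    for name, value in sorted(os.environ.items()):
        if name.startswith("UCX_"):
            print(f"{name}={value}")
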
4 changes: 2 additions & 2 deletions setup.py
@@ -39,8 +39,8 @@
         extra_compile_args=extra_compile_args,
     ),
     Extension(
-        "ucp._libs.utils",
-        sources=["ucp/_libs/utils.pyx"],
+        "ucp._libs.arr",
+        sources=["ucp/_libs/arr.pyx"],
         include_dirs=include_dirs,
         library_dirs=library_dirs,
         libraries=libraries,
5 changes: 1 addition & 4 deletions tests/test_custom_send_recv.py
@@ -4,8 +4,7 @@
 import numpy as np
 import pytest
 
-from distributed.comm.utils import to_frames # noqa
-from distributed.utils import nbytes # noqa
+from distributed.utils import nbytes
 
 import ucp

@@ -43,8 +42,6 @@ async def test_send_recv_cudf(event_loop, g):
 class UCX:
     def __init__(self, ep):
         self.ep = ep
-        loop = asyncio.get_event_loop()
-        self.queue = asyncio.Queue(loop=loop)
 
     async def write(self, cdf):
         header, _frames = cdf.serialize()
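
An aside on the pattern removed above: the ``loop=`` argument to ``asyncio.Queue`` was deprecated in Python 3.8 (and later removed in 3.10), so a queue created inside a running coroutine simply binds to the current event loop. A minimal sketch, separate from the test suite:

    import asyncio

    async def main():
        # No loop= argument; the queue picks up the running event loop.
        queue = asyncio.Queue()
        await queue.put("hello")
        print(await queue.get())

    asyncio.run(main())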