
Merge pull request #649 from rapidsai/branch-0.16
[RELEASE] ucx-py v0.16
raydouglass authored Oct 21, 2020
2 parents fc2d6b9 + 3bc60af commit b804a78
Showing 24 changed files with 1,318 additions and 484 deletions.
10 changes: 5 additions & 5 deletions benchmarks/local-send-recv.py
@@ -71,8 +71,8 @@ async def server_handler(ep):

 assert msg_recv_list[0].nbytes == args.n_bytes
 for i in range(args.n_iter):
-    await ep.recv(msg_recv_list[i], args.n_bytes)
-    await ep.send(msg_recv_list[i], args.n_bytes)
+    await ep.recv(msg_recv_list[i])
+    await ep.send(msg_recv_list[i])
 await ep.close()
 lf.close()

@@ -135,8 +135,8 @@ async def run():
 times = []
 for i in range(args.n_iter):
     start = clock()
-    await ep.send(msg_send_list[i], args.n_bytes)
-    await ep.recv(msg_recv_list[i], args.n_bytes)
+    await ep.send(msg_send_list[i])
+    await ep.recv(msg_recv_list[i])
     stop = clock()
     times.append(stop - start)
 if args.cuda_profile:
@@ -198,7 +198,7 @@ def parse_args():
         metavar="N",
         default=10,
         type=int,
-        help="Numer of send / recv iterations (default 10).",
+        help="Number of send / recv iterations (default 10).",
     )
     parser.add_argument(
         "-b",
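
These hunks reflect the v0.16 API change where ``Endpoint.send`` and ``Endpoint.recv`` infer the transfer size from the buffer, so the explicit ``args.n_bytes`` argument is dropped. A minimal loopback sketch of that pattern against the new signatures (the message size, listener setup, and names below are illustrative, not taken from the benchmark):

    import asyncio
    import numpy as np
    import ucp

    N_BYTES = 2 ** 20  # illustrative message size, not the benchmark default
    N_ITER = 10

    async def handler(ep):
        # Mirrors the updated server_handler: recv/send infer the size from the buffer.
        buf = np.empty(N_BYTES, dtype="u1")
        for _ in range(N_ITER):
            await ep.recv(buf)
            await ep.send(buf)
        await ep.close()

    async def main():
        lf = ucp.create_listener(handler)  # listen on a random port
        ep = await ucp.create_endpoint(ucp.get_address(), lf.port)
        msg = np.zeros(N_BYTES, dtype="u1")
        resp = np.empty_like(msg)
        for _ in range(N_ITER):
            await ep.send(msg)   # no explicit n_bytes argument anymore
            await ep.recv(resp)
        await ep.close()
        lf.close()

    asyncio.get_event_loop().run_until_complete(main())
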
9 changes: 0 additions & 9 deletions ci/release/update-version.sh
@@ -39,12 +39,3 @@ else
 fi
 
 echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]"
-
-# Inplace sed replace; workaround for Linux and Mac
-function sed_runner() {
-  sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak
-}
-
-# RTD update
-sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
-sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -18,6 +18,8 @@ UCX-Py is the Python interface for `UCX <https://github.com/openucx/ucx>`_, a lo
    configuration
    dask
    deployment
+   ucx-debug
 
 
 .. toctree::
    :maxdepth: 1
7 changes: 5 additions & 2 deletions docs/source/install.rst
@@ -14,7 +14,7 @@ Conda
 -----
 
 Some preliminary Conda packages can be installed as so. Replace
-``<CUDA version>`` with either ``9.2``, ``10.0``, or ``10.1``. These are
+``<CUDA version>`` with either ``10.1``, ``10.2``, or ``11.0``. These are
 available both on ``rapidsai`` and ``rapidsai-nightly``.
 
 With GPU support:
@@ -74,7 +74,7 @@ UCX
   cd ucx
   git checkout v1.8.x
   # apply UCX IB registration cache patches, improves overall
-  # IB performance when using a memory pool
+  # CUDA IB performance when using a memory pool
   curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/add-page-alignment.patch
   curl -LO https://raw.githubusercontent.com/rapidsai/ucx-split-feedstock/master/recipe/ib_registration_cache.patch
   git apply ib_registration_cache.patch && git apply add-page-alignment.patch
@@ -87,6 +87,9 @@ UCX
   ../contrib/configure-devel --prefix=$CONDA_PREFIX --with-cuda=$CUDA_HOME --enable-mt CPPFLAGS="-I/$CUDA_HOME/include"
   make -j install
 
+.. note::
+    If you're running on a machine without CUDA then you _must NOT_ apply the patches.
+
 UCX + OFED
 ~~~~~~~~~~
 
10 changes: 4 additions & 6 deletions docs/source/quickstart.rst
@@ -79,16 +79,15 @@ Process 2 - Client
 host = ucp.get_address(ifname='eth0') # ethernet device name
 ep = await ucp.create_endpoint(host, port)
 msg = np.zeros(n_bytes, dtype='u1') # create some data to send
-msg_size = np.array([msg.nbytes], dtype=np.uint64)
 # send message
 print("Send Original NumPy array")
-await ep.send(msg, msg_size) # send the real message
+await ep.send(msg) # send the real message
 # recv response
 print("Receive Incremented NumPy arrays")
 resp = np.empty_like(msg)
-await ep.recv(resp, msg_size) # receive the echo
+await ep.recv(resp) # receive the echo
 await ep.close()
 np.testing.assert_array_equal(msg + 1, resp)
@@ -159,16 +158,15 @@ Process 2 - Client
 host = ucp.get_address(ifname='eth0') # ethernet device name
 ep = await ucp.create_endpoint(host, port)
 msg = cp.zeros(n_bytes, dtype='u1') # create some data to send
-msg_size = np.array([msg.nbytes], dtype=np.uint64)
 # send message
 print("Send Original CuPy array")
-await ep.send(msg, msg_size) # send the real message
+await ep.send(msg) # send the real message
 # recv response
 print("Receive Incremented CuPy arrays")
 resp = cp.empty_like(msg)
-await ep.recv(resp, msg_size) # receive the echo
+await ep.recv(resp) # receive the echo
 await ep.close()
 cp.testing.assert_array_equal(msg + 1, resp)
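
For reference, a minimal sketch of the "Process 1 - Server" side that these updated client snippets pair with, also written against the size-free ``recv``/``send`` API (the buffer size, port, and handler name are assumptions rather than quotes from the quickstart):

    import asyncio
    import numpy as np
    import ucp

    n_bytes = 2 ** 30
    port = 13337

    async def echo_server(ep):
        # Receive the client's array, increment it, and echo it back.
        arr = np.empty(n_bytes, dtype="u1")
        await ep.recv(arr)      # size inferred from arr.nbytes
        await ep.send(arr + 1)
        await ep.close()

    async def main():
        lf = ucp.create_listener(echo_server, port)
        while not lf.closed():
            await asyncio.sleep(0.1)

    asyncio.get_event_loop().run_until_complete(main())
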
120 changes: 120 additions & 0 deletions docs/source/ucx-debug.rst
@@ -0,0 +1,120 @@
UCX Debugging
=============

InfiniBand
----------

System Configuration
~~~~~~~~~~~~~~~~~~~~


``ibdev2netdev`` -- check that at least one IB controller is configured for IPoIB

::

user@mlnx:~$ ibdev2netdev
mlx5_0 port 1 ==> ib0 (Up)
mlx5_1 port 1 ==> ib1 (Up)
mlx5_2 port 1 ==> ib2 (Up)
mlx5_3 port 1 ==> ib3 (Up)

``ucx_info -d`` and ``ucx_info -p -u t`` are helpful commands for displaying what UCX understands about the underlying hardware.


InfiniBand Performance
~~~~~~~~~~~~~~~~~~~~~~

``ucx_perftest`` should report InfiniBand bandwidth in the 10+ GB/s range

::

CUDA_VISIBLE_DEVICES=0 UCX_NET_DEVICES=mlx5_0:1 UCX_TLS=rc,cuda_copy ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 & \
CUDA_VISIBLE_DEVICES=1 UCX_NET_DEVICES=mlx5_1:1 UCX_TLS=rc,cuda_copy ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999

+--------------+-----------------------------+---------------------+-----------------------+
| | latency (usec) | bandwidth (MB/s) | message rate (msg/s) |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
| # iterations | typical | average | overall | average | overall | average | overall |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
+------------------------------------------------------------------------------------------+
| API: protocol layer |
| Test: tag match bandwidth |
| Data layout: (automatic) |
| Send memory: cuda |
| Recv memory: cuda |
| Message size: 100000000 |
+------------------------------------------------------------------------------------------+
10 0.000 9104.800 9104.800 10474.41 10474.41 110 110


The ``-c`` option is NUMA-dependent and sets the CPU affinity of the process for a particular GPU. CPU affinity information can be found with ``nvidia-smi topo -m``; a small Python sketch of pinning a process to the listed cores follows the legend below.
::

user@mlnx:~$ nvidia-smi topo -m
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 mlx5_2 mlx5_3 CPU Affinity
GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX PHB SYS SYS 0-19,40-59
GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX PHB SYS SYS 0-19,40-59
GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB PIX SYS SYS 0-19,40-59
GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB PIX SYS SYS 0-19,40-59
GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS SYS PIX PHB 20-39,60-79
GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS SYS PIX PHB 20-39,60-79
GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS SYS PHB PIX 20-39,60-79
GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS SYS PHB PIX 20-39,60-79
mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X PHB SYS SYS
mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB X SYS SYS
mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS SYS X PHB
mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS SYS PHB X

Legend:

X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
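
As mentioned above, ``-c`` pins the benchmark to GPU-local cores. A hypothetical Python equivalent of that pinning, using the "CPU Affinity" column from the topology output (the core ranges below are the DGX-1 values shown above):

::

    import os

    def pin_to_cores(cores):
        # Restrict the current process (pid 0) to the given CPU set,
        # similar in spirit to ucx_perftest's -c option.
        os.sched_setaffinity(0, set(cores))
        print("Pinned to CPUs:", sorted(os.sched_getaffinity(0)))

    # GPU0 on the DGX-1 above is local to CPUs 0-19 and 40-59.
    pin_to_cores(list(range(0, 20)) + list(range(40, 60)))
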

NVLink
------

System Configuration
~~~~~~~~~~~~~~~~~~~~


The NVLink connectivity on the system above (a DGX-1) is not homogeneous:
some GPUs are connected by a single NVLink connection (NV1, e.g., GPUs 0 and
1), others by two NVLink connections (NV2, e.g., GPUs 1 and 2), and some are
not connected at all via NVLink (SYS, e.g., GPUs 3 and 4).

NVLink Performance
~~~~~~~~~~~~~~~~~~

``ucx_perftest`` should report NVLink bandwidth in the 20+ GB/s range

::

CUDA_VISIBLE_DEVICES=0 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest -t tag_bw -m cuda -s 10000000 -n 10 -p 9999 -c 0 & \
CUDA_VISIBLE_DEVICES=1 UCX_TLS=cuda_ipc,cuda_copy,tcp,sockcm UCX_SOCKADDR_TLS_PRIORITY=sockcm ucx_perftest `hostname` -t tag_bw -m cuda -s 100000000 -n 10 -p 9999 -c 1
+--------------+-----------------------------+---------------------+-----------------------+
| | latency (usec) | bandwidth (MB/s) | message rate (msg/s) |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
| # iterations | typical | average | overall | average | overall | average | overall |
+--------------+---------+---------+---------+----------+----------+-----------+-----------+
+------------------------------------------------------------------------------------------+
| API: protocol layer |
| Test: tag match bandwidth |
| Data layout: (automatic) |
| Send memory: cuda |
| Recv memory: cuda |
| Message size: 100000000 |
+------------------------------------------------------------------------------------------+
10 0.000 4163.694 4163.694 22904.52 22904.52 240 240


Experimental Debugging
----------------------

A list of problems we have run into along the way while trying to understand performance issues with UCX/UCX-Py:

- System-wide UCX environment variable settings. For example, we saw a system with ``UCX_MEM_MMAP_HOOK_MODE`` set to ``none``; unsetting this env var resolved the problems: https://github.com/rapidsai/ucx-py/issues/616. One can quickly check system-wide variables with ``env | grep ^UCX_``.
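
The same check can be done from Python before importing ``ucp``; this is just a sketch using the standard library, not part of the UCX-Py API:

::

    import os

    # Print any UCX_* variables inherited from the environment; system-wide
    # settings such as UCX_MEM_MMAP_HOOK_MODE=none can silently change behavior.
    for name, value in sorted(os.environ.items()):
        if name.startswith("UCX_"):
            print(f"{name}={value}")
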
4 changes: 2 additions & 2 deletions setup.py
@@ -39,8 +39,8 @@
         extra_compile_args=extra_compile_args,
     ),
     Extension(
-        "ucp._libs.utils",
-        sources=["ucp/_libs/utils.pyx"],
+        "ucp._libs.arr",
+        sources=["ucp/_libs/arr.pyx"],
         include_dirs=include_dirs,
         library_dirs=library_dirs,
         libraries=libraries,
5 changes: 1 addition & 4 deletions tests/test_custom_send_recv.py
@@ -4,8 +4,7 @@
 import numpy as np
 import pytest
 
-from distributed.comm.utils import to_frames # noqa
-from distributed.utils import nbytes # noqa
+from distributed.utils import nbytes
 
 import ucp

@@ -43,8 +42,6 @@ async def test_send_recv_cudf(event_loop, g):
 class UCX:
     def __init__(self, ep):
         self.ep = ep
-        loop = asyncio.get_event_loop()
-        self.queue = asyncio.Queue(loop=loop)
 
     async def write(self, cdf):
         header, _frames = cdf.serialize()
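
An aside on the pattern removed above: the ``loop=`` argument to ``asyncio.Queue`` was deprecated in Python 3.8 (and later removed in 3.10), so a queue created inside a running coroutine simply binds to the current event loop. A minimal sketch, separate from the test suite:

    import asyncio

    async def main():
        # No loop= argument; the queue picks up the running event loop.
        queue = asyncio.Queue()
        await queue.put("hello")
        print(await queue.get())

    asyncio.run(main())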