From 909168e501b7eb144d4a361a88938af99c1a4352 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Tue, 28 Nov 2023 18:51:11 -0800 Subject: [PATCH] docs/cuda: reword cuda-aware support of communication APIs This change also reorganizes the paragraphs and removes duplicate contents. Signed-off-by: Wenduo Wang --- docs/tuning-apps/networking/cuda.rst | 685 ++++++++++++--------------- 1 file changed, 309 insertions(+), 376 deletions(-) diff --git a/docs/tuning-apps/networking/cuda.rst b/docs/tuning-apps/networking/cuda.rst index c6f44955e51..2b7110b702b 100644 --- a/docs/tuning-apps/networking/cuda.rst +++ b/docs/tuning-apps/networking/cuda.rst @@ -4,51 +4,96 @@ CUDA .. error:: TODO This section needs to be converted from FAQ Q&A style to regular documentation style. -How do I build Open MPI with CUDA-aware support? ------------------------------------------------- +What is CUDA-aware support? +--------------------------- -CUDA-aware support means that the MPI library can send and receive GPU -buffers directly. CUDA support is being continuously updated so -different levels of support exist in different versions. We recommend -you use the latest version of Open MPI for best support. +CUDA-aware support means that the MPI library can use GPU application +buffers directly where host memory can be used. CUDA support is being +continuously updated so different levels of support exist in different +versions. We recommend you use the latest version of Open MPI for best +support. -Open MPI offers two flavors of CUDA support: +.. attention:: Support for operations on GPU buffers varies between + components/modules. We highly recommend CUDA-aware MPI application + authors continue to read, and understand the capabilities of the + Open MPI build on their platforms. -#. Via `UCX `_. +///////////////////////////////////////////////////////////////////////// - This is the preferred mechanism. Since UCX will be providing the - CUDA support, it is important to ensure that UCX itself is built - with CUDA support. +What kind of CUDA support exists in Open MPI? +--------------------------------------------- - To see if your ucx was built with CUDA support run the following - command: +Open MPI depends on various features of CUDA 4.0, so one needs to have +at least the CUDA 4.0 driver and toolkit. The new features of +interest are the Unified Virtual Addressing (UVA) so that all pointers +within a program have unique addresses. In addition, there is a new +API that allows one to determine if a pointer is a CUDA device pointer +or host memory pointer. This API is used by the library to decide +what needs to be done with each buffer. In addition, CUDA 4.1 also +provides the ability to register host memory with the CUDA driver, +which can improve performance. CUDA 4.1 also added CUDA IPC support +for fast communication between GPUs on the same node. - .. code-block:: sh +.. error:: CUDA 4.0 is SUPER OLD! End users dont care about the + differences between cuda-aware, cuda-ipc, gpu-direct, and gpu-direct-rdma - # Check if ucx was built with CUDA support - shell$ ucx_info -v +Note that derived datatypes |mdash| both contiguous and non-contiguous +|mdash| are supported. However, the non-contiguous datatypes +currently have high overhead because of the many calls to the CUDA +function ``cuMemcpy()`` to copy all the pieces of the buffer into the +intermediate buffer. 
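+
+To make the datatype discussion above concrete, the following is a minimal
+sketch (not taken from the Open MPI sources or test suites) that sends one
+column of a matrix held in GPU memory using a non-contiguous
+``MPI_Type_vector`` datatype.  The matrix size, the ``d_buf`` name, and the
+rank pairing are illustrative assumptions; run it with at least two ranks.
+
+.. code-block:: c
+
+   #include "cuda_runtime.h"
+   #include "mpi.h"
+
+   int main(int argc, char *argv[])
+   {
+       int rank;
+       double *d_buf;      /* device buffer holding a 1024x1024 matrix */
+       MPI_Datatype column;
+
+       MPI_Init(&argc, &argv);
+       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+       cudaMalloc((void **)&d_buf, 1024 * 1024 * sizeof(double));
+
+       /* One matrix column: 1024 blocks of 1 double, stride of 1024 doubles */
+       MPI_Type_vector(1024, 1, 1024, MPI_DOUBLE, &column);
+       MPI_Type_commit(&column);
+
+       if (0 == rank) {
+           /* The device pointer is passed directly to MPI; because the
+            * datatype is non-contiguous, the pieces may be staged through
+            * an intermediate buffer as described above. */
+           MPI_Send(d_buf, 1, column, 1, 0, MPI_COMM_WORLD);
+       } else if (1 == rank) {
+           MPI_Recv(d_buf, 1, column, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+       }
+
+       MPI_Type_free(&column);
+       cudaFree(d_buf);
+       MPI_Finalize();
+       return 0;
+   }
+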
- # configured with: --build=powerpc64le-redhat-linux-gnu --host=powerpc64le-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --enable-cma --without-cuda --without-gdrcopy --with-verbs --with-cm --with-knem --with-rdmacm --without-rocm --without-xpmem --without-ugni --without-java
+CUDA-aware support is available in:

-    If you need to build ucx yourself to include CUDA support, please
-    see the UCX documentation for `building ucx with Open MPI: `_
+* The UCX (``ucx``) PML.
+* The PSM2 (``psm2``) MTL with the CM (``cm``) PML.
+* The OFI (``ofi``) MTL with the CM (``cm``) PML.
+* Both CUDA-ized shared memory (``smcuda``) and TCP (``tcp``) BTLs
+  with the OB1 (``ob1``) PML.
+* The HCOLL (``hcoll``) COLL component.

-    It should look something like:
+See :ref:`this FAQ entry `
+for more details on which communication APIs are CUDA-aware.

-    .. code-block:: sh
+OFI support for CUDA
+^^^^^^^^^^^^^^^^^^^^

-       # Configure UCX this way
-       shell$ ./configure --prefix=/path/to/ucx-cuda-install --with-cuda=/usr/local/cuda --with-gdrcopy=/usr
+When running CUDA-aware Open MPI over Libfabric, the OFI MTL will
+check if there are any providers capable of handling GPU (or other
+accelerator) memory through the ``hmem``-related flags.  If a
+CUDA-capable provider is available, the OFI MTL will directly send
+GPU buffers through Libfabric's API after registering the memory.
+If there are no CUDA-capable providers available, the buffers will
+automatically be copied to host buffers before being transferred
+through Libfabric's API.

-    # Configure Open MPI this way
-    shell$ ./configure --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs/ --with-ucx=/path/to/ucx-cuda-install
+PSM2 support for CUDA
+^^^^^^^^^^^^^^^^^^^^^
+
+When running CUDA-aware Open MPI on Intel Omni-Path, the PSM2 MTL will
+automatically set the ``PSM2_CUDA`` environment variable, which enables
+PSM2 to handle GPU buffers.  If the user wants to use host buffers
+with a CUDA-aware Open MPI, it is recommended to set ``PSM2_CUDA``
+to ``0`` in the execution environment.  PSM2 also supports the
+NVIDIA GPUDirect feature.  To enable this, users will need to
+set ``PSM2_GPUDIRECT`` to ``1`` in the execution environment.
+
+Note: A PSM2 library and ``hfi1`` driver built with CUDA support are
+required to use GPUDirect on Intel Omni-Path.  The minimum
+PSM2 build version required is `PSM2 10.2.175
+`_.

-#. Via internal Open MPI CUDA support
+For more information, refer to the `Intel Omni-Path documentation
+`_.
+
+/////////////////////////////////////////////////////////////////////////
+
+How do I build Open MPI with CUDA-aware support?
+------------------------------------------------

-Regardless of which flavor of CUDA support (or both) you plan to use,
 Open MPI should be configured using the ``--with-cuda=`` and
 ``--with-cuda-libdir=`` configure options to
-build CUDA support into Open MPI.
+build CUDA support.  Open MPI supports building with CUDA libraries and
+running on systems without CUDA libraries or hardware.
In order to take advantage of @@ -68,31 +113,35 @@ An example configure command would look like the following: shell$ ./configure --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs \ --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda -///////////////////////////////////////////////////////////////////////// +Build with `UCX `_ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -How do I verify that Open MPI has been built with CUDA support? ---------------------------------------------------------------- +If you want to build Open MPI with UCX , it is important to ensure +that UCX itself is built with CUDA support, which can be verified +with the following command: -Verify that Open MPI has been built with cuda using ``ompi_info`` + .. code-block:: sh -.. code-block:: sh + # Check if ucx was built with CUDA support + shell$ ucx_info -v - # Use ompi_info to verify cuda support in Open MPI - shell$ ./ompi_info |grep "MPI extensions" - MPI extensions: affinity, cuda, pcollreq + # configured with: --build=powerpc64le-redhat-linux-gnu --host=powerpc64le-redhat-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --disable-optimizations --disable-logging --disable-debug --disable-assertions --enable-mt --disable-params-check --enable-cma --without-cuda --without-gdrcopy --with-verbs --with-cm --with-knem --with-rdmacm --without-rocm --without-xpmem --without-ugni --without-java -///////////////////////////////////////////////////////////////////////// + If you need to build ucx yourself to include CUDA support, please + see the UCX documentation for `building ucx with Open MPI: `_ -How do I run Open MPI with applications that pass CUDA buffers to MPI? ----------------------------------------------------------------------- + It should look something like: -Open MPI will detect and enable CUDA enabled components at runtime with -no additional mpirun parameters. + .. code-block:: sh -///////////////////////////////////////////////////////////////////////// + # Configure UCX this way + shell$ ./configure --prefix=/path/to/ucx-cuda-install --with-cuda=/usr/local/cuda --with-gdrcopy=/usr + + # Configure Open MPI this way + shell$ ./configure --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs/ --with-ucx=/path/to/ucx-cuda-install -How do I build Open MPI with CUDA-aware support using PGI? ----------------------------------------------------------- +Build with PGI +^^^^^^^^^^^^^^ With CUDA 6.5, you can build all versions of CUDA-aware Open MPI without doing anything special. However, with CUDA 7.0 and CUDA 7.5, @@ -109,102 +158,59 @@ correctly. Add the following to your configure line. ///////////////////////////////////////////////////////////////////////// -What kind of CUDA support exists in Open MPI? ---------------------------------------------- - -CUDA-aware support is defined as Open MPI automatically detecting that -the argument pointer being passed to an MPI routine is a CUDA device -memory pointer. - -See :ref:`this FAQ entry ` -for more details on which APIs are CUDA-aware. - - -.. error:: CUDA 4.0 is SUPER OLD! 
End users dont care about the - differences between cuda-aware, cuda-ipc, gpu-direct, and gpu-direct-rdma +How do I verify that Open MPI has been built with CUDA support? +--------------------------------------------------------------- -Open MPI depends on various features of CUDA 4.0, so one needs to have -at least the CUDA 4.0 driver and toolkit. The new features of -interest are the Unified Virtual Addressing (UVA) so that all pointers -within a program have unique addresses. In addition, there is a new -API that allows one to determine if a pointer is a CUDA device pointer -or host memory pointer. This API is used by the library to decide -what needs to be done with each buffer. In addition, CUDA 4.1 also -provides the ability to register host memory with the CUDA driver, -which can improve performance. CUDA 4.1 also added CUDA IPC support -for fast communication between GPUs on the same node. +Use the ``ompi_info`` command: -Note that derived datatypes |mdash| both contiguous and non-contiguous -|mdash| are supported. However, the non-contiguous datatypes -currently have high overhead because of the many calls to the CUDA -function ``cuMemcpy()`` to copy all the pieces of the buffer into the -intermediate buffer. +.. code-block:: -CUDA-aware support is available in: + shell$ ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + mca:mpi:base:param:mpi_built_with_cuda_support:value:true -* The UCX (``ucx``) PML -* The PSM2 (``psm2``) MTL with the CM (``cm``) PML. -* The OFI (``ofi``) MTL with the CM (``cm``) PML. -* Both CUDA-ized shared memory (``smcuda``) and TCP (``tcp``) BTLs - with the OB1 (``ob1``) PML. -* The HCOLL (``hcoll``) COLL +See :ref:`this FAQ entry ` +for more details on verifying CUDA support at run time. ///////////////////////////////////////////////////////////////////////// -PSM2 support for CUDA ---------------------- - -CUDA-aware support is present in PSM2 MTL. When running CUDA-aware -Open MPI on Intel Omni-path, the PSM2 MTL will automatically set -``PSM2_CUDA`` environment variable which enables PSM2 to handle GPU -buffers. If the user wants to use host buffers with a CUDA-aware Open -MPI, it is recommended to set ``PSM2_CUDA`` to ``0`` in the execution -environment. PSM2 also has support for the NVIDIA GPUDirect support -feature. To enable this, users will need to set ``PSM2_GPUDIRECT`` -to ``1`` in the execution environment. - -Note: The PSM2 library and ``hfi1`` driver with CUDA support are -requirements to use GPUDirect support on Intel Omni-Path. The minimum -PSM2 build version required is `PSM2 10.2.175 -`_. +How do I run Open MPI with CUDA applications? +--------------------------------------------- -For more information refer to the `Intel Omni-Path documentation -`_. +Open MPI will detect and enable CUDA enabled components at runtime with +no additional mpirun parameters. -///////////////////////////////////////////////////////////////////////// +How do I use CUDA-aware UCX for Open MPI? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -OFI support for CUDA ---------------------- - -CUDA-aware support is present in OFI MTL. When running CUDA-aware -Open MPI over Libfabric, the OFI MTL will check if there are any -providers capable of handling GPU (or other accelerator) memory -through the ``hmem``-related flags. If a CUDA-capable provider is -available, the OFI MTL will directly send GPU buffers through -Libfabric's API after registering the memory. 
If there are no -CUDA-capable providers available, the buffers will automatically -be copied to host buffers before being transferred through -Libfabric's API. +Example of running ``osu_latency`` from the `OSU benchmarks +`_ with CUDA buffers +using Open MPI and UCX CUDA support: -///////////////////////////////////////////////////////////////////////// +.. code-block:: + shell$ mpirun -n 2 --mca pml ucx \ + -x UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc ./osu_latency D D -How can I tell if Open MPI was built with CUDA support? -------------------------------------------------------- +How do I enable CUDA-aware support in HCOLL collective component? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Use the ``ompi_info`` command: +To enable CUDA GPU buffer support in HCOLL collectives pass the following +environment variables via mpirun: .. code-block:: - shell$ ompi_info --parsable --all | grep mpi_built_with_cuda_support:value - mca:mpi:base:param:mpi_built_with_cuda_support:value:true + shell$ mpirun -x HCOLL_GPU_ENABLE=1 -x HCOLL_ENABLE_NBC=1 .. + +See `nVidia HCOLL documentation `_ +for more information. ///////////////////////////////////////////////////////////////////////// -Can I get additional CUDA debug-level information at run-time? --------------------------------------------------------------- -Yes, by enabling some vebosity flags. +Get additional CUDA debug-level information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +More debug information can be obtained by enabling verbosity flags. * The ``opal_cuda_verbose`` parameter has only one level of verbosity: @@ -253,136 +259,6 @@ Yes, by enabling some vebosity flags. ///////////////////////////////////////////////////////////////////////// -.. _faq-cuda-mpi-cuda-numa-issues-label: - -NUMA Node Issues ----------------- - -When running on a node that has multiple GPUs, you may want to select -the GPU that is closest to the NUMA node on which your process is -running. One way to do this is to make use of the ``hwloc`` library. -The following is a C code snippet that can be used in your application -to select a GPU that is close. It will determine on which CPU it is -running and then look for the closest GPU. There could be multiple -GPUs that are the same distance away. This is dependent on having -``hwloc`` somewhere on your system. - -.. code-block:: c - - /** - * Test program to show the use of hwloc to select the GPU closest to the CPU - * that the MPI program is running on. Note that this works even without - * any libpciaccess or libpci support as it keys off the NVIDIA vendor ID. - * There may be other ways to implement this but this is one way. - * January 10, 2014 - */ - #include - #include - #include "cuda.h" - #include "mpi.h" - #include "hwloc.h" - - #define ABORT_ON_ERROR(func) \ - { CUresult res; \ - res = func; \ - if (CUDA_SUCCESS != res) { \ - printf("%s returned error=%d\n", #func, res); \ - abort(); \ - } \ - } - static hwloc_topology_t topology = NULL; - static int gpuIndex = 0; - static hwloc_obj_t gpus[16] = {0}; - - /** - * This function searches for all the GPUs that are hanging off a NUMA - * node. It walks through each of the PCI devices and looks for ones - * with the NVIDIA vendor ID. It then stores them into an array. - * Note that there can be more than one GPU on the NUMA node. 
- */ - static void find_gpus(hwloc_topology_t topology, hwloc_obj_t parent, hwloc_obj_t child) { - hwloc_obj_t pcidev; - pcidev = hwloc_get_next_child(topology, parent, child); - if (NULL == pcidev) { - return; - } else if (0 != pcidev->arity) { - /* This device has children so need to look recursively at them */ - find_gpus(topology, pcidev, NULL); - find_gpus(topology, parent, pcidev); - } else { - if (pcidev->attr->pcidev.vendor_id == 0x10de) { - gpus[gpuIndex++] = pcidev; - } - find_gpus(topology, parent, pcidev); - } - } - - int main(int argc, char *argv[]) - { - int rank, retval, length; - char procname[MPI_MAX_PROCESSOR_NAME+1]; - const unsigned long flags = HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES; - hwloc_cpuset_t newset; - hwloc_obj_t node, bridge; - char pciBusId[16]; - CUdevice dev; - char devName[256]; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (MPI_SUCCESS != MPI_Get_processor_name(procname, &length)) { - strcpy(procname, "unknown"); - } - - /* Now decide which GPU to pick. This requires hwloc to work properly. - * We first see which CPU we are bound to, then try and find a GPU nearby. - */ - retval = hwloc_topology_init(&topology); - assert(retval == 0); - retval = hwloc_topology_set_flags(topology, flags); - assert(retval == 0); - retval = hwloc_topology_load(topology); - assert(retval == 0); - newset = hwloc_bitmap_alloc(); - retval = hwloc_get_last_cpu_location(topology, newset, 0); - assert(retval == 0); - - /* Get the object that contains the cpuset */ - node = hwloc_get_first_largest_obj_inside_cpuset(topology, newset); - - /* Climb up from that object until we find the HWLOC_OBJ_NODE */ - while (node->type != HWLOC_OBJ_NODE) { - node = node->parent; - } - - /* Now look for the HWLOC_OBJ_BRIDGE. All PCI busses hanging off the - * node will have one of these */ - bridge = hwloc_get_next_child(topology, node, NULL); - while (bridge->type != HWLOC_OBJ_BRIDGE) { - bridge = hwloc_get_next_child(topology, node, bridge); - } - - /* Now find all the GPUs on this NUMA node and put them into an array */ - find_gpus(topology, bridge, NULL); - - ABORT_ON_ERROR(cuInit(0)); - /* Now select the first GPU that we find */ - if (gpus[0] == 0) { - printf("No GPU found\n"); - } else { - sprintf(pciBusId, "%.2x:%.2x:%.2x.%x", gpus[0]->attr->pcidev.domain, gpus[0]->attr->pcidev.bus, - gpus[0]->attr->pcidev.dev, gpus[0]->attr->pcidev.func); - ABORT_ON_ERROR(cuDeviceGetByPCIBusId(&dev, pciBusId)); - ABORT_ON_ERROR(cuDeviceGetName(devName, 256, dev)); - printf("rank=%d (%s): Selected GPU=%s, name=%s\n", rank, procname, pciBusId, devName); - } - - MPI_Finalize(); - return 0; - } - -///////////////////////////////////////////////////////////////////////// - How do I develop CUDA-aware Open MPI applications? -------------------------------------------------- @@ -410,135 +286,75 @@ example of how to write CUDA-aware MPI applications. .. _faq-cuda-mpi-apis-cuda-label: -Which MPI APIs work with CUDA-aware? ------------------------------------- +CUDA-aware support of communication APIs +---------------------------------------- + +The level of CUDA-aware support depends on the Open MPI build and the system +that it runs atop, and varies between components/modules. This section +provides general advice to applications looking to use GPU buffers. + +.. hint:: + + As CUDA-aware support evolves in newer Open MPI versions, this section + should be updated to reflect the latest status. 
If you find inaccuracies,
+   please submit a bug report or, at a minimum, update this document.
+   The following information was last updated in November 2023.
+
+APIs with CUDA-aware support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Point-to-point communication APIs support sending from and receiving
+into GPU buffers, including both blocking and non-blocking variants.
+
+Most blocking collective communication APIs support GPU send/receive
+buffers, with the **exception** of the reduction operations:

-* MPI_Allgather
-* MPI_Allgatherv
 * MPI_Allreduce
-* MPI_Alltoall
-* MPI_Alltoallv
-* MPI_Alltoallw
-* MPI_Bcast
-* MPI_Bsend
-* MPI_Bsend_init
-* MPI_Exscan
-* MPI_Ibsend
-* MPI_Irecv
-* MPI_Isend
-* MPI_Irsend
-* MPI_Issend
-* MPI_Gather
-* MPI_Gatherv
-* MPI_Get
-* MPI_Put
-* MPI_Rsend
-* MPI_Rsend_init
-* MPI_Recv
-* MPI_Recv_init
 * MPI_Reduce
 * MPI_Reduce_scatter
 * MPI_Reduce_scatter_block
-* MPI_Scan
-* MPI_Scatter
-* MPI_Scatterv
-* MPI_Send
-* MPI_Send_init
-* MPI_Sendrecv
-* MPI_Ssend
-* MPI_Ssend_init
-* MPI_Win_create
-
-.. FIXME: We need to verify the above list.

-/////////////////////////////////////////////////////////////////////////
+.. attention::

-Which MPI APIs do NOT work with CUDA-aware?
--------------------------------------------
+   Some blocking collective algorithms are implemented with non-blocking APIs,
+   and therefore lack CUDA-aware support.  In such cases, the application can
+   deselect the algorithm or component with the corresponding MCA parameters.
+
+One-sided communication APIs currently have limited CUDA-aware support.  Only
+``MPI_Get`` and ``MPI_Put`` support GPU buffers.
+
+APIs without CUDA-aware support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Non-blocking collective communication APIs generally do not have CUDA-aware
+support, nor do the following blocking reduction APIs:
+
+* MPI_Allreduce
+* MPI_Reduce
+* MPI_Reduce_scatter
+* MPI_Reduce_scatter_block
+
+One-sided communication APIs other than ``MPI_Get`` and ``MPI_Put`` do not
+fully support GPU buffers, including:

 * MPI_Accumulate
 * MPI_Compare_and_swap
 * MPI_Fetch_and_op
 * MPI_Get_Accumulate
-* MPI_Iallgather
-* MPI_Iallgatherv
-* MPI_Iallreduce
-* MPI_Ialltoall
-* MPI_Ialltoallv
-* MPI_Ialltoallw
-* MPI_Ibcast
-* MPI_Iexscan
 * MPI_Rget
 * MPI_Rput

-.. FIXME: We need to verify the above list.
-
-/////////////////////////////////////////////////////////////////////////
-
-How do I use CUDA-aware UCX for Open MPI?
------------------------------------------
-
-Example of running ``osu_latency`` from the `OSU benchmarks
-`_ with CUDA buffers
-using Open MPI and UCX CUDA support:
-
-.. code-block::
-
-   shell$ mpirun -n 2 --mca pml ucx \
-        -x UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc ./osu_latency D D
+UCX and UCC
+^^^^^^^^^^^

-/////////////////////////////////////////////////////////////////////////
+UCX and UCC support the CUDA-aware blocking reduction collective APIs:

-Which MPI APIs work with CUDA-aware UCX?
----------------------------------------- +UCX and UCC supports CUDA-aware blocking reduction collective APIs: -* MPI_Send -* MPI_Bsend -* MPI_Ssend -* MPI_Rsend -* MPI_Isend -* MPI_Ibsend -* MPI_Issend -* MPI_Irsend -* MPI_Send_init -* MPI_Bsend_init -* MPI_Ssend_init -* MPI_Rsend_init -* MPI_Recv -* MPI_Irecv -* MPI_Recv_init -* MPI_Sendrecv -* MPI_Bcast -* MPI_Gather -* MPI_Gatherv -* MPI_Allgather +* MPI_Allreduce * MPI_Reduce * MPI_Reduce_scatter * MPI_Reduce_scatter_block -* MPI_Allreduce -* MPI_Scan -* MPI_Exscan -* MPI_Allgatherv -* MPI_Alltoall -* MPI_Alltoallv -* MPI_Alltoallw -* MPI_Scatter -* MPI_Scatterv -* MPI_Iallgather -* MPI_Iallgatherv -* MPI_Ialltoall -* MPI_Iialltoallv -* MPI_Ialltoallw -* MPI_Ibcast -* MPI_Iexscan - -.. FIXME: We need to verify the above list. These _SHOULD_ be the same - as above. - -///////////////////////////////////////////////////////////////////////// -Which MPI APIs do NOT work with CUDA-aware UCX? ------------------------------------------------ +However, the following APIs do not support GPU buffers: * All one-sided operations such as MPI_Put, MPI_Get, MPI_Accumulate, MPI_Rget, MPI_Rput, MPI_Get_Accumulate, MPI_Fetch_and_op, @@ -550,9 +366,23 @@ Which MPI APIs do NOT work with CUDA-aware UCX? .. FIXME: Checking with nVidia. This may be more of an issue of OSC_UCX not supporting CUDA, though perhaps it's just performance. +HCOLL collective component +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +HCOLL collective component offers CUDA-aware support for the following APIs +in addition to the above: + +* MPI_Allreduce +* MPI_Ibarrier +* MPI_Ibcast +* MPI_Iallgather +* MPI_Iallreduce + ///////////////////////////////////////////////////////////////////////// -Can I tell at compile time or runtime whether I have CUDA-aware support? +.. _cuda-support-runtime-check-label: + +How do I verify CUDA-aware support at compile or run time? ------------------------------------------------------------------------ There is both a compile time check and a run-time check available. @@ -699,29 +529,132 @@ those to select a GPU, e.g. using MPI internal CUDA resources are released during MPI_Finalize. Thus it is an application error to call cudaDeviceReset before MPI_Finalize is called. - ///////////////////////////////////////////////////////////////////////// -How do I enable CUDA support in HCOLL collective component ----------------------------------------------------------- +.. _faq-cuda-mpi-cuda-numa-issues-label: + +NUMA Node Issues +---------------- + +When running on a node that has multiple GPUs, you may want to select +the GPU that is closest to the NUMA node on which your process is +running. One way to do this is to make use of the ``hwloc`` library. +The following is a C code snippet that can be used in your application +to select a GPU that is close. It will determine on which CPU it is +running and then look for the closest GPU. There could be multiple +GPUs that are the same distance away. This is dependent on having +``hwloc`` somewhere on your system. + +.. code-block:: c -HCOLL component supports CUDA GPU buffers for the following -collectives: + /** + * Test program to show the use of hwloc to select the GPU closest to the CPU + * that the MPI program is running on. Note that this works even without + * any libpciaccess or libpci support as it keys off the NVIDIA vendor ID. + * There may be other ways to implement this but this is one way. 
+ * January 10, 2014 + */ + #include + #include + #include "cuda.h" + #include "mpi.h" + #include "hwloc.h" -MPI_Allreduce -MPI_Bcast -MPI_Allgather -MPI_Ibarrier -MPI_Ibcast -MPI_Iallgather -MPI_Iallreduce + #define ABORT_ON_ERROR(func) \ + { CUresult res; \ + res = func; \ + if (CUDA_SUCCESS != res) { \ + printf("%s returned error=%d\n", #func, res); \ + abort(); \ + } \ + } + static hwloc_topology_t topology = NULL; + static int gpuIndex = 0; + static hwloc_obj_t gpus[16] = {0}; -To enable CUDA GPU buffer support in these collectives pass the -following environment variables via mpirun: + /** + * This function searches for all the GPUs that are hanging off a NUMA + * node. It walks through each of the PCI devices and looks for ones + * with the NVIDIA vendor ID. It then stores them into an array. + * Note that there can be more than one GPU on the NUMA node. + */ + static void find_gpus(hwloc_topology_t topology, hwloc_obj_t parent, hwloc_obj_t child) { + hwloc_obj_t pcidev; + pcidev = hwloc_get_next_child(topology, parent, child); + if (NULL == pcidev) { + return; + } else if (0 != pcidev->arity) { + /* This device has children so need to look recursively at them */ + find_gpus(topology, pcidev, NULL); + find_gpus(topology, parent, pcidev); + } else { + if (pcidev->attr->pcidev.vendor_id == 0x10de) { + gpus[gpuIndex++] = pcidev; + } + find_gpus(topology, parent, pcidev); + } + } -.. code-block:: + int main(int argc, char *argv[]) + { + int rank, retval, length; + char procname[MPI_MAX_PROCESSOR_NAME+1]; + const unsigned long flags = HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES; + hwloc_cpuset_t newset; + hwloc_obj_t node, bridge; + char pciBusId[16]; + CUdevice dev; + char devName[256]; - shell$ mpirun -x HCOLL_GPU_ENABLE=1 -x HCOLL_ENABLE_NBC=1 .. + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (MPI_SUCCESS != MPI_Get_processor_name(procname, &length)) { + strcpy(procname, "unknown"); + } -See `nVidia HCOLL documentation `_ -for more information. + /* Now decide which GPU to pick. This requires hwloc to work properly. + * We first see which CPU we are bound to, then try and find a GPU nearby. + */ + retval = hwloc_topology_init(&topology); + assert(retval == 0); + retval = hwloc_topology_set_flags(topology, flags); + assert(retval == 0); + retval = hwloc_topology_load(topology); + assert(retval == 0); + newset = hwloc_bitmap_alloc(); + retval = hwloc_get_last_cpu_location(topology, newset, 0); + assert(retval == 0); + + /* Get the object that contains the cpuset */ + node = hwloc_get_first_largest_obj_inside_cpuset(topology, newset); + + /* Climb up from that object until we find the HWLOC_OBJ_NODE */ + while (node->type != HWLOC_OBJ_NODE) { + node = node->parent; + } + + /* Now look for the HWLOC_OBJ_BRIDGE. 
All PCI busses hanging off the + * node will have one of these */ + bridge = hwloc_get_next_child(topology, node, NULL); + while (bridge->type != HWLOC_OBJ_BRIDGE) { + bridge = hwloc_get_next_child(topology, node, bridge); + } + + /* Now find all the GPUs on this NUMA node and put them into an array */ + find_gpus(topology, bridge, NULL); + + ABORT_ON_ERROR(cuInit(0)); + /* Now select the first GPU that we find */ + if (gpus[0] == 0) { + printf("No GPU found\n"); + } else { + sprintf(pciBusId, "%.2x:%.2x:%.2x.%x", gpus[0]->attr->pcidev.domain, gpus[0]->attr->pcidev.bus, + gpus[0]->attr->pcidev.dev, gpus[0]->attr->pcidev.func); + ABORT_ON_ERROR(cuDeviceGetByPCIBusId(&dev, pciBusId)); + ABORT_ON_ERROR(cuDeviceGetName(devName, 256, dev)); + printf("rank=%d (%s): Selected GPU=%s, name=%s\n", rank, procname, pciBusId, devName); + } + + MPI_Finalize(); + return 0; + }
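+
+If strict NUMA locality is not a concern, a simpler and commonly used
+alternative is to spread the node-local ranks across the visible devices in
+round-robin fashion.  The sketch below is not part of the example above; it
+assumes only the CUDA runtime API and an MPI-3 library, and the round-robin
+policy itself is an illustrative choice.
+
+.. code-block:: c
+
+   #include <stdio.h>
+   #include "cuda_runtime.h"
+   #include "mpi.h"
+
+   int main(int argc, char *argv[])
+   {
+       int local_rank, num_devices = 0;
+       MPI_Comm local_comm;
+
+       MPI_Init(&argc, &argv);
+
+       /* Ranks that share a node land in the same "shared" communicator */
+       MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                           MPI_INFO_NULL, &local_comm);
+       MPI_Comm_rank(local_comm, &local_rank);
+
+       cudaGetDeviceCount(&num_devices);
+       if (num_devices > 0) {
+           /* Round-robin the node-local ranks over the visible devices */
+           cudaSetDevice(local_rank % num_devices);
+           printf("local rank %d -> device %d\n", local_rank,
+                  local_rank % num_devices);
+       } else {
+           printf("local rank %d: no CUDA device found\n", local_rank);
+       }
+
+       MPI_Comm_free(&local_comm);
+       MPI_Finalize();
+       return 0;
+   }
+
+Because devices are assigned purely by local rank, two ranks can share a GPU
+when there are more ranks than devices on a node; combine this with the hwloc
+approach above when NUMA placement matters.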