Release pipeline: drop the Ubuntu 16.04 release target, install
libnvidia-compute-<DRIVER_VERSION> in the Ubuntu release images instead of
relying on the cuda-compat libraries, and strip the versioned
libnvidia-compute dependency from the generated ucx-cuda DEB.

Review notes:
- The ucx-cuda post-processing step extracts "ucx-cuda-${VER}.deb" (the name
  produced by the preceding rename step) and rebuilds into that same file.
  The earlier revision extracted "ucx-cuda${VER}.deb", a name the rename step
  never creates, and wrote the result to ucx-cuda_new.deb, so the unmodified
  package stayed in place and both files matched the "*.deb" glob of the
  final tar.
- Indentation and the position of whitespace-only lines were reconstructed
  (blank lines are placed so that every hunk count matches). Apply with a
  whitespace-tolerant option such as `git apply --ignore-whitespace`, or
  regenerate the patch against the tree.
- NOTE(review): the two Build-Profiles lines in debian/control.in carry no
  visible value in the reviewed text; confirm them against the original change.

diff --git a/buildlib/az-distro-release.yml b/buildlib/az-distro-release.yml
index ebbe1a15e80a..8bd8c508cc4e 100644
--- a/buildlib/az-distro-release.yml
+++ b/buildlib/az-distro-release.yml
@@ -22,9 +22,6 @@ jobs:
         centos8_cuda11:
           build_container: centos8_cuda11
           artifact_name: $(POSTFIX)-centos8-mofed5-cuda11.tar.bz2
-        ubuntu16_cuda11:
-          build_container: ubuntu16_cuda11
-          artifact_name: $(POSTFIX)-ubuntu16.04-mofed5-cuda11.tar.bz2
         ubuntu18_cuda11:
           build_container: ubuntu18_cuda11
           artifact_name: $(POSTFIX)-ubuntu18.04-mofed5-cuda11.tar.bz2
@@ -69,13 +66,22 @@ jobs:
           tar -xzvf ${tarball} # extract the sources in a subdirectory
           cd $(tar tf ${tarball} | head -1) # go to extracted tarball directory
           echo 10 > debian/compat # https://www.debian.org/doc/manuals/maint-guide/dother.en.htmdpl#compat
-          dpkg-buildpackage -us -uc -Pcuda
+          dpkg-buildpackage -d -us -uc -Pcuda
           cd .. # Move back to the working directory
           find . -name '*.deb'
-          VER="${POSTFIX#ucx-}" # Remove 'ucx' prefix from the POSTFIX string
-          # Rename DEB files
+
+          # Rename DEB files
+          VER="${POSTFIX#ucx-}" # Remove 'ucx-' prefix from the POSTFIX string
           find . -name "ucx*.deb" -exec bash -c 'mv "$1" "${1%%_*}-'"${VER}"'.deb"' _ {} \;
+
+          # Remove ucx-cuda dependency on a specific libnvidia-compute version
+          dpkg-deb -R "ucx-cuda-${VER}.deb" tmp # Extract
+          sed -i 's/libnvidia-compute-[0-9]\+,//g' tmp/DEBIAN/control
+          dpkg-deb -b tmp "ucx-cuda-${VER}.deb" # Rebuild in place
           dpkg-deb -I "ucx-${VER}.deb"
+          dpkg-deb -I "ucx-cuda-${VER}.deb"
+
+          # Package
           tar -cjf "${AZ_ARTIFACT_NAME}" *.deb # Package all DEBs
           tar -tjf "${AZ_ARTIFACT_NAME}"
         displayName: Build DEB package
diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml
index 0e8b796ffd11..c13bf6e8a2c4 100644
--- a/buildlib/azure-pipelines-release.yml
+++ b/buildlib/azure-pipelines-release.yml
@@ -19,14 +19,12 @@ resources:
       options: $(DOCKER_OPT_VOLUMES)
     - container: centos8_cuda11
       image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos8-mofed5-cuda11:2
-    - container: ubuntu16_cuda11
-      image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu16.04-mofed5-cuda11:3
     - container: ubuntu18_cuda11
       image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5-cuda11:3
     - container: ubuntu20_cuda11
       image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu20.04-mofed5-cuda11:3
     - container: ubuntu22_cuda11
-      image: rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/ubuntu22.04-mofed5-cuda11:3
+      image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu22.04-mofed5-cuda11:3
 
 stages:
   - stage: Prepare
diff --git a/buildlib/dockers/docker-compose.yml b/buildlib/dockers/docker-compose.yml
index f91d973c8d93..1e6cebeb7882 100644
--- a/buildlib/dockers/docker-compose.yml
+++ b/buildlib/dockers/docker-compose.yml
@@ -1,5 +1,8 @@
 version: "3.4"
 
+# Find driver version based on CUDA version, OS and CPU arch (515 in this case):
+# https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
+
 services:
   centos7-mofed5-cuda11:
     image: centos7-mofed5-cuda11:2
@@ -34,16 +37,6 @@ services:
         MOFED_OS: rhel8.2
         CUDA_VERSION: 11.4.0
         OS_VERSION: 8
-  ubuntu16.04-mofed5-cuda11:
-    image: ubuntu16.04-mofed5-cuda11:3
-    build:
-      context: .
-      network: host
-      dockerfile: ubuntu-release.Dockerfile
-      args:
-        MOFED_VERSION: 5.0-1.0.0.0
-        UBUNTU_VERSION: 16.04
-        CUDA_VERSION: 11.2.0
   ubuntu18.04-mofed5-cuda11:
     image: ubuntu18.04-mofed5-cuda11:3
     build:
@@ -54,6 +47,7 @@ services:
         MOFED_VERSION: 5.0-1.0.0.0
         UBUNTU_VERSION: 18.04
         CUDA_VERSION: 11.4.0
+        DRIVER_VERSION: 470
   ubuntu20.04-mofed5-cuda11:
     image: ubuntu20.04-mofed5-cuda11:3
     build:
@@ -64,6 +58,7 @@ services:
         MOFED_VERSION: 5.0-1.0.0.0
         UBUNTU_VERSION: 20.04
         CUDA_VERSION: 11.4.0
+        DRIVER_VERSION: 470
   ubuntu22.04-mofed5-cuda11:
     image: ubuntu22.04-mofed5-cuda11:3
     build:
@@ -74,3 +69,4 @@ services:
         MOFED_VERSION: 5.4-3.6.8.1
         UBUNTU_VERSION: 22.04
         CUDA_VERSION: 11.7.0
+        DRIVER_VERSION: 515
diff --git a/buildlib/dockers/push-release-images.sh b/buildlib/dockers/push-release-images.sh
index 3db6de93984e..07ba20475989 100755
--- a/buildlib/dockers/push-release-images.sh
+++ b/buildlib/dockers/push-release-images.sh
@@ -1,11 +1,11 @@
-#!/bin/bash -eE
+#!/bin/bash -eEx
 # shellcheck disable=SC2086
 
 basedir=$(cd "$(dirname $0)" && pwd)
 
 registry=harbor.mellanox.com/ucx
 
-images=$(awk '/image:/ {print $2}' "${basedir}/docker-compose.yml")
+images=$(awk '!/#/ && /image:/ {print $2}' "${basedir}/docker-compose.yml")
 for img in $images; do
     target_name="${registry}/${img}"
     docker tag ${img} ${target_name}
diff --git a/buildlib/dockers/ubuntu-release.Dockerfile b/buildlib/dockers/ubuntu-release.Dockerfile
index 6f463961def3..b9b7f84c6756 100644
--- a/buildlib/dockers/ubuntu-release.Dockerfile
+++ b/buildlib/dockers/ubuntu-release.Dockerfile
@@ -1,10 +1,12 @@
-ARG CUDA_VERSION=10.1
-ARG UBUNTU_VERSION=16.04
+ARG CUDA_VERSION
+ARG UBUNTU_VERSION
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG DRIVER_VERSION
 
 RUN apt-get update && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
     apt-get install -y \
+        apt-file \
         automake \
         default-jdk \
         dh-make \
@@ -14,13 +16,15 @@ RUN apt-get update && \
         libcap2 \
         libnuma-dev \
         libtool \
+        libnvidia-compute-${DRIVER_VERSION} \
         make \
         maven \
         udev \
         wget \
         environment-modules \
         pkg-config \
-    && apt-get remove -y openjdk-11-* || apt-get autoremove -y \
+        sudo \
+    && apt-get remove -y openjdk-11-* cuda-compat* || apt-get autoremove -y \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # MOFED
@@ -43,8 +47,5 @@ RUN ${MOFED_DIR}/mlnxofedinstall --all -q \
     rm -rf ${MOFED_DIR} && rm -rf *.tgz
 
 ENV CPATH /usr/local/cuda/include:${CPATH}
-ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH}
-ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH}
-ENV PATH /usr/local/cuda/compat:${PATH}
-
-RUN ml_stub=$(find /usr -name libnvidia-ml.so) && ln -s $ml_stub /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:${LD_LIBRARY_PATH}
+ENV LIBRARY_PATH /usr/local/cuda/lib64:${LIBRARY_PATH}
diff --git a/debian/control.in b/debian/control.in
index b620b13bdba3..f632e500532d 100644
--- a/debian/control.in
+++ b/debian/control.in
@@ -41,7 +41,7 @@ Package: ucx-gdrcopy
 Section: libs
 Depends: ${misc:Depends}, ${shlibs:Depends}
 Architecture: any
-Build-Profiles:
+Build-Profiles:
 Description: Unified Communication X - gdrcopy support
   UCX is a communication library implementing high-performance messaging.
   .