From cd8182b6ad479dcc5a3195c1faf816aef4d25057 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Sun, 26 Feb 2023 11:33:54 +0200 Subject: [PATCH] AZP/RELEASE: rm cuda-compat & separate gdrcopy --- buildlib/az-distro-release.yml | 3 ++- buildlib/azure-pipelines-release.yml | 2 +- buildlib/dockers/docker-compose.yml | 16 ++++++---------- buildlib/dockers/push-release-images.sh | 4 ++-- buildlib/dockers/ubuntu-release.Dockerfile | 11 ++++++----- debian/control.in | 2 +- 6 files changed, 18 insertions(+), 20 deletions(-) diff --git a/buildlib/az-distro-release.yml b/buildlib/az-distro-release.yml index ebbe1a15e80a..deae0ed075fc 100644 --- a/buildlib/az-distro-release.yml +++ b/buildlib/az-distro-release.yml @@ -72,10 +72,11 @@ jobs: dpkg-buildpackage -us -uc -Pcuda cd .. # Move back to the working directory find . -name '*.deb' - VER="${POSTFIX#ucx-}" # Remove 'ucx' prefix from the POSTFIX string + VER="${POSTFIX#ucx-}" # Remove 'ucx-' prefix from the POSTFIX string # Rename DEB files find . -name "ucx*.deb" -exec bash -c 'mv "$1" "${1%%_*}-'"${VER}"'.deb"' _ {} \; dpkg-deb -I "ucx-${VER}.deb" + dpkg-deb -I "ucx-cuda-${VER}.deb" tar -cjf "${AZ_ARTIFACT_NAME}" *.deb # Package all DEBs tar -tjf "${AZ_ARTIFACT_NAME}" displayName: Build DEB package diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index 0e8b796ffd11..6653edb98b4c 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -26,7 +26,7 @@ resources: - container: ubuntu20_cuda11 image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu20.04-mofed5-cuda11:3 - container: ubuntu22_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/ubuntu22.04-mofed5-cuda11:3 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu22.04-mofed5-cuda11:3 stages: - stage: Prepare diff --git a/buildlib/dockers/docker-compose.yml b/buildlib/dockers/docker-compose.yml index f91d973c8d93..0bf96c2dd56c 100644 --- a/buildlib/dockers/docker-compose.yml +++ b/buildlib/dockers/docker-compose.yml @@ -1,5 +1,8 @@ version: "3.4" +# CUDA runtime <-> driver versions compatibility per OS: +# https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local + services: centos7-mofed5-cuda11: image: centos7-mofed5-cuda11:2 @@ -34,16 +37,6 @@ services: MOFED_OS: rhel8.2 CUDA_VERSION: 11.4.0 OS_VERSION: 8 - ubuntu16.04-mofed5-cuda11: - image: ubuntu16.04-mofed5-cuda11:3 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 16.04 - CUDA_VERSION: 11.2.0 ubuntu18.04-mofed5-cuda11: image: ubuntu18.04-mofed5-cuda11:3 build: @@ -54,6 +47,7 @@ services: MOFED_VERSION: 5.0-1.0.0.0 UBUNTU_VERSION: 18.04 CUDA_VERSION: 11.4.0 + DRIVER_VERSION: 470 ubuntu20.04-mofed5-cuda11: image: ubuntu20.04-mofed5-cuda11:3 build: @@ -64,6 +58,7 @@ services: MOFED_VERSION: 5.0-1.0.0.0 UBUNTU_VERSION: 20.04 CUDA_VERSION: 11.4.0 + DRIVER_VERSION: 470 ubuntu22.04-mofed5-cuda11: image: ubuntu22.04-mofed5-cuda11:3 build: @@ -74,3 +69,4 @@ services: MOFED_VERSION: 5.4-3.6.8.1 UBUNTU_VERSION: 22.04 CUDA_VERSION: 11.7.0 + DRIVER_VERSION: 515 diff --git a/buildlib/dockers/push-release-images.sh b/buildlib/dockers/push-release-images.sh index 3db6de93984e..07ba20475989 100755 --- a/buildlib/dockers/push-release-images.sh +++ b/buildlib/dockers/push-release-images.sh @@ -1,11 +1,11 @@ -#!/bin/bash -eE +#!/bin/bash -eEx # shellcheck disable=SC2086 basedir=$(cd "$(dirname $0)" && pwd) registry=harbor.mellanox.com/ucx -images=$(awk '/image:/ {print $2}' "${basedir}/docker-compose.yml") +images=$(awk '!/#/ && /image:/ {print $2}' "${basedir}/docker-compose.yml") for img in $images; do target_name="${registry}/${img}" docker tag ${img} ${target_name} diff --git a/buildlib/dockers/ubuntu-release.Dockerfile b/buildlib/dockers/ubuntu-release.Dockerfile index 6f463961def3..f4e57fd5e8aa 100644 --- a/buildlib/dockers/ubuntu-release.Dockerfile +++ b/buildlib/dockers/ubuntu-release.Dockerfile @@ -1,7 +1,8 @@ -ARG CUDA_VERSION=10.1 -ARG UBUNTU_VERSION=16.04 +ARG CUDA_VERSION +ARG UBUNTU_VERSION FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +ARG DRIVER_VERSION RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \ apt-get install -y \ @@ -14,13 +15,15 @@ RUN apt-get update && \ libcap2 \ libnuma-dev \ libtool \ + libnvidia-compute-${DRIVER_VERSION} \ make \ maven \ udev \ wget \ environment-modules \ pkg-config \ - && apt-get remove -y openjdk-11-* || apt-get autoremove -y \ + sudo \ + && apt-get remove -y openjdk-11-* cuda-compat* || apt-get autoremove -y \ && apt-get clean && rm -rf /var/lib/apt/lists/* # MOFED @@ -46,5 +49,3 @@ ENV CPATH /usr/local/cuda/include:${CPATH} ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} ENV PATH /usr/local/cuda/compat:${PATH} - -RUN ml_stub=$(find /usr -name libnvidia-ml.so) && ln -s $ml_stub /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 diff --git a/debian/control.in b/debian/control.in index b620b13bdba3..f632e500532d 100644 --- a/debian/control.in +++ b/debian/control.in @@ -41,7 +41,7 @@ Package: ucx-gdrcopy Section: libs Depends: ${misc:Depends}, ${shlibs:Depends} Architecture: any -Build-Profiles: +Build-Profiles: Description: Unified Communication X - gdrcopy support UCX is a communication library implementing high-performance messaging. .