diff --git a/buildlib/az-network-corrupter.sh b/buildlib/az-network-corrupter.sh index f1649ff72a0..a6a66d3ff44 100644 --- a/buildlib/az-network-corrupter.sh +++ b/buildlib/az-network-corrupter.sh @@ -3,7 +3,7 @@ echo "Running $0 $*..." eval "$*" initial_delay=${initial_delay:=10} -interface=${interface:=bond0} +interfaces=${interfaces:=bond0} cycles=${cycles:=1000} downtime=${downtime:=5} uptime=${uptime:=20} @@ -14,8 +14,10 @@ manager_script_dir=/hpc/noarch/git_projects/hpc-mtt-conf/scripts manager_script=${manager_script_dir}/switch_port_on_off.py if [ "x$reset" = "xyes" ]; then - echo "Resetting interface ${interface} on $(hostname) interface ..." - ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + for interface in ${interfaces} ; do + echo "Resetting interface ${interface} on $(hostname) interface ..." + ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + done sleep "$uptime" exit $? fi @@ -25,10 +27,14 @@ sleep ${initial_delay} for i in $(seq 1 ${cycles}); do echo "#$i Put it down! And sleep ${downtime}" - ${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface} + for interface in ${interfaces} ; do + ${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface} + done sleep "$downtime" echo "#$i Put it up! And sleep ${uptime}" - ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + for interface in ${interfaces} ; do + ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + done sleep "$uptime" done diff --git a/buildlib/pr/io_demo/az-stage-io-demo.yaml b/buildlib/pr/io_demo/az-stage-io-demo.yaml index 943737bf750..132562aa508 100644 --- a/buildlib/pr/io_demo/az-stage-io-demo.yaml +++ b/buildlib/pr/io_demo/az-stage-io-demo.yaml @@ -13,13 +13,18 @@ parameters: default: 20 - name: analyzer_allow_list_args default: '' +- name: interference + default: 'Yes' - name: extra_run_args default: '' steps: - bash: | set -eEx - $(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }} + echo "##vso[task.setvariable variable=interference]${{ parameters.interference }}" + for interface in ${{ parameters.roce_iface }} ; do + $(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface} + done displayName: Restore port state condition: always() timeoutInMinutes: 2 @@ -32,7 +37,7 @@ steps: cycles=$(cycles) \ downtime=$(downtime) \ uptime=$(uptime) \ - interface=${{ parameters.roce_iface }} \ + interfaces="${{ parameters.roce_iface }}" \ |& add_timestamp &>corrupter.log & while ! pgrep -u "$USER" -f 'network-corrupter' do @@ -43,6 +48,7 @@ steps: echo "corrupter_pid=$corrupter_pid" azure_set_variable "corrupter_pid" "$corrupter_pid" displayName: Start network corrupter + condition: eq(variables['interference'], 'Yes') timeoutInMinutes: 2 - bash: | @@ -50,20 +56,28 @@ steps: sudo /hpc/local/bin/lshca mkdir -p $(workspace)/${{ parameters.name }} # set UCX environment variables - export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${{parameters.roce_iface}}' .*/\1:\2/p') + net_devices="" + for interface in ${{ parameters.roce_iface }} ; do + net_device=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${interface}' .*/\1:\2/p') + net_devices="${net_devices} ${net_device}" + done + export UCX_NET_DEVICES=$(echo ${net_devices##*( )} | tr " ", ",") export UCX_TLS=${{ parameters.iodemo_tls }} export UCX_RNDV_THRESH=4k export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH + export UCX_LOG_LEVEL=info + num_tasks=$(echo ${{ parameters.roce_iface }} | wc -w) $(workspace)/test/apps/iodemo/run_io_demo.sh \ -H $(agent_hosts) \ - --tasks-per-node 1 \ + --tasks-per-node ${num_tasks} \ --duration ${{ parameters.duration }} \ -v \ - --num-clients 1 \ - --num-servers 1 \ + --bind \ + --num-clients ${num_tasks} \ + --num-servers ${num_tasks} \ --map-by slot \ --log-dir $(workspace)/${{ parameters.name }} \ - -i ${{ parameters.roce_iface }} \ + -i $(echo ${{ parameters.roce_iface }} | tr " " ",") \ ${{ parameters.extra_run_args }} \ $(io_demo_exe) \ -d 512:524288 \ @@ -82,11 +96,22 @@ steps: analyzer_args="-d $(workspace)/${{ parameters.name }}" analyzer_args="$analyzer_args --duration ${{ parameters.duration }}" analyzer_args="$analyzer_args -t 3" + if [ '${{ parameters.interference }}' == 'No' ] ; then + analyzer_args="$analyzer_args --no-allow-list" + fi analyzer_args="$analyzer_args ${{ parameters.analyzer_allow_list_args }}" python ${analyzer} ${analyzer_args} displayName: Analyze for ${{ parameters.name }} timeoutInMinutes: 1 +- task: PublishBuildArtifacts@1 + inputs: + pathToPublish: '$(workspace)/${{ parameters.name }}' + artifactName: log_${{ parameters.name }}_$(Build.BuildId) + displayName: Publish logs for ${{ parameters.name }} + condition: failed() + timeoutInMinutes: 2 + - bash: | set -eEx pid=$(corrupter_pid) @@ -99,12 +124,14 @@ steps: fi cat corrupter.log displayName: Kill corrupter - condition: always() + condition: and(always(), eq(variables['interference'], 'Yes')) timeoutInMinutes: 2 - bash: | set -eEx - $(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }} + for interface in ${{ parameters.roce_iface }} ; do + $(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface} + done displayName: Restore port state condition: always() timeoutInMinutes: 2 diff --git a/buildlib/pr/io_demo/io-demo.yml b/buildlib/pr/io_demo/io-demo.yml index 7b4ad2b813d..643a665ed7c 100644 --- a/buildlib/pr/io_demo/io-demo.yml +++ b/buildlib/pr/io_demo/io-demo.yml @@ -18,36 +18,36 @@ parameters: type: object default: "tag match on CX4": - args: "" duration: 600 interface: $(roce_iface_cx4) tls: "rc_x" test_name: tag_cx4_rc - "tag match on CX6": - args: "" + extra_run_args: "" + "tag match on CX6/RC": duration: 600 interface: $(roce_iface_cx6) tls: "rc_x" test_name: tag_cx6_rc - "tag match on CX6 with user memh": + "tag match on CX4 with user memh": args: "-z" - duration: 600 - interface: $(roce_iface_cx6) - tls: "rc_x" - test_name: tag_umemh_cx6_rc - "server one path compatible on CX4": - args: "" - initial_delay: 9999 # No interference + interference: 'No' duration: 120 interface: $(roce_iface_cx4) tls: "rc_x" - extra_run_args: "--env server UCX_IB_NUM_PATHS=1 --env client UCX_IB_NUM_PATHS=2" - test_name: tag_cx4_rc_server_one_path - "client one path compatible on CX4": - args: "" - initial_delay: 9999 # No interference + test_name: tag_umemh_cx4_rc + extra_run_args: "" + "Multi HCAs: tag match on CX4 and CX6 RC ": + interference: 'No' + duration: 60 + interface: $(roce_iface_cx4) $(roce_iface_cx6) + tls: "rc_x" + test_name: multy_cx6_cx4_rc + extra_run_args: "" + "different UCX_IB_NUM_PATH": + args: "-A" + interference: 'No' duration: 120 - interface: $(roce_iface_cx4) + interface: $(roce_iface_cx6) tls: "rc_x" extra_run_args: "--env server UCX_IB_NUM_PATHS=2 --env client UCX_IB_NUM_PATHS=1" test_name: tag_cx4_rc_client_one_path @@ -71,7 +71,7 @@ jobs: - bash: | set -eEx ./autogen.sh - ./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install --without-java + ./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install make -j`nproc` make install # build static modules @@ -120,6 +120,7 @@ jobs: test_ucx_tls: ${{ test.Value.tls }} test_extra_run_args: ${{ test.Value.extra_run_args }} initial_delay: ${{ coalesce(test.Value.initial_delay, parameters.initial_delay) }} + test_interference: ${{ test.Value.interference }} maxParallel: 1 variables: @@ -147,6 +148,7 @@ jobs: roce_iface: $(test_intefrafe) iodemo_tls: $(test_ucx_tls) initial_delay: $(initial_delay) + interference: $(test_interference) extra_run_args: $(test_extra_run_args) ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: analyzer_allow_list_args: '--allow_list $(System.PullRequest.TargetBranch)' diff --git a/test/apps/iodemo/run_io_demo.sh b/test/apps/iodemo/run_io_demo.sh index f24302f54cb..c263c0ec19d 100755 --- a/test/apps/iodemo/run_io_demo.sh +++ b/test/apps/iodemo/run_io_demo.sh @@ -81,7 +81,7 @@ check_slurm_env() init_config() { verbose=0 - net_if="bond0" + net_ifs="bond0" duration=30 bind_local=0 base_port_num=20000 @@ -108,7 +108,7 @@ show_config() { echo "Launch configuration:" for key in \ - host_list net_if duration bind_local base_port_num \ + host_list net_ifs duration bind_local base_port_num \ num_clients num_servers tasks_per_node map_by \ client_wait_time launcher dry_run log_dir \ iodemo_exe iodemo_client_args @@ -132,7 +132,7 @@ usage() echo " -h|--help Show this help message" echo " -v|--verbose Turn on verbosity" echo " -H|--hostlist