diff --git a/buildlib/az-network-corrupter.sh b/buildlib/az-network-corrupter.sh index f1649ff72a0..a6a66d3ff44 100644 --- a/buildlib/az-network-corrupter.sh +++ b/buildlib/az-network-corrupter.sh @@ -3,7 +3,7 @@ echo "Running $0 $*..." eval "$*" initial_delay=${initial_delay:=10} -interface=${interface:=bond0} +interfaces=${interfaces:=bond0} cycles=${cycles:=1000} downtime=${downtime:=5} uptime=${uptime:=20} @@ -14,8 +14,10 @@ manager_script_dir=/hpc/noarch/git_projects/hpc-mtt-conf/scripts manager_script=${manager_script_dir}/switch_port_on_off.py if [ "x$reset" = "xyes" ]; then - echo "Resetting interface ${interface} on $(hostname) interface ..." - ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + for interface in ${interfaces} ; do + echo "Resetting interface ${interface} on $(hostname) interface ..." + ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + done sleep "$uptime" exit $? fi @@ -25,10 +27,14 @@ sleep ${initial_delay} for i in $(seq 1 ${cycles}); do echo "#$i Put it down! And sleep ${downtime}" - ${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface} + for interface in ${interfaces} ; do + ${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface} + done sleep "$downtime" echo "#$i Put it up! And sleep ${uptime}" - ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + for interface in ${interfaces} ; do + ${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface} + done sleep "$uptime" done diff --git a/buildlib/pr/io_demo/az-stage-io-demo.yaml b/buildlib/pr/io_demo/az-stage-io-demo.yaml index 943737bf750..132562aa508 100644 --- a/buildlib/pr/io_demo/az-stage-io-demo.yaml +++ b/buildlib/pr/io_demo/az-stage-io-demo.yaml @@ -13,13 +13,18 @@ parameters: default: 20 - name: analyzer_allow_list_args default: '' +- name: interference + default: 'Yes' - name: extra_run_args default: '' steps: - bash: | set -eEx - $(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }} + echo "##vso[task.setvariable variable=interference]${{ parameters.interference }}" + for interface in ${{ parameters.roce_iface }} ; do + $(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface} + done displayName: Restore port state condition: always() timeoutInMinutes: 2 @@ -32,7 +37,7 @@ steps: cycles=$(cycles) \ downtime=$(downtime) \ uptime=$(uptime) \ - interface=${{ parameters.roce_iface }} \ + interfaces="${{ parameters.roce_iface }}" \ |& add_timestamp &>corrupter.log & while ! pgrep -u "$USER" -f 'network-corrupter' do @@ -43,6 +48,7 @@ steps: echo "corrupter_pid=$corrupter_pid" azure_set_variable "corrupter_pid" "$corrupter_pid" displayName: Start network corrupter + condition: eq(variables['interference'], 'Yes') timeoutInMinutes: 2 - bash: | @@ -50,20 +56,28 @@ steps: sudo /hpc/local/bin/lshca mkdir -p $(workspace)/${{ parameters.name }} # set UCX environment variables - export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${{parameters.roce_iface}}' .*/\1:\2/p') + net_devices="" + for interface in ${{ parameters.roce_iface }} ; do + net_device=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${interface}' .*/\1:\2/p') + net_devices="${net_devices} ${net_device}" + done + export UCX_NET_DEVICES=$(echo ${net_devices##*( )} | tr " ", ",") export UCX_TLS=${{ parameters.iodemo_tls }} export UCX_RNDV_THRESH=4k export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH + export UCX_LOG_LEVEL=info + num_tasks=$(echo ${{ parameters.roce_iface }} | wc -w) $(workspace)/test/apps/iodemo/run_io_demo.sh \ -H $(agent_hosts) \ - --tasks-per-node 1 \ + --tasks-per-node ${num_tasks} \ --duration ${{ parameters.duration }} \ -v \ - --num-clients 1 \ - --num-servers 1 \ + --bind \ + --num-clients ${num_tasks} \ + --num-servers ${num_tasks} \ --map-by slot \ --log-dir $(workspace)/${{ parameters.name }} \ - -i ${{ parameters.roce_iface }} \ + -i $(echo ${{ parameters.roce_iface }} | tr " " ",") \ ${{ parameters.extra_run_args }} \ $(io_demo_exe) \ -d 512:524288 \ @@ -82,11 +96,22 @@ steps: analyzer_args="-d $(workspace)/${{ parameters.name }}" analyzer_args="$analyzer_args --duration ${{ parameters.duration }}" analyzer_args="$analyzer_args -t 3" + if [ '${{ parameters.interference }}' == 'No' ] ; then + analyzer_args="$analyzer_args --no-allow-list" + fi analyzer_args="$analyzer_args ${{ parameters.analyzer_allow_list_args }}" python ${analyzer} ${analyzer_args} displayName: Analyze for ${{ parameters.name }} timeoutInMinutes: 1 +- task: PublishBuildArtifacts@1 + inputs: + pathToPublish: '$(workspace)/${{ parameters.name }}' + artifactName: log_${{ parameters.name }}_$(Build.BuildId) + displayName: Publish logs for ${{ parameters.name }} + condition: failed() + timeoutInMinutes: 2 + - bash: | set -eEx pid=$(corrupter_pid) @@ -99,12 +124,14 @@ steps: fi cat corrupter.log displayName: Kill corrupter - condition: always() + condition: and(always(), eq(variables['interference'], 'Yes')) timeoutInMinutes: 2 - bash: | set -eEx - $(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }} + for interface in ${{ parameters.roce_iface }} ; do + $(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface} + done displayName: Restore port state condition: always() timeoutInMinutes: 2 diff --git a/buildlib/pr/io_demo/io-demo.yml b/buildlib/pr/io_demo/io-demo.yml index 7b4ad2b813d..643a665ed7c 100644 --- a/buildlib/pr/io_demo/io-demo.yml +++ b/buildlib/pr/io_demo/io-demo.yml @@ -18,36 +18,36 @@ parameters: type: object default: "tag match on CX4": - args: "" duration: 600 interface: $(roce_iface_cx4) tls: "rc_x" test_name: tag_cx4_rc - "tag match on CX6": - args: "" + extra_run_args: "" + "tag match on CX6/RC": duration: 600 interface: $(roce_iface_cx6) tls: "rc_x" test_name: tag_cx6_rc - "tag match on CX6 with user memh": + "tag match on CX4 with user memh": args: "-z" - duration: 600 - interface: $(roce_iface_cx6) - tls: "rc_x" - test_name: tag_umemh_cx6_rc - "server one path compatible on CX4": - args: "" - initial_delay: 9999 # No interference + interference: 'No' duration: 120 interface: $(roce_iface_cx4) tls: "rc_x" - extra_run_args: "--env server UCX_IB_NUM_PATHS=1 --env client UCX_IB_NUM_PATHS=2" - test_name: tag_cx4_rc_server_one_path - "client one path compatible on CX4": - args: "" - initial_delay: 9999 # No interference + test_name: tag_umemh_cx4_rc + extra_run_args: "" + "Multi HCAs: tag match on CX4 and CX6 RC ": + interference: 'No' + duration: 60 + interface: $(roce_iface_cx4) $(roce_iface_cx6) + tls: "rc_x" + test_name: multy_cx6_cx4_rc + extra_run_args: "" + "different UCX_IB_NUM_PATH": + args: "-A" + interference: 'No' duration: 120 - interface: $(roce_iface_cx4) + interface: $(roce_iface_cx6) tls: "rc_x" extra_run_args: "--env server UCX_IB_NUM_PATHS=2 --env client UCX_IB_NUM_PATHS=1" test_name: tag_cx4_rc_client_one_path @@ -71,7 +71,7 @@ jobs: - bash: | set -eEx ./autogen.sh - ./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install --without-java + ./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install make -j`nproc` make install # build static modules @@ -120,6 +120,7 @@ jobs: test_ucx_tls: ${{ test.Value.tls }} test_extra_run_args: ${{ test.Value.extra_run_args }} initial_delay: ${{ coalesce(test.Value.initial_delay, parameters.initial_delay) }} + test_interference: ${{ test.Value.interference }} maxParallel: 1 variables: @@ -147,6 +148,7 @@ jobs: roce_iface: $(test_intefrafe) iodemo_tls: $(test_ucx_tls) initial_delay: $(initial_delay) + interference: $(test_interference) extra_run_args: $(test_extra_run_args) ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: analyzer_allow_list_args: '--allow_list $(System.PullRequest.TargetBranch)' diff --git a/test/apps/iodemo/run_io_demo.sh b/test/apps/iodemo/run_io_demo.sh index f24302f54cb..c263c0ec19d 100755 --- a/test/apps/iodemo/run_io_demo.sh +++ b/test/apps/iodemo/run_io_demo.sh @@ -81,7 +81,7 @@ check_slurm_env() init_config() { verbose=0 - net_if="bond0" + net_ifs="bond0" duration=30 bind_local=0 base_port_num=20000 @@ -108,7 +108,7 @@ show_config() { echo "Launch configuration:" for key in \ - host_list net_if duration bind_local base_port_num \ + host_list net_ifs duration bind_local base_port_num \ num_clients num_servers tasks_per_node map_by \ client_wait_time launcher dry_run log_dir \ iodemo_exe iodemo_client_args @@ -132,7 +132,7 @@ usage() echo " -h|--help Show this help message" echo " -v|--verbose Turn on verbosity" echo " -H|--hostlist

,

,.. List of host names to run on"$(show_default_value host_list) - echo " -i|--netif Network interface to use"$(show_default_value net_if) + echo " -i|--netif Comma-separated list of network interfaces to use"$(show_default_value net_ifs) echo " -d|--duration How much time to run the application"$(show_default_value duration) echo " --bind Bind to local IP address" echo " --env = Environment variable for (client/server/all)" @@ -171,7 +171,7 @@ parse_args() shift ;; -i|--netif) - net_if="$2" + net_ifs="$2" shift ;; -d|--duration) @@ -290,23 +290,26 @@ set_ssh_options() collect_ip_addrs() { - # convert the output of 'ip' to 'host:ip' list - host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} | - sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p') - if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ] - then - error "failed to collect host IP addresses for ${net_if}" - fi - - # map the ips to hosts according to host order in the list - for host_ip in ${host_ips} + for net_if in $(split_list ${net_ifs}) do - host=$(echo ${host_ip} | cut -d: -f1) - addr=$(echo ${host_ip} | cut -d: -f2) - if [ -n "${host}" ] && [ -n "${addr}" ] + # convert the output of 'ip' to 'host:ip' list + host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} | + sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p') + if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ] then - ip_address_per_host[${host}]=${addr} + error "failed to collect host IP addresses for ${net_if}" fi + + # map the ips to hosts according to host order in the list + for host_ip in ${host_ips} + do + host=$(echo ${host_ip} | cut -d: -f1) + addr=$(echo ${host_ip} | cut -d: -f2) + if [ -n "${host}" ] && [ -n "${addr}" ] + then + ip_address_per_host[${host}${net_if}]=${addr} + fi + done done } @@ -456,8 +459,11 @@ make_scripts() do for ((i=0;i<${num_servers_per_host[${host}]};++i)) do - port_num=$((base_port_num + i)) - client_connect_list+=" ${ip_address_per_host[${host}]}:${port_num}" + for net_if in $(split_list ${net_ifs}) + do + port_num=$((base_port_num + i)) + client_connect_list+=" ${ip_address_per_host[${host}${net_if}]}:${port_num}" + done done done @@ -664,7 +670,11 @@ make_scripts() log_file=${log_dir}/$(printf "iodemo_%s_client_%02d.log" ${host} $i) if [ ${bind_local} -eq 1 ] then - client_bind="-I ${ip_address_per_host[${host}]}" + client_bind="" + for net_if in $(split_list ${net_ifs}) + do + client_bind="${client_bind} -I ${ip_address_per_host[${host}${net_if}]}" + done else client_bind="" fi