Skip to content

Commit

Permalink
AZP: Add multi HCAs iodemo test
Browse files: browse the repository at this point in the history
  • Loading branch information
Alexey-Rivkin committed Jul 31, 2022
1 parent 60314d3 commit 1d56eeb
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 53 deletions.
16 changes: 11 additions & 5 deletions buildlib/az-network-corrupter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
echo "Running $0 $*..."
eval "$*"
initial_delay=${initial_delay:=10}
interface=${interface:=bond0}
interfaces=${interfaces:=bond0}
cycles=${cycles:=1000}
downtime=${downtime:=5}
uptime=${uptime:=20}
Expand All @@ -14,8 +14,10 @@ manager_script_dir=/hpc/noarch/git_projects/hpc-mtt-conf/scripts
manager_script=${manager_script_dir}/switch_port_on_off.py

if [ "x$reset" = "xyes" ]; then
echo "Resetting interface ${interface} on $(hostname) interface ..."
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
for interface in ${interfaces} ; do
echo "Resetting interface ${interface} on $(hostname) interface ..."
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
done
sleep "$uptime"
exit $?
fi
Expand All @@ -25,10 +27,14 @@ sleep ${initial_delay}

for i in $(seq 1 ${cycles}); do
echo "#$i Put it down! And sleep ${downtime}"
${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface}
for interface in ${interfaces} ; do
${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface}
done
sleep "$downtime"

echo "#$i Put it up! And sleep ${uptime}"
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
for interface in ${interfaces} ; do
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
done
sleep "$uptime"
done
45 changes: 36 additions & 9 deletions buildlib/pr/io_demo/az-stage-io-demo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,18 @@ parameters:
default: 20
- name: analyzer_allow_list_args
default: ''
- name: interference
default: 'Yes'
- name: extra_run_args
default: ''

steps:
- bash: |
set -eEx
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }}
echo "##vso[task.setvariable variable=interference]${{ parameters.interference }}"
for interface in ${{ parameters.roce_iface }} ; do
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface}
done
displayName: Restore port state
condition: always()
timeoutInMinutes: 2
Expand All @@ -32,7 +37,7 @@ steps:
cycles=$(cycles) \
downtime=$(downtime) \
uptime=$(uptime) \
interface=${{ parameters.roce_iface }} \
interfaces="${{ parameters.roce_iface }}" \
|& add_timestamp &>corrupter.log &
while ! pgrep -u "$USER" -f 'network-corrupter'
do
Expand All @@ -43,27 +48,36 @@ steps:
echo "corrupter_pid=$corrupter_pid"
azure_set_variable "corrupter_pid" "$corrupter_pid"
displayName: Start network corrupter
condition: eq(variables['interference'], 'Yes')
timeoutInMinutes: 2

- bash: |
set -eEx
sudo /hpc/local/bin/lshca
mkdir -p $(workspace)/${{ parameters.name }}
# set UCX environment variables
export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${{parameters.roce_iface}}' .*/\1:\2/p')
net_devices=""
for interface in ${{ parameters.roce_iface }} ; do
net_device=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${interface}' .*/\1:\2/p')
net_devices="${net_devices} ${net_device}"
done
export UCX_NET_DEVICES=$(echo ${net_devices##*( )} | tr " ", ",")
export UCX_TLS=${{ parameters.iodemo_tls }}
export UCX_RNDV_THRESH=4k
export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH
export UCX_LOG_LEVEL=info
num_tasks=$(echo ${{ parameters.roce_iface }} | wc -w)
$(workspace)/test/apps/iodemo/run_io_demo.sh \
-H $(agent_hosts) \
--tasks-per-node 1 \
--tasks-per-node ${num_tasks} \
--duration ${{ parameters.duration }} \
-v \
--num-clients 1 \
--num-servers 1 \
--bind \
--num-clients ${num_tasks} \
--num-servers ${num_tasks} \
--map-by slot \
--log-dir $(workspace)/${{ parameters.name }} \
-i ${{ parameters.roce_iface }} \
-i $(echo ${{ parameters.roce_iface }} | tr " " ",") \
${{ parameters.extra_run_args }} \
$(io_demo_exe) \
-d 512:524288 \
Expand All @@ -82,11 +96,22 @@ steps:
analyzer_args="-d $(workspace)/${{ parameters.name }}"
analyzer_args="$analyzer_args --duration ${{ parameters.duration }}"
analyzer_args="$analyzer_args -t 3"
if [ '${{ parameters.interference }}' == 'No' ] ; then
analyzer_args="$analyzer_args --no-allow-list"
fi
analyzer_args="$analyzer_args ${{ parameters.analyzer_allow_list_args }}"
python ${analyzer} ${analyzer_args}
displayName: Analyze for ${{ parameters.name }}
timeoutInMinutes: 1

- task: PublishBuildArtifacts@1
inputs:
pathToPublish: '$(workspace)/${{ parameters.name }}'
artifactName: log_${{ parameters.name }}_$(Build.BuildId)
displayName: Publish logs for ${{ parameters.name }}
condition: failed()
timeoutInMinutes: 2

- bash: |
set -eEx
pid=$(corrupter_pid)
Expand All @@ -99,12 +124,14 @@ steps:
fi
cat corrupter.log
displayName: Kill corrupter
condition: always()
condition: and(always(), eq(variables['interference'], 'Yes'))
timeoutInMinutes: 2

- bash: |
set -eEx
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }}
for interface in ${{ parameters.roce_iface }} ; do
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface}
done
displayName: Restore port state
condition: always()
timeoutInMinutes: 2
38 changes: 20 additions & 18 deletions buildlib/pr/io_demo/io-demo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,36 @@ parameters:
type: object
default:
"tag match on CX4":
args: ""
duration: 600
interface: $(roce_iface_cx4)
tls: "rc_x"
test_name: tag_cx4_rc
"tag match on CX6":
args: ""
extra_run_args: ""
"tag match on CX6/RC":
duration: 600
interface: $(roce_iface_cx6)
tls: "rc_x"
test_name: tag_cx6_rc
"tag match on CX6 with user memh":
"tag match on CX4 with user memh":
args: "-z"
duration: 600
interface: $(roce_iface_cx6)
tls: "rc_x"
test_name: tag_umemh_cx6_rc
"server one path compatible on CX4":
args: ""
initial_delay: 9999 # No interference
interference: 'No'
duration: 120
interface: $(roce_iface_cx4)
tls: "rc_x"
extra_run_args: "--env server UCX_IB_NUM_PATHS=1 --env client UCX_IB_NUM_PATHS=2"
test_name: tag_cx4_rc_server_one_path
"client one path compatible on CX4":
args: ""
initial_delay: 9999 # No interference
test_name: tag_umemh_cx4_rc
extra_run_args: ""
"Multi HCAs: tag match on CX4 and CX6 RC ":
interference: 'No'
duration: 60
interface: $(roce_iface_cx4) $(roce_iface_cx6)
tls: "rc_x"
test_name: multy_cx6_cx4_rc
extra_run_args: ""
"different UCX_IB_NUM_PATH":
args: "-A"
interference: 'No'
duration: 120
interface: $(roce_iface_cx4)
interface: $(roce_iface_cx6)
tls: "rc_x"
extra_run_args: "--env server UCX_IB_NUM_PATHS=2 --env client UCX_IB_NUM_PATHS=1"
test_name: tag_cx4_rc_client_one_path
Expand All @@ -71,7 +71,7 @@ jobs:
- bash: |
set -eEx
./autogen.sh
./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install --without-java
./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install
make -j`nproc`
make install
# build static modules
Expand Down Expand Up @@ -120,6 +120,7 @@ jobs:
test_ucx_tls: ${{ test.Value.tls }}
test_extra_run_args: ${{ test.Value.extra_run_args }}
initial_delay: ${{ coalesce(test.Value.initial_delay, parameters.initial_delay) }}
test_interference: ${{ test.Value.interference }}
maxParallel: 1

variables:
Expand Down Expand Up @@ -147,6 +148,7 @@ jobs:
roce_iface: $(test_intefrafe)
iodemo_tls: $(test_ucx_tls)
initial_delay: $(initial_delay)
interference: $(test_interference)
extra_run_args: $(test_extra_run_args)
${{ if eq(variables['Build.Reason'], 'PullRequest') }}:
analyzer_allow_list_args: '--allow_list $(System.PullRequest.TargetBranch)'
Expand Down
52 changes: 31 additions & 21 deletions test/apps/iodemo/run_io_demo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ check_slurm_env()
init_config()
{
verbose=0
net_if="bond0"
net_ifs="bond0"
duration=30
bind_local=0
base_port_num=20000
Expand All @@ -108,7 +108,7 @@ show_config()
{
echo "Launch configuration:"
for key in \
host_list net_if duration bind_local base_port_num \
host_list net_ifs duration bind_local base_port_num \
num_clients num_servers tasks_per_node map_by \
client_wait_time launcher dry_run log_dir \
iodemo_exe iodemo_client_args
Expand All @@ -132,7 +132,7 @@ usage()
echo " -h|--help Show this help message"
echo " -v|--verbose Turn on verbosity"
echo " -H|--hostlist <h1>,<h2>,.. List of host names to run on"$(show_default_value host_list)
echo " -i|--netif <n> Network interface to use"$(show_default_value net_if)
echo " -i|--netif <n1,n2> Comma-separated list of network interfaces to use"$(show_default_value net_ifs)
echo " -d|--duration <seconds> How much time to run the application"$(show_default_value duration)
echo " --bind Bind to local IP address"
echo " --env <role> <key>=<value> Environment variable for <role> (client/server/all)"
Expand Down Expand Up @@ -171,7 +171,7 @@ parse_args()
shift
;;
-i|--netif)
net_if="$2"
net_ifs="$2"
shift
;;
-d|--duration)
Expand Down Expand Up @@ -290,23 +290,26 @@ set_ssh_options()

collect_ip_addrs()
{
# convert the output of 'ip' to 'host:ip' list
host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} |
sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p')
if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ]
then
error "failed to collect host IP addresses for ${net_if}"
fi

# map the ips to hosts according to host order in the list
for host_ip in ${host_ips}
for net_if in $(split_list ${net_ifs})
do
host=$(echo ${host_ip} | cut -d: -f1)
addr=$(echo ${host_ip} | cut -d: -f2)
if [ -n "${host}" ] && [ -n "${addr}" ]
# convert the output of 'ip' to 'host:ip' list
host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} |
sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p')
if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ]
then
ip_address_per_host[${host}]=${addr}
error "failed to collect host IP addresses for ${net_if}"
fi

# map the ips to hosts according to host order in the list
for host_ip in ${host_ips}
do
host=$(echo ${host_ip} | cut -d: -f1)
addr=$(echo ${host_ip} | cut -d: -f2)
if [ -n "${host}" ] && [ -n "${addr}" ]
then
ip_address_per_host[${host}${net_if}]=${addr}
fi
done
done
}

Expand Down Expand Up @@ -456,8 +459,11 @@ make_scripts()
do
for ((i=0;i<${num_servers_per_host[${host}]};++i))
do
port_num=$((base_port_num + i))
client_connect_list+=" ${ip_address_per_host[${host}]}:${port_num}"
for net_if in $(split_list ${net_ifs})
do
port_num=$((base_port_num + i))
client_connect_list+=" ${ip_address_per_host[${host}${net_if}]}:${port_num}"
done
done
done

Expand Down Expand Up @@ -664,7 +670,11 @@ make_scripts()
log_file=${log_dir}/$(printf "iodemo_%s_client_%02d.log" ${host} $i)
if [ ${bind_local} -eq 1 ]
then
client_bind="-I ${ip_address_per_host[${host}]}"
client_bind=""
for net_if in $(split_list ${net_ifs})
do
client_bind="${client_bind} -I ${ip_address_per_host[${host}${net_if}]}"
done
else
client_bind=""
fi
Expand Down

0 comments on commit 1d56eeb

Please sign in to comment.