Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AZP: Add multi HCAs iodemo test #261

Open
wants to merge 1 commit into
base: integration3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions buildlib/az-network-corrupter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
echo "Running $0 $*..."
eval "$*"
initial_delay=${initial_delay:=10}
interface=${interface:=bond0}
interfaces=${interfaces:=bond0}
cycles=${cycles:=1000}
downtime=${downtime:=5}
uptime=${uptime:=20}
Expand All @@ -14,8 +14,10 @@ manager_script_dir=/hpc/noarch/git_projects/hpc-mtt-conf/scripts
manager_script=${manager_script_dir}/switch_port_on_off.py

if [ "x$reset" = "xyes" ]; then
echo "Resetting interface ${interface} on $(hostname) interface ..."
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
for interface in ${interfaces} ; do
echo "Resetting interface ${interface} on $(hostname) interface ..."
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
done
sleep "$uptime"
exit $?
fi
Expand All @@ -25,10 +27,14 @@ sleep ${initial_delay}

for i in $(seq 1 ${cycles}); do
echo "#$i Put it down! And sleep ${downtime}"
${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface}
for interface in ${interfaces} ; do
${manager_script} --one -d ${manager_script_dir}/hosts --host $(hostname) -a off -i ${interface}
done
sleep "$downtime"

echo "#$i Put it up! And sleep ${uptime}"
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
for interface in ${interfaces} ; do
${manager_script} -d ${manager_script_dir}/hosts --host $(hostname) -a on -i ${interface}
done
sleep "$uptime"
done
45 changes: 36 additions & 9 deletions buildlib/pr/io_demo/az-stage-io-demo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,18 @@ parameters:
default: 20
- name: analyzer_allow_list_args
default: ''
- name: interference
default: 'Yes'
- name: extra_run_args
default: ''

steps:
- bash: |
set -eEx
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }}
echo "##vso[task.setvariable variable=interference]${{ parameters.interference }}"
for interface in ${{ parameters.roce_iface }} ; do
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface}
done
displayName: Restore port state
condition: always()
timeoutInMinutes: 2
Expand All @@ -32,7 +37,7 @@ steps:
cycles=$(cycles) \
downtime=$(downtime) \
uptime=$(uptime) \
interface=${{ parameters.roce_iface }} \
interfaces="${{ parameters.roce_iface }}" \
|& add_timestamp &>corrupter.log &
while ! pgrep -u "$USER" -f 'network-corrupter'
do
Expand All @@ -43,27 +48,36 @@ steps:
echo "corrupter_pid=$corrupter_pid"
azure_set_variable "corrupter_pid" "$corrupter_pid"
displayName: Start network corrupter
condition: eq(variables['interference'], 'Yes')
timeoutInMinutes: 2

- bash: |
set -eEx
sudo /hpc/local/bin/lshca
mkdir -p $(workspace)/${{ parameters.name }}
# set UCX environment variables
export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${{parameters.roce_iface}}' .*/\1:\2/p')
net_devices=""
for interface in ${{ parameters.roce_iface }} ; do
net_device=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${interface}' .*/\1:\2/p')
net_devices="${net_devices} ${net_device}"
done
export UCX_NET_DEVICES=$(echo ${net_devices##*( )} | tr " ", ",")
export UCX_TLS=${{ parameters.iodemo_tls }}
export UCX_RNDV_THRESH=4k
export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH
export UCX_LOG_LEVEL=info
num_tasks=$(echo ${{ parameters.roce_iface }} | wc -w)
$(workspace)/test/apps/iodemo/run_io_demo.sh \
-H $(agent_hosts) \
--tasks-per-node 1 \
--tasks-per-node ${num_tasks} \
--duration ${{ parameters.duration }} \
-v \
--num-clients 1 \
--num-servers 1 \
--bind \
--num-clients ${num_tasks} \
--num-servers ${num_tasks} \
--map-by slot \
--log-dir $(workspace)/${{ parameters.name }} \
-i ${{ parameters.roce_iface }} \
-i $(echo ${{ parameters.roce_iface }} | tr " " ",") \
${{ parameters.extra_run_args }} \
$(io_demo_exe) \
-d 512:524288 \
Expand All @@ -82,11 +96,22 @@ steps:
analyzer_args="-d $(workspace)/${{ parameters.name }}"
analyzer_args="$analyzer_args --duration ${{ parameters.duration }}"
analyzer_args="$analyzer_args -t 3"
if [ '${{ parameters.interference }}' == 'No' ] ; then
analyzer_args="$analyzer_args --no-allow-list"
fi
analyzer_args="$analyzer_args ${{ parameters.analyzer_allow_list_args }}"
python ${analyzer} ${analyzer_args}
displayName: Analyze for ${{ parameters.name }}
timeoutInMinutes: 1

- task: PublishBuildArtifacts@1
inputs:
pathToPublish: '$(workspace)/${{ parameters.name }}'
artifactName: log_${{ parameters.name }}_$(Build.BuildId)
displayName: Publish logs for ${{ parameters.name }}
condition: failed()
timeoutInMinutes: 2

- bash: |
set -eEx
pid=$(corrupter_pid)
Expand All @@ -99,12 +124,14 @@ steps:
fi
cat corrupter.log
displayName: Kill corrupter
condition: always()
condition: and(always(), eq(variables['interference'], 'Yes'))
timeoutInMinutes: 2

- bash: |
set -eEx
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interface=${{ parameters.roce_iface }}
for interface in ${{ parameters.roce_iface }} ; do
$(workspace)/buildlib/az-network-corrupter.sh reset=yes interfaces=${interface}
done
displayName: Restore port state
condition: always()
timeoutInMinutes: 2
38 changes: 20 additions & 18 deletions buildlib/pr/io_demo/io-demo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,36 @@ parameters:
type: object
default:
"tag match on CX4":
args: ""
duration: 600
interface: $(roce_iface_cx4)
tls: "rc_x"
test_name: tag_cx4_rc
"tag match on CX6":
args: ""
extra_run_args: ""
"tag match on CX6/RC":
yosefe marked this conversation as resolved.
Show resolved Hide resolved
duration: 600
interface: $(roce_iface_cx6)
tls: "rc_x"
test_name: tag_cx6_rc
"tag match on CX6 with user memh":
"tag match on CX4 with user memh":
args: "-z"
duration: 600
interface: $(roce_iface_cx6)
tls: "rc_x"
test_name: tag_umemh_cx6_rc
"server one path compatible on CX4":
args: ""
initial_delay: 9999 # No interference
interference: 'No'
duration: 120
interface: $(roce_iface_cx4)
tls: "rc_x"
extra_run_args: "--env server UCX_IB_NUM_PATHS=1 --env client UCX_IB_NUM_PATHS=2"
test_name: tag_cx4_rc_server_one_path
"client one path compatible on CX4":
args: ""
initial_delay: 9999 # No interference
test_name: tag_umemh_cx4_rc
extra_run_args: ""
"Multi HCAs: tag match on CX4 and CX6 RC ":
interference: 'No'
duration: 60
interface: $(roce_iface_cx4) $(roce_iface_cx6)
tls: "rc_x"
test_name: multy_cx6_cx4_rc
extra_run_args: ""
"different UCX_IB_NUM_PATH":
args: "-A"
interference: 'No'
duration: 120
interface: $(roce_iface_cx4)
interface: $(roce_iface_cx6)
tls: "rc_x"
extra_run_args: "--env server UCX_IB_NUM_PATHS=2 --env client UCX_IB_NUM_PATHS=1"
test_name: tag_cx4_rc_client_one_path
Expand All @@ -71,7 +71,7 @@ jobs:
- bash: |
set -eEx
./autogen.sh
./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install --without-java
./contrib/configure-devel --prefix=$(Build.Repository.LocalPath)/install
make -j`nproc`
make install
# build static modules
Expand Down Expand Up @@ -120,6 +120,7 @@ jobs:
test_ucx_tls: ${{ test.Value.tls }}
test_extra_run_args: ${{ test.Value.extra_run_args }}
initial_delay: ${{ coalesce(test.Value.initial_delay, parameters.initial_delay) }}
test_interference: ${{ test.Value.interference }}
maxParallel: 1

variables:
Expand Down Expand Up @@ -147,6 +148,7 @@ jobs:
roce_iface: $(test_intefrafe)
iodemo_tls: $(test_ucx_tls)
initial_delay: $(initial_delay)
interference: $(test_interference)
extra_run_args: $(test_extra_run_args)
${{ if eq(variables['Build.Reason'], 'PullRequest') }}:
analyzer_allow_list_args: '--allow_list $(System.PullRequest.TargetBranch)'
Expand Down
52 changes: 31 additions & 21 deletions test/apps/iodemo/run_io_demo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ check_slurm_env()
init_config()
{
verbose=0
net_if="bond0"
net_ifs="bond0"
duration=30
bind_local=0
base_port_num=20000
Expand All @@ -108,7 +108,7 @@ show_config()
{
echo "Launch configuration:"
for key in \
host_list net_if duration bind_local base_port_num \
host_list net_ifs duration bind_local base_port_num \
num_clients num_servers tasks_per_node map_by \
client_wait_time launcher dry_run log_dir \
iodemo_exe iodemo_client_args
Expand All @@ -132,7 +132,7 @@ usage()
echo " -h|--help Show this help message"
echo " -v|--verbose Turn on verbosity"
echo " -H|--hostlist <h1>,<h2>,.. List of host names to run on"$(show_default_value host_list)
echo " -i|--netif <n> Network interface to use"$(show_default_value net_if)
echo " -i|--netif <n1,n2> Comma-separated list of network interfaces to use"$(show_default_value net_ifs)
echo " -d|--duration <seconds> How much time to run the application"$(show_default_value duration)
echo " --bind Bind to local IP address"
echo " --env <role> <key>=<value> Environment variable for <role> (client/server/all)"
Expand Down Expand Up @@ -171,7 +171,7 @@ parse_args()
shift
;;
-i|--netif)
net_if="$2"
net_ifs="$2"
shift
;;
-d|--duration)
Expand Down Expand Up @@ -290,23 +290,26 @@ set_ssh_options()

collect_ip_addrs()
{
# convert the output of 'ip' to 'host:ip' list
host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} |
sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p')
if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ]
then
error "failed to collect host IP addresses for ${net_if}"
fi

# map the ips to hosts according to host order in the list
for host_ip in ${host_ips}
for net_if in $(split_list ${net_ifs})
do
host=$(echo ${host_ip} | cut -d: -f1)
addr=$(echo ${host_ip} | cut -d: -f2)
if [ -n "${host}" ] && [ -n "${addr}" ]
# convert the output of 'ip' to 'host:ip' list
host_ips=$(eval ${launcher} ${host_list} ip -4 -o address show ${net_if} |
sed -ne 's/^\(\S*\): .* inet \([0-9\.]*\).*$/\1:\2/p')
if [ $(echo ${host_ips} | wc -w) -ne $(split_list ${host_list} | wc -w) ]
then
ip_address_per_host[${host}]=${addr}
error "failed to collect host IP addresses for ${net_if}"
fi

# map the ips to hosts according to host order in the list
for host_ip in ${host_ips}
do
host=$(echo ${host_ip} | cut -d: -f1)
addr=$(echo ${host_ip} | cut -d: -f2)
if [ -n "${host}" ] && [ -n "${addr}" ]
then
ip_address_per_host[${host}${net_if}]=${addr}
fi
done
done
}

Expand Down Expand Up @@ -456,8 +459,11 @@ make_scripts()
do
for ((i=0;i<${num_servers_per_host[${host}]};++i))
do
port_num=$((base_port_num + i))
client_connect_list+=" ${ip_address_per_host[${host}]}:${port_num}"
for net_if in $(split_list ${net_ifs})
do
port_num=$((base_port_num + i))
client_connect_list+=" ${ip_address_per_host[${host}${net_if}]}:${port_num}"
done
done
done

Expand Down Expand Up @@ -664,7 +670,11 @@ make_scripts()
log_file=${log_dir}/$(printf "iodemo_%s_client_%02d.log" ${host} $i)
if [ ${bind_local} -eq 1 ]
then
client_bind="-I ${ip_address_per_host[${host}]}"
client_bind=""
for net_if in $(split_list ${net_ifs})
do
client_bind="${client_bind} -I ${ip_address_per_host[${host}${net_if}]}"
done
else
client_bind=""
fi
Expand Down