add wdmerger scaling numbers on frontier (#2914)
zingale authored Jul 12, 2024
1 parent 8b289c1 commit 6b4f1c3
Showing 13 changed files with 1,062 additions and 0 deletions.
48 changes: 48 additions & 0 deletions Exec/science/wdmerger/scaling/frontier/README.md
@@ -0,0 +1,48 @@
# wdmerger scaling on Frontier

This explores a 12.5 km resolution wdmerger simulation using the
Pakmor initial conditions.

We consider 3 different gridding strategies:

* 256^3 base + 3 AMR levels, each a jump of 4

* 512^3 base + 3 AMR levels with jumps of 4, 4, 2

* 1024^3 base + 2 AMR levels with jumps of 4, 4

All three strategies reach the same effective fine-grid resolution of
16384^3 zones (for example, 256 x 4^3 = 16384), so they differ mainly
in how the work is distributed across levels.  The inputs file here is
set up for the 256^3 base.

We report the total evolution time, excluding initialization, that
Castro prints at the end of the run.

Some general observations:

* We seem to do well with `max_grid_size` set to 64 or 128, but not 96

* At large node counts, it really doesn't matter which of the gridding
strategies we use, since there is plenty of work to go around. The
main consideration would be that the larger coarse grid would make
the plotfiles bigger.

* We seem to benefit from using `castro.hydro_memory_footprint_ratio=3`
  (see the inputs sketch after this list)

* There really is no burning yet, since this is early in the
evolution, so we would expect scaling to improve as the stars
interact (more grids) and burning begins (more local work).
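
As a rough illustration of where these knobs live, here is a minimal
sketch of the gridding-related parameters for the 256^3 case, written
in the usual AMReX/Castro inputs syntax.  Values not quoted above are
assumptions; the `inputs_scaling` file used by the job scripts is the
authoritative setup.

```
# 256^3 base grid with 3 AMR levels, each a jump of 4
# (illustrative only -- see inputs_scaling for the real setup)
amr.n_cell        = 256 256 256
amr.max_level     = 3
amr.ref_ratio     = 4 4 4

# grids of 64 (or 128) zones per side scale well here; 96 does not
amr.max_grid_size = 64

# allowing extra hydro scratch memory helps on Frontier
castro.hydro_memory_footprint_ratio = 3
```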

Note that for the 256^3 base grid, on 64 nodes, the grid structure is:

```
INITIAL GRIDS
Level 0 512 grids 16777216 cells 100 % of domain
smallest grid: 32 x 32 x 32 biggest grid: 32 x 32 x 32
Level 1 96 grids 3145728 cells 0.29296875 % of domain
smallest grid: 32 x 32 x 32 biggest grid: 32 x 32 x 32
Level 2 674 grids 38797312 cells 0.05645751953 % of domain
smallest grid: 32 x 32 x 32 biggest grid: 64 x 32 x 32
Level 3 7247 grids 1428029440 cells 0.03246963024 % of domain
smallest grid: 32 x 32 x 32 biggest grid: 64 x 64 x 64
```

So only a small fraction of the domain is refined at the finest level
for this problem.
72 changes: 72 additions & 0 deletions Exec/science/wdmerger/scaling/frontier/frontier-128nodes.slurm
@@ -0,0 +1,72 @@
#!/bin/bash
#SBATCH -A AST106
#SBATCH -J wdmerger_128nodes
#SBATCH -o %x-%j.out
#SBATCH -t 00:30:00
#SBATCH -p batch
# here N is the number of compute nodes
#SBATCH -N 128
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest

EXEC=./Castro3d.hip.x86-trento.MPI.HIP.ex
INPUTS=inputs_scaling

module load PrgEnv-gnu
module load cray-mpich/8.1.27
module load craype-accel-amd-gfx90a
module load amd-mixed/6.0.0
module unload darshan-runtime

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- only accept a checkpoint
# that has one, so an incomplete latest checkpoint falls back to the
# previous (complete) one
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi

export OMP_NUM_THREADS=1
export NMPI_PER_NODE=8
export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))

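# launch one MPI rank per GPU (8 per node) across all allocated nodes;
# restartString is empty for a fresh start, so the run begins from scratch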
srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString}



72 changes: 72 additions & 0 deletions Exec/science/wdmerger/scaling/frontier/frontier-16nodes.slurm
@@ -0,0 +1,72 @@
#!/bin/bash
#SBATCH -A AST106
#SBATCH -J wdmerger_16nodes
#SBATCH -o %x-%j.out
#SBATCH -t 01:20:00
#SBATCH -p batch
# here N is the number of compute nodes
#SBATCH -N 16
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest

EXEC=./Castro3d.hip.x86-trento.MPI.HIP.ex
INPUTS=inputs_scaling

module load PrgEnv-gnu
module load cray-mpich/8.1.27
module load craype-accel-amd-gfx90a
module load amd-mixed/6.0.0
module unload darshan-runtime

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- only accept a checkpoint
# that has one, so an incomplete latest checkpoint falls back to the
# previous (complete) one
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi

export OMP_NUM_THREADS=1
export NMPI_PER_NODE=8
export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))

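# launch one MPI rank per GPU (8 per node) across all allocated nodes;
# restartString is empty for a fresh start, so the run begins from scratch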
srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString}



72 changes: 72 additions & 0 deletions Exec/science/wdmerger/scaling/frontier/frontier-256nodes.slurm
@@ -0,0 +1,72 @@
#!/bin/bash
#SBATCH -A AST106
#SBATCH -J wdmerger_256nodes
#SBATCH -o %x-%j.out
#SBATCH -t 00:30:00
#SBATCH -p batch
# here N is the number of compute nodes
#SBATCH -N 256
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest

EXEC=./Castro3d.hip.x86-trento.MPI.HIP.ex
INPUTS=inputs_scaling

module load PrgEnv-gnu
module load cray-mpich/8.1.27
module load craype-accel-amd-gfx90a
module load amd-mixed/6.0.0
module unload darshan-runtime

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- only accept a checkpoint
# that has one, so an incomplete latest checkpoint falls back to the
# previous (complete) one
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi

export OMP_NUM_THREADS=1
export NMPI_PER_NODE=8
export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))

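# launch one MPI rank per GPU (8 per node) across all allocated nodes;
# restartString is empty for a fresh start, so the run begins from scratch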
srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString}



72 changes: 72 additions & 0 deletions Exec/science/wdmerger/scaling/frontier/frontier-32nodes.slurm
@@ -0,0 +1,72 @@
#!/bin/bash
#SBATCH -A AST106
#SBATCH -J wdmerger_32nodes
#SBATCH -o %x-%j.out
#SBATCH -t 00:30:00
#SBATCH -p batch
# here N is the number of compute nodes
#SBATCH -N 32
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest

EXEC=./Castro3d.hip.x86-trento.MPI.HIP.ex
INPUTS=inputs_scaling

module load PrgEnv-gnu
module load cray-mpich/8.1.27
module load craype-accel-amd-gfx90a
module load amd-mixed/6.0.0
module unload darshan-runtime

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- only accept a checkpoint
# that has one, so an incomplete latest checkpoint falls back to the
# previous (complete) one
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi

export OMP_NUM_THREADS=1
export NMPI_PER_NODE=8
export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))

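# launch one MPI rank per GPU (8 per node) across all allocated nodes;
# restartString is empty for a fresh start, so the run begins from scratch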
srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString}


