The problem size is given as
\(\kappa = N_\text{cells}\cdot p^2.\)
For weak scaling we focus on the patch size \(p\), as this is easier to control. Because \(\kappa\) depends quadratically on \(p\), we scale \(p\) by factors of \(\sqrt{2}\), so that each step doubles the problem size.
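As a quick cross-check of that progression, a sketch in plain Python (the starting value 36 matches the generator script below):
import math

# p grows by sqrt(2) per step, so kappa = N_cells * p^2 doubles at fixed N_cells
patch_sizes = [round(36 * math.sqrt(2) ** k) for k in range(4)]
print(patch_sizes)                                     # [36, 51, 72, 102]
print([round((p / 36) ** 2, 2) for p in patch_sizes])  # [1.0, 2.01, 4.0, 8.03]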
ssh hamilton
# cisusername@hamilton.dur.ac.uk
module list
1) intel/2020.4   2) gcc/9.3.0   3) intelmpi/intel/2019.6   4) gcc/8.2.0   5) python/3.6.8
Data is stored at /ddn/data/cisusername/
#!/bin/bash
#SBATCH -t 00:30:00
#SBATCH --exclusive
#SBATCH -p par7.q
#SBATCH -N 1
#SBATCH --ntasks-per-node=1
#SBATCH --mail-type=END
#SBATCH --mail-user=holger.schulz@durham.ac.uk
source /etc/profile.d/modules.sh
module purge
module load intel/2020.4 gcc/9.3.0 intelmpi/intel/2019.6 gsl/intel/2.4
export OMP_NUM_THREADS=24
export OMP_PROC_BIND=close
mpirun ./peano4
And the submission loop:
for i in {1,2,4,8,16,32,64};do echo $i;sbatch --nodes $i Hamilton.sbatch;done
This is the code that generates the executables, one binary per patch size/mesh-size combination:
declare -a H=("0.012345")
#declare -a H=("0.33" "0.11" "0.037" "0.012345")
declare -a P=("36" "51" "72" "102")
for h in "${H[@]}"
do
for p in "${P[@]}"
do
echo "$h $p"
python3 example-scripts/finitevolumes-with-ExaHyPE2-benchmark.py -cs ${h} -et 0.00005 -t enclave -ps ${p} -d 2 -f --load-balancing-quality 0.9 -o peano4_${p}_${h} -j 40
done
done
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
breaks <- 10^(-10:10)
minor_breaks <- rep(1:9, 21)*(10^rep(-10:10, each=9))
coreconfig <- c("4x40", "4x80", "4x160", "4x320")
read_csv("/home/gcgt96/Funding/fundingcse/thedata.csv") %>%
  mutate(timeperdof = time / problemsize) %>%
  group_by(problemsize) %>%
  # Efficiency relative to the first run of each problem size (assumes rows are
  # ordered by node count); the factor 2 presumably corrects the baseline for
  # the largest problems, whose first run uses two nodes.
  mutate(Efficiency = ifelse(problemsize < 588747000,
                             first(time) / time / nodes,
                             2 * first(time) / time / nodes)) %>%
  filter(problemsize < 688747000) -> df
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ncells = col_double(),
## patchsize = col_double(),
## problemsize = col_double(),
## nodes = col_double(),
## time = col_double(),
## track = col_double()
## )
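The efficiency used above is the speedup per node relative to the smallest run of each problem size, \(E(n) = T(n_0)/(n\,T(n))\); the factor of two presumably compensates for the largest problem sizes, whose baseline \(n_0\) is a two-node run.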
df %>% filter(track==3) ->df3
p1 <- ggplot(NULL, aes(x = nodes, y = time, color = factor(problemsize))) +
  scale_x_log10(breaks = c(1, 2, 4, 8, 16), minor_breaks = c(1, 2, 4, 8, 16), name = "Number of nodes") +
  scale_y_continuous(name = "Time [s]") +
  scale_colour_discrete("Problem size") +
  geom_line(data = df %>% filter(track == 1), aes(x = nodes, y = time), color = "gray", linetype = "dashed") +
  geom_line(data = df %>% filter(track == 2), aes(x = nodes, y = time), color = "gray", linetype = "dashed") +
  geom_line(data = df %>% filter(track == 3), aes(x = nodes, y = time), color = "gray", linetype = "dashed") +
  geom_line(data = df %>% filter(track == 4), aes(x = nodes, y = time), color = "gray", linetype = "dashed") +
  geom_line(data = df %>% filter(track == 5), aes(x = nodes, y = time), color = "gray", linetype = "dashed") +
  geom_point(data = df, size = 3) +
  geom_line(data = df) +
  ggtitle("Peano/ExaHyPE timestepping OpenMP and MPI on Hamilton")
p1
ggsave("spiderhamilton.pdf", p1, device = "pdf", width = 6, height = 4)
p2 <- ggplot(NULL, aes(x = nodes, y = Efficiency, color = factor(problemsize))) +
  scale_x_log10(breaks = c(1, 2, 4, 8, 16), minor_breaks = c(1, 2, 4, 8, 16), name = "Number of nodes") +
  geom_point(data = df, size = 3) +
  geom_line(data = df) +
  # name and limits in one scale; a separate ylim() would replace the scale
  scale_y_continuous(name = "Efficiency", limits = c(0, 1)) +
  # geom_point(data=df%>%filter(problemsize<588747000), size=3) +
  # geom_line( data=df%>%filter(problemsize<588747000)) +
  # geom_point(data=df%>%filter(problemsize>588747000), aes(x=nodes,y=2*eff), size=3) +
  # geom_line( data=df%>%filter(problemsize>588747000), aes(x=nodes,y=2*eff)) +
  scale_colour_discrete("Problem size", breaks = c(76527504, 153586449, 306110016, 614345796)) +
  ggtitle("Peano/ExaHyPE timestepping OpenMP and MPI on Hamilton")
p2
ggsave("spiderhamiltoneff.pdf", p2, device = "pdf", width = 6, height = 4)
For the likwid measurements we use a single node and run the CACHES, FLOPS_DP, MEM and L2CACHE groups:
#!/bin/bash
#SBATCH -t 01:00:00
#SBATCH --exclusive
#SBATCH -p par7.q
#SBATCH -N 1
#SBATCH --ntasks-per-node=1
#SBATCH --mail-type=END
#SBATCH --mail-user=holger.schulz@durham.ac.uk
source /etc/profile.d/modules.sh
module purge
module load intel/2020.4 gcc/9.3.0 intelmpi/intel/2019.6 likwid
export OMP_NUM_THREADS=24
export OMP_PROC_BIND=close
mpirun likwid-perfctr -g CACHES ./peano4_51_0.012345 --threading-model native > likwid-_51_0.012345-caches.result
mpirun likwid-perfctr -g FLOPS_DP ./peano4_51_0.012345 --threading-model native > likwid-_51_0.012345-flops.result
mpirun likwid-perfctr -g MEM ./peano4_51_0.012345 --threading-model native > likwid-_51_0.012345-mem.result
mpirun likwid-perfctr -g L2CACHE ./peano4_51_0.012345 --threading-model native > likwid-_51_0.012345-l2cache.result
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz
CPU type: Intel Xeon Broadwell EN/EP/EX processor
CPU clock: 2.20 GHz
--------------------------------------------------------------------------------
...
CACHES group:
+---------------------------------------------+-----------------+-----------+-----------------+--------------+
| Metric | Sum | Min | Max | Avg |
+---------------------------------------------+-----------------+-----------+-----------------+--------------+
| Runtime (RDTSC) [s] STAT | 42439.6344 | 1768.3181 | 1768.3181 | 1768.3181 |
| Runtime unhalted [s] STAT | 48050.9890 | 1991.3499 | 2008.0700 | 2002.1245 |
| Clock [MHz] STAT | 59999.8855 | 2499.9583 | 2499.9992 | 2499.9952 |
| CPI STAT | 32.2980 | 1.2355 | 1.6770 | 1.3458 |
| L2 to L1 load bandwidth [MBytes/s] STAT | 9502.1186 | 328.9382 | 443.8521 | 395.9216 |
| L2 to L1 load data volume [GBytes] STAT | 16802.7681 | 581.6673 | 784.8718 | 700.1153 |
| L1 to L2 evict bandwidth [MBytes/s] STAT | 3266.3769 | 117.3723 | 149.2941 | 136.0990 |
| L1 to L2 evict data volume [GBytes] STAT | 5775.9940 | 207.5516 | 263.9995 | 240.6664 |
| L1 to/from L2 bandwidth [MBytes/s] STAT | 12768.4958 | 446.3105 | 584.0004 | 532.0207 |
| L1 to/from L2 data volume [GBytes] STAT | 22578.7618 | 789.2189 | 1032.6984 | 940.7817 |
| L3 to L2 load bandwidth [MBytes/s] STAT | 9183.0788 | 316.1890 | 458.7526 | 382.6283 |
| L3 to L2 load data volume [GBytes] STAT | 16238.6045 | 559.1228 | 811.2205 | 676.6085 |
| L2 to L3 evict bandwidth [MBytes/s] STAT | 1936.7233 | 64.3655 | 88.1054 | 80.6968 |
| L2 to L3 evict data volume [GBytes] STAT | 3424.7429 | 113.8186 | 155.7983 | 142.6976 |
| L2 to/from L3 bandwidth [MBytes/s] STAT | 11119.8023 | 380.5545 | 539.8978 | 463.3251 |
| L2 to/from L3 data volume [GBytes] STAT | 19663.3475 | 672.9414 | 954.7110 | 819.3061 |
| System to L3 bandwidth [MBytes/s] STAT | 5547.4753 | 0 | 2872.6022 | 231.1448 |
| System to L3 data volume [GBytes] STAT | 9809.7007 | 0 | 5079.6743 | 408.7375 |
| L3 to system bandwidth [MBytes/s] STAT | 649.9197 | 0 | 336.3507 | 27.0800 |
| L3 to system data volume [GBytes] STAT | 1149.2648 | 0 | 594.7750 | 47.8860 |
| L3 to/from system bandwidth [MBytes/s] STAT | 6197.3950 | 0 | 3208.9529 | 258.2248 |
| L3 to/from system data volume [GBytes] STAT | 10958.9654 | 0 | 5674.4493 | 456.6236 |
| Memory read bandwidth [MBytes/s] STAT | 0 | 0 | 0 | 0 |
| Memory read data volume [GBytes] STAT | 2361183000000 | 0 | 2361183000000 | 98382625000 |
| Memory write bandwidth [MBytes/s] STAT | 1335271000000 | 0 | 1335271000000 | 5.563629e+10 |
| Memory write data volume [GBytes] STAT | 2361183000000 | 0 | 2361183000000 | 98382625000 |
| Memory bandwidth [MBytes/s] STAT | 2670541000000 | 0 | 2670541000000 | 1.112725e+11 |
| Memory data volume [GBytes] STAT | 146393366000000 | 0 | 141671000000000 | 6.099724e+12 |
+---------------------------------------------+-----------------+-----------+-----------------+--------------+
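Note that the memory rows of the CACHES group are clearly bogus (zero bandwidth sums next to data volumes of order \(10^{12}\) GBytes point to overflowing counters); the dedicated MEM group below reports plausible values.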
FLOPS_DP group:
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 14628.4608 | 609.5192 | 609.5192 | 609.5192 |
| Runtime unhalted [s] STAT | 16406.0036 | 679.7978 | 686.7601 | 683.5835 |
| Clock [MHz] STAT | 59982.3112 | 2499.0943 | 2499.4213 | 2499.2630 |
| CPI STAT | 27.0594 | 0.8947 | 2.6634 | 1.1275 |
| DP [MFLOP/s] STAT | 13470.3782 | 134.1143 | 701.1380 | 561.2658 |
| AVX DP [MFLOP/s] STAT | 9727.9691 | 88.9132 | 510.4351 | 405.3320 |
| Packed [MUOPS/s] STAT | 2980.1840 | 31.4200 | 154.3646 | 124.1743 |
| Scalar [MUOPS/s] STAT | 2646.0254 | 26.8177 | 137.6208 | 110.2511 |
| Vectorization ratio STAT | 1272.6609 | 52.7959 | 53.9513 | 53.0275 |
+---------------------------+------------+-----------+-----------+-----------+
MEM group:
+----------------------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+----------------------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 14236.2144 | 593.1756 | 593.1756 | 593.1756 |
| Runtime unhalted [s] STAT | 15979.8066 | 662.1803 | 669.4214 | 665.8253 |
| Clock [MHz] STAT | 59987.6616 | 2499.3995 | 2499.6463 | 2499.4859 |
| CPI STAT | 25.3833 | 0.9191 | 1.5410 | 1.0576 |
| Memory read bandwidth [MBytes/s] STAT | 4352.2146 | 0 | 2313.4298 | 181.3423 |
| Memory read data volume [GBytes] STAT | 2581.6275 | 0 | 1372.2701 | 107.5678 |
| Memory write bandwidth [MBytes/s] STAT | 4736.0698 | 0 | 2492.4196 | 197.3362 |
| Memory write data volume [GBytes] STAT | 2809.3212 | 0 | 1478.4426 | 117.0551 |
| Memory bandwidth [MBytes/s] STAT | 9088.2844 | 0 | 4805.8494 | 378.6785 |
| Memory data volume [GBytes] STAT | 5390.9488 | 0 | 2850.7127 | 224.6229 |
+----------------------------------------+------------+-----------+-----------+-----------+
L2CACHE group:
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 14153.0472 | 589.7103 | 589.7103 | 589.7103 |
| Runtime unhalted [s] STAT | 15868.9594 | 658.4518 | 664.5983 | 661.2066 |
| Clock [MHz] STAT | 59988.1826 | 2499.2973 | 2499.6752 | 2499.5076 |
| CPI STAT | 25.1551 | 0.9100 | 1.5041 | 1.0481 |
| L2 request rate STAT | 0.5775 | 0.0212 | 0.0366 | 0.0241 |
| L2 miss rate STAT | 0.1274 | 0.0045 | 0.0080 | 0.0053 |
| L2 miss ratio STAT | 5.2896 | 0.2132 | 0.2270 | 0.2204 |
+---------------------------+------------+-----------+-----------+-----------+
For comparison, the same L2CACHE metrics for STREAM:
24 threads
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 2.9760 | 0.1240 | 0.1240 | 0.1240 |
| Runtime unhalted [s] STAT | 2.0163 | 0.0792 | 0.1060 | 0.0840 |
| Clock [MHz] STAT | 42436.8402 | 1676.6767 | 2200.0278 | 1768.2017 |
| CPI STAT | 51.8634 | 2.0553 | 2.9326 | 2.1610 |
| L2 request rate STAT | 7.6286 | 0.2879 | 0.6562 | 0.3179 |
| L2 miss rate STAT | 1.5592 | 0.0542 | 0.1654 | 0.0650 |
| L2 miss ratio STAT | 4.8499 | 0.1802 | 0.2520 | 0.2021 |
+---------------------------+------------+-----------+-----------+-----------+
48 threads
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 4.1232 | 0.1718 | 0.1718 | 0.1718 |
| Runtime unhalted [s] STAT | 1.7832 | 0.0663 | 0.0950 | 0.0743 |
| Clock [MHz] STAT | 45596.5562 | 1749.0036 | 2121.1077 | 1899.8565 |
| CPI STAT | 128.0157 | 3.1290 | 5.9132 | 5.3340 |
| L2 request rate STAT | 21.7096 | 0.6343 | 0.9552 | 0.9046 |
| L2 miss rate STAT | 4.2741 | 0.1575 | 0.1845 | 0.1781 |
| L2 miss ratio STAT | 4.7425 | 0.1859 | 0.2483 | 0.1976 |
+---------------------------+------------+-----------+-----------+-----------+
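Going from 24 to 48 threads, i.e. using both hardware threads of each core, roughly doubles the CPI and pushes the L2 request rate towards one, which suggests hyperthreading does not pay off here. STREAM was built and run as follows: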
Currently Loaded Modulefiles:
1) intel/2020.4 2) gcc/9.3.0 3) intelmpi/intel/2019.6
wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c
icc -O3 -xhost -qopenmp stream.c -o stream.exe
#!/bin/bash
#SBATCH -t 01:00:00
#SBATCH --exclusive
#SBATCH -p par7.q
#SBATCH -N 1
#SBATCH --ntasks-per-node=1
#SBATCH --mail-type=END
#SBATCH --mail-user=holger.schulz@durham.ac.uk
source /etc/profile.d/modules.sh
module purge
module load intel/2020.4 gcc/9.3.0 intelmpi/intel/2019.6 likwid
export OMP_NUM_THREADS=24
export OMP_PROC_BIND=close
likwid-perfctr -g CACHES ./stream.exe > stream-caches.result
likwid-perfctr -g FLOPS_DP ./stream.exe > stream-flops.result
likwid-perfctr -g MEM ./stream.exe > stream-mem.result
likwid-perfctr -g L2CACHE ./stream.exe > stream-l2cache.result
CACHES group:
+---------------------------------------------+------------+-----------+------------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------------------------+------------+-----------+------------+-----------+
| Runtime (RDTSC) [s] STAT | 3.3936 | 0.1414 | 0.1414 | 0.1414 |
| Runtime unhalted [s] STAT | 1.9976 | 0.0787 | 0.1034 | 0.0832 |
| Clock [MHz] STAT | 41709.5209 | 1672.0178 | 2178.1570 | 1737.8967 |
| CPI STAT | 51.1987 | 1.8785 | 3.0330 | 2.1333 |
| L2 to L1 load bandwidth [MBytes/s] STAT | 42161.6297 | 1676.5987 | 3385.1306 | 1756.7346 |
| L2 to L1 load data volume [GBytes] STAT | 5.9607 | 0.2370 | 0.4786 | 0.2484 |
| L1 to L2 evict bandwidth [MBytes/s] STAT | 6337.9949 | 261.1495 | 310.3884 | 264.0831 |
| L1 to L2 evict data volume [GBytes] STAT | 0.8957 | 0.0369 | 0.0439 | 0.0373 |
| L1 to/from L2 bandwidth [MBytes/s] STAT | 48499.6246 | 1937.7518 | 3650.8721 | 2020.8177 |
| L1 to/from L2 data volume [GBytes] STAT | 6.8568 | 0.2740 | 0.5162 | 0.2857 |
| L3 to L2 load bandwidth [MBytes/s] STAT | 42273.5243 | 1677.8332 | 3391.7004 | 1761.3968 |
| L3 to L2 load data volume [GBytes] STAT | 5.9767 | 0.2372 | 0.4795 | 0.2490 |
| L2 to L3 evict bandwidth [MBytes/s] STAT | 950.6979 | 37.3066 | 58.2573 | 39.6124 |
| L2 to L3 evict data volume [GBytes] STAT | 0.1345 | 0.0053 | 0.0082 | 0.0056 |
| L2 to/from L3 bandwidth [MBytes/s] STAT | 43224.2226 | 1716.6436 | 3432.6208 | 1801.0093 |
| L2 to/from L3 data volume [GBytes] STAT | 6.1109 | 0.2427 | 0.4853 | 0.2546 |
| System to L3 bandwidth [MBytes/s] STAT | 36847.4221 | 0 | 19286.3569 | 1535.3093 |
| System to L3 data volume [GBytes] STAT | 5.2095 | 0 | 2.7267 | 0.2171 |
| L3 to system bandwidth [MBytes/s] STAT | 7363.0097 | 0 | 3697.2107 | 306.7921 |
| L3 to system data volume [GBytes] STAT | 1.0410 | 0 | 0.5227 | 0.0434 |
| L3 to/from system bandwidth [MBytes/s] STAT | 44210.4318 | 0 | 22983.5676 | 1842.1013 |
| L3 to/from system data volume [GBytes] STAT | 6.2504 | 0 | 3.2494 | 0.2604 |
| Memory read bandwidth [MBytes/s] STAT | 37042.7662 | 0 | 18622.9435 | 1543.4486 |
| Memory read data volume [GBytes] STAT | 5.2371 | 0 | 2.6329 | 0.2182 |
| Memory write bandwidth [MBytes/s] STAT | 28014.2382 | 0 | 14481.9039 | 1167.2599 |
| Memory write data volume [GBytes] STAT | 3.9606 | 0 | 2.0474 | 0.1650 |
| Memory bandwidth [MBytes/s] STAT | 65057.0044 | 0 | 33104.8474 | 2710.7085 |
| Memory data volume [GBytes] STAT | 9.1977 | 0 | 4.6803 | 0.3832 |
+---------------------------------------------+------------+-----------+------------+-----------+
FLOPS_DP group:
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 2.9880 | 0.1245 | 0.1245 | 0.1245 |
| Runtime unhalted [s] STAT | 2.0072 | 0.0789 | 0.1046 | 0.0836 |
| Clock [MHz] STAT | 42392.5775 | 1670.9472 | 2176.3130 | 1766.3574 |
| CPI STAT | 52.2871 | 2.0584 | 2.9734 | 2.1786 |
| DP [MFLOP/s] STAT | 3774.1746 | 137.1811 | 619.0028 | 157.2573 |
| AVX DP [MFLOP/s] STAT | 3774.1400 | 137.1805 | 618.9885 | 157.2558 |
| Packed [MUOPS/s] STAT | 943.5344 | 34.2951 | 154.7471 | 39.3139 |
| Scalar [MUOPS/s] STAT | 0.0353 | 0.0007 | 0.0143 | 0.0015 |
| Vectorization ratio STAT | 2399.9307 | 99.9907 | 99.9981 | 99.9971 |
+---------------------------+------------+-----------+-----------+-----------+
MEM group:
+----------------------------------------+------------+-----------+------------+-----------+
| Metric | Sum | Min | Max | Avg |
+----------------------------------------+------------+-----------+------------+-----------+
| Runtime (RDTSC) [s] STAT | 3.1248 | 0.1302 | 0.1302 | 0.1302 |
| Runtime unhalted [s] STAT | 2.1063 | 0.0810 | 0.1060 | 0.0878 |
| Clock [MHz] STAT | 43049.7238 | 1723.7446 | 2151.3493 | 1793.7385 |
| CPI STAT | 50.3197 | 1.9637 | 2.8818 | 2.0967 |
| Memory read bandwidth [MBytes/s] STAT | 40153.2065 | 0 | 20185.5785 | 1673.0503 |
| Memory read data volume [GBytes] STAT | 5.2298 | 0 | 2.6291 | 0.2179 |
| Memory write bandwidth [MBytes/s] STAT | 30435.5807 | 0 | 15721.3561 | 1268.1492 |
| Memory write data volume [GBytes] STAT | 3.9640 | 0 | 2.0476 | 0.1652 |
| Memory bandwidth [MBytes/s] STAT | 70588.7871 | 0 | 35906.9345 | 2941.1995 |
| Memory data volume [GBytes] STAT | 9.1938 | 0 | 4.6767 | 0.3831 |
+----------------------------------------+------------+-----------+------------+-----------+
L2CACHE group:
+---------------------------+------------+-----------+-----------+-----------+
| Metric | Sum | Min | Max | Avg |
+---------------------------+------------+-----------+-----------+-----------+
| Runtime (RDTSC) [s] STAT | 2.9760 | 0.1240 | 0.1240 | 0.1240 |
| Runtime unhalted [s] STAT | 2.0163 | 0.0792 | 0.1060 | 0.0840 |
| Clock [MHz] STAT | 42436.8402 | 1676.6767 | 2200.0278 | 1768.2017 |
| CPI STAT | 51.8634 | 2.0553 | 2.9326 | 2.1610 |
| L2 request rate STAT | 7.6286 | 0.2879 | 0.6562 | 0.3179 |
| L2 miss rate STAT | 1.5592 | 0.0542 | 0.1654 | 0.0650 |
| L2 miss ratio STAT | 4.8499 | 0.1802 | 0.2520 | 0.2021 |
+---------------------------+------------+-----------+-----------+-----------+
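To collect the numbers above into a table, a minimal parser for the likwid ASCII reports can look like this. This is a sketch that assumes the .result files follow the layout shown above; likwid_stat is a hypothetical helper, not part of likwid itself:
def likwid_stat(path, metric):
    """Return (Sum, Min, Max, Avg) for one STAT row of a likwid-perfctr report."""
    with open(path) as f:
        for line in f:
            # data rows look like: | DP [MFLOP/s] STAT | 13470.3782 | 134.1143 | ... |
            # the first matching row wins, so pass the full metric name
            if line.startswith("|") and metric in line:
                cells = [c.strip() for c in line.strip().strip("|").split("|")]
                return tuple(float(c) for c in cells[1:5])
    raise KeyError(f"{metric!r} not found in {path}")

print(likwid_stat("likwid-_51_0.012345-flops.result", "DP [MFLOP/s] STAT"))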
On Bede we submit as follows; note that libgsl would still need to be loaded when running:
#!/bin/bash
#SBATCH -t 00:30:00
#SBATCH --exclusive
#SBATCH -p gpu
#SBATCH -A bddur02
#SBATCH -N 1
#SBATCH --mail-type=END
#SBATCH --mail-user=holger.schulz@durham.ac.uk
#SBATCH --gres=gpu:4
module purge
module load gcc/10.2.0 openmpi/4.0.5
export OMP_NUM_THREADS=40
export OMP_PROC_BIND=close
bede-mpirun --bede-par 1ppn ./peano4
The code lives at /nobackup/projects/bddur02/Peano_p4/examples/exahype2/euler