Intro

I used a single Bede interactive node to run a parameter variation study of the CCZ4 example on the branch ccz4-cpp (7e02a6cd8b2c29f8990e7a6d588f73dc71d0d58a). The methodology is simple: vary the parameters (recompiling whenever the problem setup changes) and record the log file of each run. The time-stepping time is extracted afterwards with a separate script that converts the log files to CSV.

This is my bash script:

#!/bin/bash

module load cuda/10.1.243 llvm 
source ~/miniconda3/bin/activate
export PYTHONPATH=$PWD/../../../python:$PYTHONPATH

# taskset core ranges 0-19 and 0-39 give 20 and 40 threads, respectively
declare -a THREADS=("19" "39")
declare -a FNUM=("4" "8" "16" "32" "64" "128" "256" "512" "1024")
declare -a FMAX=("0" "1" "2" "4" "16" "32" "64" "128" "256")
declare -a CS=("0.4" "0.2" "0.1")
declare -a PS=("6" "8" "10" "20")

for cs in "${CS[@]}"
do
    for ps in "${PS[@]}"
    do
        # the problem setup changes with cs/ps, so regenerate and recompile the executable
        python3 ccz4.py -impl fv-fixed-gpu -cs ${cs} -ps ${ps} -et 0.1 -plt 0
        for fnum in "${FNUM[@]}"
        do
            for fmax in "${FMAX[@]}"
            do
                for threads in "${THREADS[@]}"
                do
                    echo "$cs $ps $fnum $fmax $threads"
                    # FUSENUM/FUSEMAX are passed via the environment; capture stdout and stderr in the log
                    FUSEMAX=${fmax} FUSENUM=${fnum} taskset -c 0-${threads} ./peano4 > test_CS_${cs}_PS_${ps}_FUSENUM_${fnum}_FUSEMAX_${fmax}_THREADS_${threads}.log 2>&1
                done
            done
        done
    done
done
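
The log-to-CSV conversion script itself is not shown here, but every log file name already encodes its parameter combination, so a regular expression is enough to recover the columns (cs, ps, fusenum, fusemax, threads) used in the analysis below. A minimal sketch of that step, assuming only the file name convention from the script above (the helper name parse_log_name is mine; extracting the actual time-stepping time is omitted since it depends on the log format):

import re
from pathlib import Path

# file names follow the pattern used in the bash script above, e.g.
# test_CS_0.4_PS_6_FUSENUM_4_FUSEMAX_0_THREADS_19.log
LOG_PATTERN = re.compile(
    r"test_CS_(?P<cs>[\d.]+)_PS_(?P<ps>\d+)_FUSENUM_(?P<fusenum>\d+)"
    r"_FUSEMAX_(?P<fusemax>\d+)_THREADS_(?P<threads>\d+)\.log"
)

def parse_log_name(path):
    """Recover the parameter combination from a log file name (hypothetical helper)."""
    m = LOG_PATTERN.match(Path(path).name)
    if m is None:
        raise ValueError(f"unexpected log file name: {path}")
    d = m.groupdict()
    return {
        "cs": float(d["cs"]),
        "ps": int(d["ps"]),
        "fusenum": int(d["fusenum"]),
        "fusemax": int(d["fusemax"]),
        "threads": int(d["threads"]),
    }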

Computation of the problem size

Computing the problem size requires knowing the number of cells, which I extract from the log files as follows:

def getNCells(fname):
    """Extract the total number of cells from a log file."""
    # collect every line that reports a cell count ("total=...")
    celltot = []
    with open(fname) as f:
        for line in f:
            l = line.strip()
            if "total=" in l:
                celltot.append(l)

    # walk backwards through the reports and keep only the most recent
    # one per reporting entity (identified by the third token of the line)
    ncells = []
    complete = []
    for ct in reversed(celltot):
        data = ct.split()
        if data[2] not in complete:
            # the second-to-last token has the form "...=a/b"; keep what follows the "="
            ncells.append(data[-2].split("=")[-1])
            complete.append(data[2])

    # sum the first entry of every "a/b" pair (the second entry is deliberately ignored)
    nc = 0
    for i in ncells:
        l, r = i.split("/")
        nc += int(l)
        # nc += int(r)
    return nc

\(S = p^2 \cdot N_\text{cells}\), where \(p\) is the patch size, \(N_\text{cells}\) is the number of cells extracted above, and \(S\) is the problem size.
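
As a hypothetical worked example of this formula, using getNCells from above (the file name is just one combination from the sweep; the numbers are not real measurements):

fname = "test_CS_0.4_PS_6_FUSENUM_4_FUSEMAX_0_THREADS_19.log"  # hypothetical run from the sweep
ps = 6                      # patch size p for this run
ncells = getNCells(fname)   # number of cells extracted from the log
S = ps**2 * ncells          # problem size S = p^2 * N_cells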

library(tidyverse)
read_csv("data01.csv") %>% filter(time >0)  %>% mutate(timeperdof=time/problemsize) ->df
df %>% filter(threads==19) ->df20
df %>% filter(threads==39) ->df40

Let’s look at some data

The best times without the GPU (for 20 and 40 threads):

df %>% filter(fusemax==0) %>% group_by(ps,threads) %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 4 x 4
## # Groups:   ps [2]
##      ps threads   first    last
##   <dbl>   <dbl>   <dbl>   <dbl>
## 1     6      19 0.00177 0.00177
## 2     6      39 0.00162 0.00162
## 3     8      19 0.00192 0.00192
## 4     8      39 0.00168 0.00168

The best times with the GPU:

df %>% filter(fusemax>0) %>% group_by(ps,threads)  %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 4 x 4
## # Groups:   ps [2]
##      ps threads   first    last
##   <dbl>   <dbl>   <dbl>   <dbl>
## 1     6      19 0.00144 0.00144
## 2     6      39 0.00109 0.00109
## 3     8      19 0.00125 0.00125
## 4     8      39 0.00112 0.00112

The data for 20 threads

df %>% filter(threads==19 & (fusemax ==0 | fusemax<500) ) %>% ggplot(aes(x=fusenum, y=timeperdof, color=factor(fusemax))) + geom_point(size=1) + facet_wrap(vars(ps))

Here’s a plot for 40 threads

df %>% filter(threads==39 & (fusemax ==0 | fusemax<500) ) %>% ggplot(aes(x=fusenum, y=timeperdof, color=factor(fusemax))) + geom_point(size=1) + facet_wrap(vars(ps))

Investigating the segfaults

We observe quite a number of runs exiting with a segfault; these show up with a non-positive time in the CSV, hence the sign(time) coloring. The plot shows for which parameter combinations this happens.

read_csv("data01.csv") ->dfall
dfall %>% ggplot(aes(x=fusenum, y=fusemax, color=factor(sign(time)))) + geom_point(size=1)+ facet_wrap(vars(threads))

All of the above, repeated for the cs = 0.2 dataset

library(tidyverse)
read_csv("data02.csv") %>% filter(time >0)  %>% mutate(timeperdof=time/problemsize) ->df
df %>% filter(threads==19) ->df20
df %>% filter(threads==39) ->df40

Let’s look at some data

The best times without the GPU (for 20 and 40 threads):

df20 %>% filter(fusemax==0) %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 1 x 2
##     first    last
##     <dbl>   <dbl>
## 1 0.00320 0.00320
df40 %>% filter(fusemax==0) %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 1 x 2
##     first    last
##     <dbl>   <dbl>
## 1 0.00325 0.00325

The best times with the GPU:

df20 %>% filter(fusemax>0) %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 1 x 2
##     first    last
##     <dbl>   <dbl>
## 1 0.00307 0.00307
df40 %>% filter(fusemax>0) %>% summarize(first=min(timeperdof), last=min(timeperdof))
## # A tibble: 1 x 2
##     first    last
##     <dbl>   <dbl>
## 1 0.00292 0.00292

The data for 20 threads

df %>% filter(threads==19 & (fusemax ==0 | fusemax<500) ) %>% ggplot(aes(x=fusenum, y=timeperdof, color=factor(fusemax))) + geom_point(size=1)

Here’s a plot for 40 threads

df %>% filter(threads==39 & (fusemax ==0 | fusemax<500) ) %>% ggplot(aes(x=fusenum, y=timeperdof, color=factor(fusemax))) + geom_point(size=1)

Investigating the segfaults

We observe quite a number of runs exiting with a segfault; these show up with a non-positive time in the CSV, hence the sign(time) coloring. The plot shows for which parameter combinations this happens.

read_csv("data02.csv") ->dfall
dfall %>% ggplot(aes(x=fusenum, y=fusemax, color=factor(sign(time)))) + geom_point(size=1)+ facet_wrap(vars(threads))