1 Importing the libraries

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✔ ggplot2 3.3.3     ✔ purrr   0.3.4
## ✔ tibble  3.0.4     ✔ dplyr   1.0.2
## ✔ tidyr   1.1.2     ✔ stringr 1.4.0
## ✔ readr   1.4.0     ✔ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(corrr)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

2 Importing the data

To get the data from the CiGri DB:

\copy (SELECT id, campaign_id, state, start_time, stop_time, cluster_id, param_id FROM jobs WHERE submission_time >= '01-01-2017 00:00:00+02') to ./cigri_jobs_2017.csv CSV HEADER

# Load the raw CiGri job export and keep the jobs whose state is "terminated",
# replacing their start/stop timestamps with the elapsed execution time.
filename <- "./cigri_jobs_2017.csv"

df_jobs <- read_csv(filename, col_names = TRUE) %>%
  filter(state == "terminated") %>%
  # A duration needs both timestamps; campaign_id is the key used further
  # down to attach the campaign properties.
  drop_na(stop_time, start_time, campaign_id) %>%
  mutate(duration = as.duration(stop_time - start_time)) %>%
  # The state and the raw timestamps are not used once `duration` exists.
  select(-state, -stop_time, -start_time)

# Load the campaign properties (presumably one row per campaign/property pair,
# given the name/value columns) and reshape them into one row per campaign
# holding its project, walltime and requested resources.
filename_campaigns <- "./cigri_campaign_properties.csv"

df_camp <- read_csv(filename_campaigns, col_names = TRUE) %>%
  # `id` must go before pivoting: pivot_wider() treats every remaining column
  # as an identifier, so a per-row id would keep each property on its own row.
  select(-id) %>%
  filter(name %in% c("project", "walltime", "resources")) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  # Keep only the campaigns for which all three properties are known.
  drop_na() %>%
  mutate(
    project = factor(project),
    walltime_dur = as.duration(hms::as_hms(walltime))
  ) %>%
  # Discard walltimes that could not be converted to a duration.
  drop_na(walltime_dur) %>%
  select(campaign_id, project, walltime_dur, resources)

df_camp

# Load the cluster table and keep only its identifier and name, renamed so
# that they can be joined to the jobs on `cluster_id`.
filename_clusters <- "./cigri_clusters.csv"

df_clusters <- read_csv(filename_clusters, col_names = TRUE) %>%
  select(cluster_id = id, cluster_name = name)

# Attach the campaign and cluster attributes to every job. Jobs without a
# matching campaign or cluster are discarded, as is any row that still has a
# missing value in one of its columns.
df <- df_jobs %>%
  inner_join(df_camp, by = "campaign_id") %>%
  inner_join(df_clusters, by = "cluster_id") %>%
  drop_na()

3 Plotting the distribution of Execution Time

We plot the histogram of the execution times of all the CiGri jobs.

# Histogram of the execution time of every job in `df`. The x axis is on a
# log10 scale, so the bin width is expressed in log10 units.
ggplot(df, aes(x = duration, y = after_stat(count))) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  labs(
    x = "Execution Time (s)",
    y = "Count",
    title = "Histogram of the Execution Times of CiGri jobs"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")

# Histogram of the number of cores / cpus / hosts / gpus requested by the
# jobs, one facet per resource type, with the counts on a log10 scale.
# Each number is the run of digits that follows "<keyword>" plus a comparison
# operator in the `resources` string; jobs that do not mention a given
# keyword yield NA for it and are dropped when melting.
df %>%
  transmute(
    id,
    core = as.numeric(str_extract(resources, "(?<=cores?(=|>|>=|<|<=))[[:digit:]]+")),
    cpu = as.numeric(str_extract(resources, "(?<=cpus?(=|>|>=|<|<=))[[:digit:]]+")),
    host = as.numeric(str_extract(resources, "(?<=hosts?(=|>|>=|<|<=))[[:digit:]]+")),
    gpu = as.numeric(str_extract(resources, "(?<=gpus?(=|>|>=|<|<=))[[:digit:]]+"))
  ) %>%
  # Long format: one row per (job, resource type) with a non-missing value.
  melt(id.vars = c("id"), na.rm = TRUE) %>%
  ggplot(aes(x = value, y = after_stat(count), fill = variable)) +
  geom_histogram(binwidth = 1) +
  facet_grid(rows = vars(variable)) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")

4 Distribution of the Execution Time per Project

# Mean execution time per project with an error bar of +/- 2 standard errors.
# Projects are ordered by their mean and coloured by log10 of their job count.
df %>%
  group_by(project) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  mutate(
    # Half-width of the interval: two standard errors of the mean.
    half_width = 2 * sd_duration / sqrt(n),
    ci_inf = mean_duration - half_width,
    ci_sup = mean_duration + half_width,
    project_name = fct_reorder(project, mean_duration)
  ) %>%
  ggplot(aes(x = project_name, y = mean_duration, color = log10(n))) +
  geom_errorbar(aes(ymin = ci_inf, ymax = ci_sup), width = .1) +
  geom_point() +
  scale_y_log10() +
  # Projects go on the vertical axis so that their names stay readable.
  coord_flip() +
  labs(
    x = "Projects",
    y = "Execution Time (s)",
    title = "Distribution of Execution Time per project"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")

5 Histograms of the execution time per project (of at least 5000 jobs)

We now plot the execution times of jobs from the same project. To keep the figure readable, we filter out the projects that have fewer than 5000 jobs.

We also specify the cluster on which the jobs have been executed, as this can also be a factor for the execution time.

# Density histogram of the execution times, one panel per project, with the
# bars filled by the cluster that ran the jobs.
# Only projects with at least 5000 jobs are shown, as announced in the
# section title (hence `>=` rather than `>`).
df %>%
  group_by(project) %>%
  filter(n() >= 5000) %>%
  ungroup() %>%
  ggplot(aes(x = duration, y = after_stat(density), fill = cluster_name)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  # Free scales: each panel gets its own axis ranges.
  facet_wrap(. ~ project, scales = "free") +
  xlab("Execution Time (s)") +
  ylab("Density") +
  ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Cluster")

One hypothesis we have on Ctrl-CiGri is that jobs from the same project have similar execution times. This means we expect the distribution of the execution times to follow a sort of bell curve around a single value.

However, we can see that this is not the case for every project.

The most striking example is the Phyloalps project (center plot): the execution times of its jobs seem to have three modes.

In order to get back to our hypothesis, we will try to look at the parameters of the jobs in order to assign them one of those three modes. If we manage to do this, we will be able to “sort” the jobs of the project in order to have all the jobs from the first mode, then the second and finally the third.

6 The Phyloalps Project

# Density histogram of the execution times of the jobs whose project is
# "phyloalps", on a log10 x axis.
# NOTE(review): the title still reads "per project" although a single project
# is plotted; confirm whether that is intended.
phyloalps_jobs <- filter(df, project == "phyloalps")

ggplot(phyloalps_jobs, aes(x = duration, y = after_stat(density))) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  labs(
    x = "Execution Time (s)",
    y = "Density",
    title = "Histogram of the Execution Times of CiGri jobs per project"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")

6.1 Intro and preparing the data

The parameters of the jobs from the Phyloalps project have the following form:

# Sorted vector of the distinct campaign ids that belong to the "phyloalps"
# project.
phyloalps_campaign_ids <- df %>%
  filter(project == "phyloalps") %>%
  pull(campaign_id) %>%
  unique() %>%
  sort()

# Load the job parameters, keep those of the Phyloalps campaigns, attach the
# execution time of the matching job and split the parameter name at its
# first dot into `base` and `extension`.
# NOTE(review): the file name is spelled "phyloapls"; confirm it matches the
# file on disk.
filename_params <- "./cigri_parameters_phyloapls.csv"
df_phyloalps <- read_csv(filename_params, col_names = TRUE) %>%
  filter(campaign_id %in% phyloalps_campaign_ids) %>%
  select(param_id = id, name) %>%
  left_join(df, by = "param_id") %>%
  # Parameters without a matching job in `df` have no duration.
  drop_na(duration) %>%
  select(param_id, name, duration) %>%
  separate(
    name,
    into = c("base", "extension"),
    sep = "\\.",
    remove = FALSE,
    # Anything after a second dot is discarded.
    extra = "drop",
    # Names without a dot get an NA extension; the df_na step relies on this.
    fill = "right"
  )

# Show the table with the order of its rows reversed.
df_phyloalps %>% map_df(rev)

# Stacked histogram of the Phyloalps execution times, filled by the extension
# found in the parameter name. No y aesthetic is mapped, so the bars show raw
# counts, hence the "Count" axis label.
df_phyloalps %>%
  ggplot(aes(x = duration, fill = extension)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Count") +
  ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Extension")

# Same histogram as a single-panel view would give, but with one row of
# panels per extension so that the distributions can be compared separately.
# No y aesthetic is mapped, so the bars show raw counts, hence the "Count"
# axis label.
df_phyloalps %>%
  ggplot(aes(x = duration, fill = extension)) +
  facet_grid(extension ~ .) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Count") +
  ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Extension")

# Parameters whose name has no extension. For these rows `base` is what the
# split produced from the name, so it takes over the `name` column and the
# helper columns are dropped.
df_na <- df_phyloalps %>%
  filter(is.na(extension)) %>%
  select(-name, -extension) %>%
  rename(name = base)

We then split the name of the parameters as follows:

<BASE>_<TYPE1>_<TYPE2>_full.fastq

In some cases, there is no <TYPE2>.

This is why we will only focus on the <BASE> and <TYPE1>.

(TODO Quentin: it would be good to properly show that the impact of <TYPE2> on the execution time is small …)

# Split the parameter name on underscores into its first three components and
# turn the two that are analysed (`base`, `type1`) into factors.
df_params <- df_na %>%
  separate(
    name,
    into = c("base", "type1", "type2"),
    # `sep` must be a single regular expression, not one entry per split.
    sep = "_",
    # Components beyond the third are discarded.
    extra = "drop",
    # Names with fewer than three components get NA in the missing ones.
    fill = "right"
  ) %>%
  mutate(
    base = factor(base),
    type1 = factor(type1)
  )

df_params

6.2 Plotting the execution time per parameter

We plot the mean execution time of the jobs for a given couple of parameters (<BASE>, <TYPE1>).

# One point per (base, type1) pair: the colour is the mean duration of the
# pair minus the smallest mean, the size is the standard deviation of its
# durations. Both axes are ordered by the average of the per-pair means.
df_params %>%
  group_by(base, type1) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  # n > 0 keeps every pair; raise the threshold to hide the sparsely
  # populated ones.
  filter(n > 0) %>%
  # Regrouping replaces the previous grouping, so no ungroup() is needed
  # between the two per-axis averages.
  group_by(base) %>%
  mutate(base_mean_duration = mean(mean_duration)) %>%
  group_by(type1) %>%
  mutate(type1_mean_duration = mean(mean_duration)) %>%
  ungroup() %>%
  mutate(
    base_f = fct_reorder(base, base_mean_duration),
    type1_f = fct_reorder(type1, type1_mean_duration),
    time = mean_duration - min(mean_duration)
  ) %>%
  ggplot(aes(x = base_f, y = type1_f, color = time, size = sd_duration)) +
  geom_point() +
  # scale_color_gradient(low = "yellow", high = "red", na.value = NA) +
  scale_color_gradientn(name = "Mean Duration", colors = rainbow(5)) +
  labs(
    x = "Base",
    y = "Type1",
    title = "Mean Duration of the jobs from the PhyloAlps Project based on the Parameters"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

6.3 Trying to cluster the parameters

# Two-centre k-means on the raw durations, with 25 random starts. The seed is
# fixed so that the cluster labels are the same from one run to the next.
# set.seed(17440)
set.seed(38100)

km <- kmeans(df_params[["duration"]], centers = 2, nstart = 25)

# Store the cluster assigned to each job as a factor column.
df_params <- df_params %>%
  mutate(cluster = as.factor(km$cluster))

# Histogram (counts) of the execution times in `df_params`, filled by the
# k-means cluster assigned to each job, on a log10 x axis.
ggplot(df_params, aes(x = duration, fill = cluster)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  scale_fill_discrete(name = "Cluster") +
  labs(
    x = "Execution Time (s)",
    y = "Count",
    title = "Histogram of the Execution Times of CiGri jobs per project"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")

# Print the parameter table, which now carries the cluster column.
df_params

# One point per (base, type1, cluster) combination, coloured by cluster, with
# both axes ordered by the average of the per-combination mean durations.
df_params %>%
  group_by(base, type1, cluster) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  # n > 0 keeps every combination; raise the threshold to hide the sparsely
  # populated ones.
  filter(n > 0) %>%
  # Regrouping replaces the previous grouping, so no ungroup() is needed
  # between the two per-axis averages.
  group_by(base) %>%
  mutate(base_mean_duration = mean(mean_duration)) %>%
  group_by(type1) %>%
  mutate(type1_mean_duration = mean(mean_duration)) %>%
  ungroup() %>%
  mutate(
    base_f = fct_reorder(base, base_mean_duration),
    type1_f = fct_reorder(type1, type1_mean_duration),
    time = mean_duration - min(mean_duration)
  ) %>%
  ggplot(aes(x = base_f, y = type1_f, color = cluster)) +
  geom_point() +
  labs(
    x = "Base",
    y = "Type1",
    title = "Mean Duration of the jobs from the PhyloAlps Project based on the Parameters"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom",
    legend.box = "horizontal"
  )

What is going on with CiGri ?

Quentin