# Chunk defaults for the rendered report: show the code, hide messages and
# warnings, and cache the chunk results.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.3 ✔ purrr 0.3.4
## ✔ tibble 3.0.4 ✔ dplyr 1.0.2
## ✔ tidyr 1.1.2 ✔ stringr 1.4.0
## ✔ readr 1.4.0 ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrr)
library(reshape2)##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
To get the data from the CiGri DB:
\copy (SELECT id, campaign_id, state, start_time, stop_time, cluster_id, param_id FROM jobs WHERE submission_time >= '01-01-2017 00:00:00+02') to ./cigri_jobs_2017.csv CSV HEADER
# Load the job export and keep only the terminated jobs that have a campaign
# and both timestamps; the two timestamps are replaced by a single duration.
filename <- "./cigri_jobs_2017.csv"
# `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned.
df_jobs <- read_csv(filename, col_names = TRUE) %>%
  filter(state == "terminated") %>%
  drop_na(stop_time, start_time, campaign_id) %>%
  # Execution time of the job as a lubridate duration.
  mutate(duration = as.duration(stop_time - start_time)) %>%
  select(-state, -stop_time, -start_time)
filename_campaigns <- "./cigri_campaign_properties.csv"
# Load the campaign properties and reshape them so that the project, the
# walltime and the resources each become a column.
df_camp <- read_csv(filename_campaigns, col_names = TRUE) %>%
  select(-id) %>%
  # Keep only the three properties used in the rest of the analysis.
  filter(name %in% c("project", "walltime", "resources")) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  # Drop the rows in which any of the properties is missing.
  drop_na() %>%
  mutate(
    project = factor(project),
    walltime_dur = as.duration(hms::as_hms(walltime))
  ) %>%
  # Drop the rows for which no walltime duration is available.
  drop_na(walltime_dur) %>%
  select(campaign_id, project, walltime_dur, resources)
df_camp
filename_clusters <- "./cigri_clusters.csv"
# Load the cluster table, keeping only the identifier and the name of each
# cluster, renamed so that they can be joined to and told apart in the jobs.
df_clusters <- read_csv(filename_clusters, col_names = TRUE) %>%
  select(cluster_id = id, cluster_name = name)
# Attach the campaign and cluster information to every job; rows with any
# missing value after the joins are discarded.
df <- df_jobs %>%
  left_join(df_camp, by = "campaign_id") %>%
  left_join(df_clusters, by = "cluster_id") %>%
  drop_na()
We plot the histogram of the execution times of all the CiGri jobs.
# Histogram of the execution time of all the jobs, on a log10 x axis.
df %>%
  # after_stat() is the supported replacement for the `..count..` notation.
  ggplot(aes(x = duration, y = after_stat(count))) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Count") +
  ggtitle("Histogram of the Execution Times of CiGri jobs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")
# Histogram of the number of cores, cpus, hosts and gpus found in the
# `resources` string of the jobs, one facet per kind of resource.
df %>%
  mutate(
    # Each pattern extracts the digits that directly follow the resource
    # keyword and a comparison operator; the value is NA when nothing matches.
    core = as.numeric(str_extract(resources, "(?<=cores?(=|>|>=|<|<=))[[:digit:]]+")),
    cpu = as.numeric(str_extract(resources, "(?<=cpus?(=|>|>=|<|<=))[[:digit:]]+")),
    host = as.numeric(str_extract(resources, "(?<=hosts?(=|>|>=|<|<=))[[:digit:]]+")),
    gpu = as.numeric(str_extract(resources, "(?<=gpus?(=|>|>=|<|<=))[[:digit:]]+"))
  ) %>%
  select(id, core, cpu, host, gpu) %>%
  # Long format: one row per (job, kind of resource), missing values dropped.
  melt(id.vars = c("id"), na.rm = TRUE) %>%
  ggplot(aes(x = value, y = after_stat(count), fill = variable)) +
  facet_grid(variable ~ .) +
  scale_y_continuous(trans = "log10") +
  geom_histogram(binwidth = 1) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")
# Mean execution time of each project with an error bar of two standard
# errors on each side; the projects are ordered by their mean and colored by
# the log10 of their number of jobs.
df %>%
  group_by(project) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  mutate(
    ci_inf = mean_duration - 2 * sd_duration / sqrt(n),
    ci_sup = mean_duration + 2 * sd_duration / sqrt(n),
    project_name = fct_reorder(project, mean_duration)
  ) %>%
  ggplot(aes(x = project_name, y = mean_duration, color = log10(n))) +
  geom_errorbar(aes(ymin = ci_inf, ymax = ci_sup), width = .1) +
  geom_point() +
  scale_y_continuous(trans = "log10") +
  # The projects end up on the vertical axis.
  coord_flip() +
  ylab("Execution Time (s)") +
  xlab("Projects") +
  ggtitle("Distribution of Execution Time per project") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")
We now plot the execution times of jobs from the same project. To not plot everything, we filter out the projects that have less than 5000 jobs.
We also specify the cluster on which the jobs have been executed, as this can also be a factor for the execution time.
# Density histograms of the execution time, one panel per project and filled
# by cluster, restricted to the projects with more than 5000 jobs.
df %>%
  # add_count() stores the number of jobs of the project in `n` and leaves
  # the data ungrouped.
  add_count(project) %>%
  filter(n > 5000) %>%
  # after_stat() is the supported replacement for the `..density..` notation.
  ggplot(aes(x = duration, y = after_stat(density), fill = cluster_name)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  facet_wrap(. ~ project, scales = "free") +
  xlab("Execution Time (s)") +
  ylab("Density") +
  ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Cluster")
One hypothesis we have on Ctrl-CiGri is that jobs from the same project have similar execution times. This means we expect the distribution of the execution times to follow a sort of bell curve around a single value.
However, we can see that this is not the case for every project.
The most striking case is the Phyloalps project (center plot): the jobs from this project seem to have 3 modes.
In order to get back to our hypothesis, we will try to look at the parameters of the jobs in order to assign them one of those three modes. If we manage to do this, we will be able to “sort” the jobs of the project in order to have all the jobs from the first mode, then the second and finally the third.
# Density histogram of the execution time of the jobs of the Phyloalps
# project only, on a log10 x axis.
df %>%
  filter(project == "phyloalps") %>%
  # after_stat() is the supported replacement for the `..density..` notation.
  ggplot(aes(x = duration, y = after_stat(density))) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Density") +
  # The data is filtered to a single project, so the title names it.
  ggtitle("Histogram of the Execution Times of the PhyloAlps jobs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal")
The parameters of the jobs from the Phyloalps project have the following form:
# Sorted, de-duplicated identifiers of the campaigns of the Phyloalps project.
phyloalps_campaign_ids <- df %>%
  filter(project == "phyloalps") %>%
  pull(campaign_id) %>%
  unique() %>%
  sort()
# NOTE(review): "phyloapls" looks like a transposition of "phyloalps" —
# confirm against the name of the file on disk before changing it.
filename_params <- "./cigri_parameters_phyloapls.csv"
# Parameters of the Phyloalps campaigns together with the duration of the
# matching job; the name is also split at the dots into `base` and `extension`.
df_phyloalps <- read_csv(filename_params, col_names = TRUE) %>%
  filter(campaign_id %in% phyloalps_campaign_ids) %>%
  select(param_id = id, name) %>%
  left_join(df, by = "param_id") %>%
  # Parameters without a matching job have no duration and are dropped.
  drop_na(duration) %>%
  select(param_id, name, duration) %>%
  separate(
    name,
    into = c("base", "extension"),
    sep = "\\.",
    remove = FALSE,
    # A name without a dot is expected: it gets an NA extension.
    fill = "right"
  )
# Display the table with every column reversed, i.e. starting from its end.
df_phyloalps %>% map_df(rev)
# Stacked histogram of the execution time, filled by extension.
df_phyloalps %>%
  ggplot(aes(x = duration, fill = extension)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  # Nothing is mapped to y, so the bars are counts and not densities.
  ylab("Count") +
  ggtitle(
    "Histogram of the Execution Times of the PhyloAlps jobs per extension"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Extension")
# Same histogram with one facet per extension.
df_phyloalps %>%
  ggplot(aes(x = duration, fill = extension)) +
  facet_grid(extension ~ .) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Count") +
  ggtitle(
    "Histogram of the Execution Times of the PhyloAlps jobs per extension"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Extension")
# Keep the rows without an extension and go back to a single `name` column.
df_na <- df_phyloalps %>%
  filter(is.na(extension)) %>%
  mutate(name = base) %>%
  select(-base, -extension)
We then split the name of the parameters as follows:
<BASE>_<TYPE1>_<TYPE2>_full.fastq
In some cases, there is no <TYPE2>.
This is why we will only focus on the <BASE> and <TYPE1>.
(TODO Quentin: it would be good to show properly that the impact of <TYPE2> on the execution time is small …)
# Split the parameter names into their first three underscore-separated
# fields and turn `base` and `type1` into factors.
df_params <- df_na %>%
  separate(
    name,
    into = c("base", "type1", "type2"),
    # `sep` is a single regular expression; repeating it adds nothing.
    sep = "_",
    # Fields beyond the third one are discarded.
    extra = "drop",
    # Names with fewer than three fields get NA in the missing columns.
    fill = "right"
  ) %>%
  mutate(
    base = factor(base),
    type1 = factor(type1)
  )
df_params
We plot the mean execution time of the jobs for a given couple of parameters (<BASE>, <TYPE1>).
# One point per (base, type1) pair: the color is the mean duration of the
# pair minus the smallest mean, the size is the standard deviation, and both
# axes are ordered by the average of the means along them.
df_params %>%
  group_by(base, type1) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  filter(n > 0) %>% # So we can still see something
  # A new group_by() replaces the previous grouping, so a single ungroup()
  # at the end is enough.
  group_by(base) %>%
  mutate(base_mean_duration = mean(mean_duration)) %>%
  group_by(type1) %>%
  mutate(type1_mean_duration = mean(mean_duration)) %>%
  ungroup() %>%
  mutate(
    base_f = fct_reorder(base, base_mean_duration),
    type1_f = fct_reorder(type1, type1_mean_duration),
    time = mean_duration - min(mean_duration)
  ) %>%
  ggplot(aes(x = base_f, y = type1_f, color = time, size = sd_duration)) +
  geom_point() +
  # scale_color_gradient(low = "yellow", high = "red", na.value = NA) +
  scale_color_gradientn(name = "Mean Duration", colors = rainbow(5)) +
  labs(
    x = "Base",
    y = "Type1",
    title = "Mean Duration of the jobs from the PhyloAlps Project based on the Parameters"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom",
    legend.box = "horizontal"
  )
# set.seed(17440)
# Fixed seed so that the random starts of k-means are reproducible.
set.seed(38100)
# Two-cluster k-means on the durations, keeping the best of 25 random starts;
# the cluster of each job is stored as a factor next to its parameters.
km <- kmeans(df_params$duration, centers = 2, nstart = 25)
df_params$cluster <- as.factor(km$cluster)
# Histogram of the execution time, filled by k-means cluster.
df_params %>%
  ggplot(aes(x = duration, fill = cluster)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_continuous(trans = "log10") +
  xlab("Execution Time (s)") +
  ylab("Count") +
  ggtitle(
    "Histogram of the Execution Times of the PhyloAlps jobs per k-means cluster"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  scale_fill_discrete(name = "Cluster")
df_params
# One point per (base, type1, cluster) combination, colored by cluster; both
# axes are ordered by the average of the mean durations along them.
df_params %>%
  group_by(base, type1, cluster) %>%
  summarise(
    mean_duration = mean(duration),
    sd_duration = sd(duration),
    n = n()
  ) %>%
  filter(n > 0) %>% # So we can still see something
  group_by(base) %>%
  mutate(base_mean_duration = mean(mean_duration)) %>%
  ungroup() %>%
  group_by(type1) %>%
  mutate(type1_mean_duration = mean(mean_duration)) %>%
  ungroup() %>%
  # No `time` column here: this plot does not map the mean duration.
  mutate(
    base_f = fct_reorder(base, base_mean_duration),
    type1_f = fct_reorder(type1, type1_mean_duration)
  ) %>%
  ggplot(aes(x = base_f, y = type1_f, color = cluster)) +
  geom_point() +
  theme_bw() +
  xlab("Base") +
  ylab("Type1") +
  ggtitle(
    "K-means cluster of the jobs from the PhyloAlps Project based on the Parameters"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    legend.position = "bottom",
    legend.box = "horizontal"
  )