knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrr)
library(EnvStats)##
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
##
## predict, predict.lm
## The following object is masked from 'package:base':
##
## print.default
library(reshape2)##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(e1071)##
## Attaching package: 'e1071'
## The following objects are masked from 'package:EnvStats':
##
## kurtosis, skewness
library(Rfast2)## Loading required package: Rcpp
library(goftest)
library(ggdendro)
library(twosamples)
library(evd)
library(fitdistrplus)## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:EnvStats':
##
## boxcox
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: survival
set.seed(38100)To get the data from the CiGri DB:
\copy (SELECT jobs.id as job_id, project_table.campaign_id, start_time, stop_time, clusters.name as cluster_name, project_table.value as project FROM (jobs LEFT OUTER JOIN clusters ON (jobs.cluster_id = clusters.id)) LEFT OUTER JOIN (SELECT * FROM campaign_properties WHERE name = 'project' AND value != '')
filename <- "/home/quentin/these/data_cigri_ciment/cigri_jobs_2017.csv"
percentage_duration <- 0.975
percentage_n <- 0.05
df <- read_csv(filename, col_names = T) %>%
filter(is.na(tag)) %>%
mutate(duration = as.duration(stop_time - start_time)) %>%
dplyr::select(-stop_time, -start_time, -tag) %>%
drop_na() %>%
filter(duration < quantile(duration, percentage_duration)) %>%
group_by(campaign_id) %>%
mutate(number_of_samples = n()) %>%
ungroup() %>%
filter(number_of_samples > quantile(number_of_samples, percentage_n)) %>%
dplyr::select(-number_of_samples)We remove the 2.5 % of the jobs with greater duration as they are probably outliers.
We also remove the 5 % campaigns with the smallest number of jobs as they are probably just test campaigns.
We plot the histogram of the execution times of all the CiGri jobs.
df %>%
ggplot(aes(x = duration, y = ..count..)) +
geom_histogram(binwidth = 10) +
xlab("Execution Time (s)") +
ylab("Count") +
ggtitle("Histogram of the Execution Times of CiGri jobs") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal")df %>%
group_by(project, campaign_id) %>%
summarise(
n = n(),
mean_duration = mean(duration),
sd_duration = sd(duration)
) %>%
ggplot(aes(x = n, y = mean_duration, color = project)) +
geom_point() +
ggtitle("Execution Time and Number of jobs per campaign") +
xlab("Number of jobs per campaign") +
ylab("Mean Execution Time (s)") +
scale_color_discrete(name = "Project") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal")data_campaigns <- df %>%
group_by(project, campaign_id) %>%
summarise(
n = n(),
mean_duration = mean(duration),
sd_duration = sd(duration)
)
mean_campaign_n <- mean(data_campaigns$n)
mean_campaign_duration <- mean(data_campaigns$mean_duration)So the mean campaign has 2.694012^{4} jobs with a mean execution time of 731.2373443
df %>%
group_by(campaign_id) %>%
summarise(n = n()) %>%
ggplot(aes(x = n)) +
stat_ecdf(geom = "step") +
geom_vline(xintercept = mean_campaign_n, linetype = "dashed", color = "red") +
xlab("Number of jobs per campaign") +
ylab("CDF") +
ggtitle("ECDF for the number of jobs per campaign") +
theme_bw()df %>%
group_by(campaign_id) %>%
summarise(n = mean(duration)) %>%
ggplot(aes(x = n)) +
stat_ecdf(geom = "step") +
geom_vline(xintercept = mean_campaign_duration, linetype = "dashed", color = "red") +
xlab("Mean duration of jobs in a campaign") +
ylab("CDF") +
ggtitle("ECDF for the mean duration of the jobs per campaign") +
theme_bw()df %>%
group_by(project) %>%
summarise(
mean_duration = mean(duration),
sd_duration = sd(duration),
n = n()
) %>%
mutate(
ci_inf = mean_duration - 2 * sd_duration / sqrt(n),
ci_sup = mean_duration + 2 * sd_duration / sqrt(n),
project_name = fct_reorder(project, mean_duration)
) %>%
ggplot(aes(x = project_name, y = mean_duration, color = log10(n))) +
geom_errorbar(aes(ymin = ci_inf, ymax = ci_sup), width=.1) +
geom_point() +
coord_flip() +
ylab("Execution Time (s)") +
xlab("Projects") +
ggtitle("Distribution of Execution Time per project") +
scale_color_continuous(name = "Log10 of Number of jobs") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal")We now plot the execution times of jobs from the same project. To not plot everything, we filter out the projects that have less than 5000 jobs.
We also specify the cluster on which the jobs have been executed, as this can also be a factor for the execution time.
df %>%
group_by(project) %>%
filter(n() > 5000) %>%
ungroup() %>%
ggplot(aes(x = duration, fill = cluster_name)) +
geom_histogram(bins = 30) +
facet_wrap(.~ project, scales = "free", ncol = 4) +
xlab("Execution Time (s)") +
ylab("Density") +
ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal") +
scale_fill_discrete(name = "Cluster")One hypothesis we have on Ctrl-CiGri is that jobs from the same project have similar execution times. This means we expect the distribution of the execution times to follow a sort of bell curve around a single value.
However, we can see that this is not the case for every project.
The most striking is the Phyloalps project (center plot). The jobs from this project seem to have 3 modes
In order to get back to our hypothesis, we will try to look at the parameters of the jobs in order to assign them one of those three modes. If we manage to do this, we will be able to “sort” the jobs of the project in order to have all the jobs from the first mode, then the second and finally the third.
display_full_histo <- function(project_name) {
df_project = df %>%
filter(project == project_name)
nb_bins = df_project %>%
group_by(project) %>%
summarise(nbins = ceiling(log2(n()) + 1)) %>%
dplyr::select(nbins) %>%
deframe()
bin_width = df_project %>%
group_by(project) %>%
summarise(h = 3.5 * sd(duration) * n() ^ (-1/3)) %>%
dplyr::select(h) %>%
deframe()
# nb_bins_doane = df_project %>%
# group_by(project) %>%
# summarise(
# nbins = ceiling(
# 1 + log2(n()) + log2(1 + abs(skewness(duration, type = 1) / sqrt((6 * (n() - 2)) / ((n() + 1) * (n() + 3)))))
# )
# ) %>%
# dplyr::select(nbins) %>%
# deframe()
plot_full = df_project %>%
ggplot(aes(x = duration, fill = cluster_name)) +
geom_histogram(bins = nb_bins) +
xlab("Execution Time (s)") +
ylab("Count") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal") +
scale_fill_discrete(name = "Cluster")
plot_full
}
display_all_histo <- function(project_name, n_min = 0) {
df_project = df %>%
filter(project == project_name)
nb_bins = df_project %>%
group_by(project) %>%
summarise(nbins = ceiling(log2(n()) + 1)) %>%
dplyr::select(nbins) %>%
deframe()
plot_per_campaign = df_project %>%
group_by(campaign_id) %>%
filter(n() > n_min) %>%
ungroup() %>%
ggplot(aes(x = duration, fill = cluster_name)) +
geom_histogram(bins = nb_bins) +
facet_wrap(. ~ campaign_id, ncol = 3) +
xlab("Execution Time (s)") +
ylab("Count") +
theme_bw() +
theme(legend.position = "bottom", legend.box = "horizontal") +
scale_fill_discrete(name = "Cluster")
plot_per_campaign
}display_full_histo("rockyfor3d") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the rockyfor3d project")display_all_histo("rockyfor3d") +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the rockyfor3d project")display_full_histo("eventdetection") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the eventdetection project")display_all_histo("eventdetection", n_min = 1) +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the eventdetection project")display_full_histo("seiscope") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the seiscope project")display_all_histo("seiscope", n_min = 100) +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the seiscope project")display_full_histo("snowem") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the snowem project")display_all_histo("snowem") +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the snowem project")display_full_histo("teembio") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the teembio project")display_all_histo("teembio", n_min = 300) +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the teembio project")display_full_histo("phyloalps") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the phyloalps project")display_all_histo("phyloalps") +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the phyloalps project")display_full_histo("f-image") +
ggtitle("Histogram of the Execution Times of CiGri jobs for the f-image project")display_all_histo("f-image") +
ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the f-image project")ks_distance <- function(samples_x, samples_y) {
cvm_stat(unlist(samples_x), unlist(samples_y))
}
get_clusters <- function(samples_l, k) {
N <- length(samples_l)
dist_mat <- matrix(0, ncol = N, nrow = N)
for (i in 1:N) {
for (j in 1:N) {
dist_mat[i, j] <- ks_distance(samples_l[i], samples_l[j])
}
}
dist_mat <- as.dist(dist_mat)
hclust_avg <- hclust(dist_mat, method = 'complete')
cutree(hclust_avg, k = k)
}
k <- 5
nb_of_sampling <- 30
nb_iter <- 1
df <- df %>% filter(project != "fate") %>%
group_by(project, campaign_id, cluster_name) %>%
filter(n() >= nb_of_sampling) %>%
ungroup()
df_pvalues <- df %>%
group_by(project, cluster_name) %>%
summarise(
weibull_sum = 0,
gamma_sum = 0,
lnorm_sum = 0,
norm_sum = 0,
frechet_sum = 0,
) %>% crossing(tibble(cluster = seq(1, k)))
for (i in 1:nb_iter) {
df_sample <- df %>%
group_by(project, campaign_id, cluster_name) %>%
filter(job_id %in% sample(job_id, nb_of_sampling)) %>%
mutate(duration_num = as.numeric(duration)) %>%
summarise(samples_l = list(duration_num)) %>%
ungroup() %>%
mutate(
cluster = get_clusters(samples_l, k)
) %>%
group_by(project, cluster_name, cluster) %>%
summarise(
all_samples = list(reduce(flatten(list(samples_l)), c, .init = c())),
n = length(unlist(all_samples)),
# Weibull Distibution -----------------------------------------------------
shape_w = (fitdist(unlist(all_samples), "weibull"))$estimate[1],
scale_w = (fitdist(unlist(all_samples), "weibull"))$estimate[2],
weibull = cvm.test(unlist(all_samples), "pweibull", shape = shape_w, scale = scale_w)$p.value,
# Gamma Distribution ------------------------------------------------------
shape_g = egamma(unlist(all_samples))$parameters[1],
scale_g = egamma(unlist(all_samples))$parameters[2],
gamma = cvm.test(unlist(all_samples), "pgamma", shape = shape_g, rate = 1/scale_g)$p.value,
# LogNormal Distribution --------------------------------------------------
shape_l = (fitdist(unlist(all_samples), "lnorm"))$estimate[1],
scale_l = (fitdist(unlist(all_samples), "lnorm"))$estimate[2],
lnorm = cvm.test(unlist(all_samples), "plnorm", shape_l, scale_l)$p.value,
# Normal Distribution -----------------------------------------------------
shape_n = (fitdist(unlist(all_samples), "norm"))$estimate[1],
scale_n = (fitdist(unlist(all_samples), "norm"))$estimate[2],
norm = cvm.test(unlist(all_samples), "pnorm", shape_n, scale_n)$p.value,
# Frechet Distribution ----------------------------------------------------
shape_f = eweibull((unlist(map(all_samples, function(x) {1/x}))))$parameters[1],
scale_f = eweibull((unlist(map(all_samples, function(x) {1/x}))))$parameters[2],
frechet = cvm.test(unlist(all_samples), "pfrechet", shape = shape_f, scale = 1 / scale_f)$p.value,
) %>%
dplyr::select(project, cluster_name, cluster, weibull, gamma, lnorm, norm, frechet) %>%
group_by(project, cluster_name) %>%
arrange(cluster) %>%
mutate(cluster = row_number()) %>%
ungroup()
df_pvalues <- df_pvalues %>%
left_join(df_sample, by = c("project", "cluster_name", "cluster")) %>%
mutate(
weibull_sum = weibull_sum + weibull,
gamma_sum = gamma_sum + gamma,
lnorm_sum = lnorm_sum + lnorm,
norm_sum = norm_sum + norm,
frechet_sum = frechet_sum + frechet
) %>%
dplyr::select(-weibull, -gamma, - lnorm, -norm, -frechet)
}
df_pvalues <- df_pvalues %>%
mutate(
Weibull = weibull_sum / nb_iter,
Gamma = gamma_sum / nb_iter,
Lnorm = lnorm_sum / nb_iter,
Norm = norm_sum / nb_iter,
Frechet = frechet_sum / nb_iter,
) %>%
dplyr::select(project, cluster_name, cluster, Weibull, Gamma, Lnorm, Norm, Frechet) %>%
drop_na(Weibull, Gamma, Lnorm, Norm, Frechet)
df_pvaluesdf_pvalues %>%
melt(id.vars = c("project", "cluster_name", "cluster")) %>%
ggplot(aes(x = factor(cluster), y = value, fill = factor(variable))) +
geom_col(position = "dodge") +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
facet_grid(project ~ cluster_name, scale = "free_x") +
theme_bw() +
ggtitle("Model per project per cluster") +
xlab("#Cluster") +
ylab("p-value") +
ylim(0, 1) +
scale_fill_discrete(name = "Distribution") +
theme(legend.position = "bottom", legend.box = "horizontal")df_pvalues %>%
melt(id.vars = c("project", "cluster_name", "cluster")) %>%
group_by(project, cluster_name, cluster) %>%
summarise(
max = max(value),
type = variable[which.max(value)]
) %>%
ggplot(aes(x = factor(cluster), y = max, fill = factor(type))) +
geom_col() +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
facet_grid(project ~ cluster_name, scale = "free_x") +
theme_bw() +
ggtitle("Model per project per cluster") +
xlab("#Cluster") +
ylab("p-value") +
ylim(0, 1) +
scale_fill_discrete(name = "Distribution") +
theme(legend.position = "bottom", legend.box = "horizontal")df_pvalues %>%
melt(id.vars = c("project", "cluster_name", "cluster")) %>%
group_by(project, cluster_name, cluster) %>%
summarise(
max = max(value),
type = variable[which.max(value)]
)