1 Importing the libraries

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(corrr)
library(EnvStats)
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## The following object is masked from 'package:base':
## 
##     print.default
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:EnvStats':
## 
##     kurtosis, skewness
library(Rfast2)
## Loading required package: Rcpp
library(goftest)
library(ggdendro)
library(twosamples)
library(evd)
library(fitdistrplus)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:EnvStats':
## 
##     boxcox
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: survival
set.seed(38100)

2 Importing the data

To get the data from the CiGri DB:

\copy (SELECT jobs.id as job_id, project_table.campaign_id, start_time, stop_time, clusters.name as cluster_name, project_table.value as project FROM (jobs LEFT OUTER JOIN clusters ON (jobs.cluster_id = clusters.id)) LEFT OUTER JOIN (SELECT * FROM campaign_properties WHERE name = 'project' AND value != '')
filename <- "/home/quentin/these/data_cigri_ciment/cigri_jobs_2017.csv"

percentage_duration <- 0.975
percentage_n <- 0.05

df <- read_csv(filename, col_names = T) %>%
    filter(is.na(tag)) %>%
    mutate(duration = as.duration(stop_time - start_time)) %>%
    dplyr::select(-stop_time, -start_time, -tag) %>%
    drop_na() %>%
    filter(duration < quantile(duration, percentage_duration)) %>%
    group_by(campaign_id) %>%
    mutate(number_of_samples = n()) %>%
    ungroup() %>%
    filter(number_of_samples > quantile(number_of_samples, percentage_n)) %>%
    dplyr::select(-number_of_samples)

We remove the 2.5 % of the jobs with greater duration as they are probably outliers.

We also remove the 5 % campaigns with the smallest number of jobs as they are probably just test campaigns.

3 Plotting the distribution of Execution Time

We plot the histogram of the execution times of all the CiGri jobs.

df %>%
    ggplot(aes(x = duration, y = ..count..)) +
    geom_histogram(binwidth = 10) +
    xlab("Execution Time (s)") +
    ylab("Count") +
    ggtitle("Histogram of the Execution Times of CiGri jobs") +
    theme_bw() +
    theme(legend.position = "bottom", legend.box = "horizontal")

4 Execution Time and Number of jobs per campaigns

df %>%
    group_by(project, campaign_id) %>%
    summarise(
        n = n(),
        mean_duration = mean(duration),
        sd_duration = sd(duration)
    ) %>%
    ggplot(aes(x = n, y = mean_duration, color = project)) +
    geom_point() +
    ggtitle("Execution Time and Number of jobs per campaign") +
    xlab("Number of jobs per campaign") +
    ylab("Mean Execution Time (s)") +
    scale_color_discrete(name = "Project") +
    theme_bw() +
    theme(legend.position = "bottom", legend.box = "horizontal")

data_campaigns <- df %>%
    group_by(project, campaign_id) %>%
    summarise(
        n = n(),
        mean_duration = mean(duration),
        sd_duration = sd(duration)
    )
mean_campaign_n <- mean(data_campaigns$n)
mean_campaign_duration <- mean(data_campaigns$mean_duration)

So the mean campaign has 2.694012^{4} jobs with a mean execution time of 731.2373443

df %>%
  group_by(campaign_id) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = n)) +
  stat_ecdf(geom = "step") +
  geom_vline(xintercept = mean_campaign_n, linetype = "dashed", color = "red") +
  xlab("Number of jobs per campaign") +
  ylab("CDF") +
  ggtitle("ECDF for the number of jobs per campaign") +
  theme_bw()

df %>%
  group_by(campaign_id) %>%
  summarise(n = mean(duration)) %>%
  ggplot(aes(x = n)) +
  stat_ecdf(geom = "step") +
  geom_vline(xintercept = mean_campaign_duration, linetype = "dashed", color = "red") +
  xlab("Mean duration of jobs in a campaign") +
  ylab("CDF") +
  ggtitle("ECDF for the mean duration of the jobs per campaign") +
  theme_bw()

5 Distribution of the Execution Time per Project

df %>%
    group_by(project) %>%
    summarise(
        mean_duration = mean(duration),
        sd_duration = sd(duration),
        n = n()
    ) %>%
    mutate(
       ci_inf = mean_duration - 2 * sd_duration / sqrt(n),
       ci_sup = mean_duration + 2 * sd_duration / sqrt(n),
       project_name = fct_reorder(project, mean_duration)
    ) %>%
    ggplot(aes(x = project_name, y = mean_duration, color = log10(n))) +
    geom_errorbar(aes(ymin = ci_inf, ymax = ci_sup), width=.1) +
    geom_point() +
    coord_flip() +
    ylab("Execution Time (s)") +
    xlab("Projects") +
    ggtitle("Distribution of Execution Time per project") +
    scale_color_continuous(name = "Log10 of Number of jobs") +
    theme_bw() +
    theme(legend.position = "bottom", legend.box = "horizontal")

6 Histograms of the execution time per project (of at least 5000 jobs)

We now plot the execution times of jobs from the same project. To not plot everything, we filter out the projects that have less than 5000 jobs.

We also specify the cluster on which the jobs have been executed, as this can also be a factor for the execution time.

df %>%
    group_by(project) %>%
    filter(n() > 5000) %>%
    ungroup() %>%
    ggplot(aes(x = duration, fill = cluster_name)) +
    geom_histogram(bins = 30) +
    facet_wrap(.~ project, scales = "free", ncol = 4) +
    xlab("Execution Time (s)") +
    ylab("Density") +
    ggtitle("Histogram of the Execution Times of CiGri jobs per project") +
    theme_bw() +
    theme(legend.position = "bottom", legend.box = "horizontal") +
    scale_fill_discrete(name = "Cluster")

One hypothesis we have on Ctrl-CiGri is that jobs from the same project have similar execution times. This means we expect the distribution of the execution times to follow a sort of bell curve around a single value.

However, we can see that this is not the case for every project.

The most striking is the Phyloalps project (center plot). The jobs from this project seem to have 3 modes

In order to get back to our hypothesis, we will try to look at the parameters of the jobs in order to assign them one of those three modes. If we manage to do this, we will be able to “sort” the jobs of the project in order to have all the jobs from the first mode, then the second and finally the third.

display_full_histo <- function(project_name) {
    df_project = df %>%
        filter(project == project_name)

    nb_bins = df_project %>%
      group_by(project) %>%
      summarise(nbins = ceiling(log2(n()) + 1)) %>%
      dplyr::select(nbins) %>%
      deframe()

    bin_width = df_project %>%
      group_by(project) %>%
      summarise(h = 3.5 * sd(duration) * n() ^ (-1/3)) %>%
      dplyr::select(h) %>%
      deframe()

   #  nb_bins_doane = df_project %>%
   #    group_by(project) %>%
   #    summarise(
   #      nbins = ceiling(
   #          1 + log2(n()) + log2(1 + abs(skewness(duration, type = 1) / sqrt((6 * (n() - 2)) / ((n() + 1) * (n() + 3)))))
   #      )
   #    ) %>%
   #    dplyr::select(nbins) %>%
   #    deframe()

    plot_full = df_project %>%
        ggplot(aes(x = duration, fill = cluster_name)) +
        geom_histogram(bins = nb_bins) +
        xlab("Execution Time (s)") +
        ylab("Count") +
        theme_bw() +
        theme(legend.position = "bottom", legend.box = "horizontal") +
        scale_fill_discrete(name = "Cluster")
    plot_full
}

display_all_histo <- function(project_name, n_min = 0) {
    df_project = df %>%
        filter(project == project_name)

    nb_bins = df_project %>%
      group_by(project) %>%
      summarise(nbins = ceiling(log2(n()) + 1)) %>%
      dplyr::select(nbins) %>%
      deframe()

    plot_per_campaign = df_project %>%
        group_by(campaign_id) %>%
        filter(n() > n_min) %>%
        ungroup() %>%
        ggplot(aes(x = duration, fill = cluster_name)) +
        geom_histogram(bins = nb_bins) +
        facet_wrap(. ~ campaign_id, ncol = 3) +
        xlab("Execution Time (s)") +
        ylab("Count") +
        theme_bw() +
        theme(legend.position = "bottom", legend.box = "horizontal") +
        scale_fill_discrete(name = "Cluster")
    plot_per_campaign
}

7 Some Examples of Projects

7.1 The rockyfor3d Project

display_full_histo("rockyfor3d") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the rockyfor3d project")

display_all_histo("rockyfor3d") +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the rockyfor3d project")

7.2 The eventdetection Project

display_full_histo("eventdetection") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the eventdetection project")

display_all_histo("eventdetection", n_min = 1) +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the eventdetection project")

7.3 The Seiscope Project

display_full_histo("seiscope") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the seiscope project")

display_all_histo("seiscope", n_min = 100) +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the seiscope project")

7.4 The Snowem Project

display_full_histo("snowem") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the snowem project")

display_all_histo("snowem") +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the snowem project")

7.5 The Teembio Project

display_full_histo("teembio") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the teembio project")

display_all_histo("teembio", n_min = 300) +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the teembio project")

7.6 The Phyloalps Project

display_full_histo("phyloalps") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the phyloalps project")

display_all_histo("phyloalps") +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the phyloalps project")

7.7 The f-image Project

display_full_histo("f-image") +
    ggtitle("Histogram of the Execution Times of CiGri jobs for the f-image project")

display_all_histo("f-image") +
    ggtitle("Histograms of the Execution Times of CiGri jobs per campaign for the f-image project")

8 Estimation of the distibution of the campaigns

ks_distance <- function(samples_x, samples_y) {
  cvm_stat(unlist(samples_x), unlist(samples_y))
}

get_clusters <- function(samples_l, k) {
  N <- length(samples_l)
  dist_mat <- matrix(0, ncol = N, nrow = N)
  for (i in 1:N) {
    for (j in 1:N) {
      dist_mat[i, j] <- ks_distance(samples_l[i], samples_l[j])
    }
  }
  dist_mat <- as.dist(dist_mat)
  hclust_avg <- hclust(dist_mat, method = 'complete')
  cutree(hclust_avg, k = k)
}
k <- 5
nb_of_sampling <- 30
nb_iter <- 1

df <- df %>% filter(project != "fate") %>%
    group_by(project, campaign_id, cluster_name) %>%
    filter(n() >= nb_of_sampling) %>%
    ungroup()

df_pvalues <- df %>%
  group_by(project, cluster_name) %>%
  summarise(
    weibull_sum = 0,
    gamma_sum = 0,
    lnorm_sum = 0,
    norm_sum = 0,
    frechet_sum = 0,
  ) %>% crossing(tibble(cluster = seq(1, k)))

for (i in 1:nb_iter) {
    df_sample <- df %>%
        group_by(project, campaign_id, cluster_name) %>%
        filter(job_id %in% sample(job_id, nb_of_sampling)) %>%
        mutate(duration_num = as.numeric(duration)) %>%
        summarise(samples_l = list(duration_num)) %>%
        ungroup() %>%
        mutate(
            cluster = get_clusters(samples_l, k)
        ) %>%
        group_by(project, cluster_name, cluster) %>%
        summarise(
            all_samples = list(reduce(flatten(list(samples_l)), c, .init = c())),
            n = length(unlist(all_samples)),

    # Weibull Distibution -----------------------------------------------------

            shape_w = (fitdist(unlist(all_samples), "weibull"))$estimate[1],
            scale_w = (fitdist(unlist(all_samples), "weibull"))$estimate[2],

            weibull = cvm.test(unlist(all_samples), "pweibull", shape = shape_w, scale = scale_w)$p.value,

    # Gamma Distribution ------------------------------------------------------

            shape_g = egamma(unlist(all_samples))$parameters[1],
            scale_g = egamma(unlist(all_samples))$parameters[2],

            gamma = cvm.test(unlist(all_samples), "pgamma", shape = shape_g, rate = 1/scale_g)$p.value,

    # LogNormal Distribution --------------------------------------------------

            shape_l = (fitdist(unlist(all_samples), "lnorm"))$estimate[1],
            scale_l = (fitdist(unlist(all_samples), "lnorm"))$estimate[2],

            lnorm = cvm.test(unlist(all_samples), "plnorm", shape_l, scale_l)$p.value,

    # Normal Distribution -----------------------------------------------------

            shape_n = (fitdist(unlist(all_samples), "norm"))$estimate[1],
            scale_n = (fitdist(unlist(all_samples), "norm"))$estimate[2],

            norm = cvm.test(unlist(all_samples), "pnorm", shape_n, scale_n)$p.value,

    # Frechet Distribution ----------------------------------------------------

            shape_f = eweibull((unlist(map(all_samples, function(x) {1/x}))))$parameters[1],
            scale_f = eweibull((unlist(map(all_samples, function(x) {1/x}))))$parameters[2],

            frechet = cvm.test(unlist(all_samples), "pfrechet", shape = shape_f, scale = 1 / scale_f)$p.value,
        ) %>%
        dplyr::select(project, cluster_name, cluster, weibull, gamma, lnorm, norm, frechet) %>%
        group_by(project, cluster_name) %>%
        arrange(cluster) %>%
        mutate(cluster = row_number()) %>%
        ungroup()

    df_pvalues <- df_pvalues %>%
      left_join(df_sample, by = c("project", "cluster_name", "cluster")) %>%
      mutate(
        weibull_sum = weibull_sum + weibull,
        gamma_sum = gamma_sum + gamma,
        lnorm_sum = lnorm_sum + lnorm,
        norm_sum = norm_sum + norm,
        frechet_sum = frechet_sum + frechet
      ) %>%
      dplyr::select(-weibull, -gamma, - lnorm, -norm, -frechet)
}

df_pvalues <- df_pvalues %>%
  mutate(
    Weibull = weibull_sum / nb_iter,
    Gamma = gamma_sum / nb_iter,
    Lnorm = lnorm_sum / nb_iter,
    Norm = norm_sum / nb_iter,
    Frechet = frechet_sum / nb_iter,
  ) %>%
  dplyr::select(project, cluster_name, cluster, Weibull, Gamma, Lnorm, Norm, Frechet) %>%
  drop_na(Weibull, Gamma, Lnorm, Norm, Frechet)

df_pvalues
df_pvalues %>%
    melt(id.vars = c("project", "cluster_name", "cluster")) %>%
    ggplot(aes(x = factor(cluster), y = value, fill = factor(variable))) +
    geom_col(position = "dodge") +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
    facet_grid(project ~ cluster_name, scale = "free_x") +
    theme_bw() +
    ggtitle("Model per project per cluster") +
    xlab("#Cluster") +
    ylab("p-value") +
    ylim(0, 1) +
    scale_fill_discrete(name = "Distribution") +
    theme(legend.position = "bottom", legend.box = "horizontal")

df_pvalues %>%
    melt(id.vars = c("project", "cluster_name", "cluster")) %>%
    group_by(project, cluster_name, cluster) %>%
    summarise(
      max = max(value),
      type = variable[which.max(value)]
    ) %>%
    ggplot(aes(x = factor(cluster), y = max, fill = factor(type))) +
    geom_col() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "black") +
    facet_grid(project ~ cluster_name, scale = "free_x") +
    theme_bw() +
    ggtitle("Model per project per cluster") +
    xlab("#Cluster") +
    ylab("p-value") +
    ylim(0, 1) +
    scale_fill_discrete(name = "Distribution") +
    theme(legend.position = "bottom", legend.box = "horizontal")

df_pvalues %>%
  melt(id.vars = c("project", "cluster_name", "cluster")) %>%
  group_by(project, cluster_name, cluster) %>%
  summarise(
    max = max(value),
    type = variable[which.max(value)]
  )