Load Libraries

library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)

Load Data

kronos_strelka_raw <- read_tsv("../1-get_kronos_times/strelka_pipeline/times.tsv")
kronos_titan_raw <- read_tsv("../1-get_kronos_times/titan_pipeline/times.tsv")
kronos_sequenza_raw <- read_tsv("../1-get_kronos_times/sequenza_pipeline2/times.tsv")
galaxy_raw <- read_tsv("galaxy.tsv")

Tidy Kronos Data

# Function to melt and tidy
prep <- function(df){
  df_m <- df %>% 
    gather("sample", "ktime", -step) %>%
    separate(sample, c("sample", "pipeline", "extra"), sep = "_") %>%
    select(pipeline, sample, step, ktime)
  return(df_m)
}

# Function for normalize step names
normalize <- function(names) {
  change_name <- function(name) {
    if (name == "TASK_MUTATIONSEQ") { return("mutationseq") }
    if (name == "TASK_VCF2MAF_SNVS_MUTATIONSEQ") { return("vcf2maf") }
    if (name == "TASK_CLIPOVERLAP_TUMOUR,TASK_CLIPOVERLAP_NORMAL") { return("preprocess") }
    if (name == "TASK_SAMTOOLS_INDEX_TUMOUR,TASK_SAMTOOLS_INDEX_NORMAL") { return("NA") }
    if (name == "TASK_STRELKA") { return("strelka") }
    if (name == "TASK_VCF2MAF_SNVS,TASK_VCF2MAF_INDELS") { return("vcf2maf") }
    if (name == "TASK_CONVERT_VCF2COUNTS") { return("transform_vcf_to_counts") }
    if (name == "TASK_READCOUNTER_TUMOUR,TASK_READCOUNTER_NORMAL") { return("read_counter") }
    if (name == "TASK_CALC_CORRECTREADS") { return("correct_read_counter") }
    if (name == "TASK_TITAN") { return("titan") }
    if (name == "TASK_CALC_CNSEGMENTS") { return("calc_cnsegments") }
    if (name == "TASK_SEQUENZA_BAM2SEQZ") { return("create_seqz_file") }
    if (name == "TASK_SEQUENZA_SEQZBINNING") { return("create_seqz_file") }
    if (name == "TASK_SEQUENZA_ANALYSIS") { return("sequenza_pipeline") }
  }
  names_norm <- sapply(names, change_name)
  return(names_norm)
}

# Prepare Kronos data and combine into one df
kronos_strelka <- prep(kronos_strelka_raw)
kronos_titan <- prep(kronos_titan_raw)
kronos_sequenza <- prep(kronos_sequenza_raw)
kronos <- bind_rows(kronos_strelka, kronos_titan, kronos_sequenza)

# Tidy and normalize Kronos data
kronos <- kronos %>%
  # Split MutationSeq steps into their own pipeline
  mutate(pipeline = ifelse(grepl("strelka", pipeline) & grepl("MUTATIONSEQ", step), "mutationseq", pipeline)) %>%
  # Normalize step names (to match Galaxy's names)
  mutate(step = normalize(step)) %>%
  # Remove any unpaired steps
  filter(step != "NA") %>%
  # Sum up related steps
  group_by(pipeline, sample, step) %>%
  summarise(ktime = sum(ktime)) %>%
  ungroup() %>%
  # Sort
  arrange(pipeline, sample, step)

Tidy Galaxy Data

# Tidy Galaxy data
galaxy <- galaxy_raw %>%
  # Restrict to sequential data
  filter(Mode == "Sequential", !(Tool %in% c("fetch_interval", "merge", "merge_maf_collection"))) %>%
  # Select same columns as Kronos
  select(Workflow, Sample, Tool, max_time) %>%
  # Rename them the same way
  rename(pipeline = Workflow, sample = Sample, step = Tool, gtime = max_time) %>%
  # Some quick normalizing
  mutate(pipeline = tolower(pipeline)) %>%
  # Sort
  arrange(pipeline, sample, step)

Find Missing Galaxy Data

# Join all rows
missing <- left_join(kronos, galaxy, by = c("pipeline", "sample", "step")) %>%
  # Filter for rows with missing Galaxy runtimes
  filter(is.na(gtime))
print(missing)
Source: local data frame [4 x 5]

     pipeline sample        step ktime gtime
        (chr)  (chr)       (chr) (int) (int)
1 mutationseq LS1668 mutationseq 16224    NA
2 mutationseq LS1668     vcf2maf  1288    NA
3 mutationseq LS2264 mutationseq 17394    NA
4 mutationseq LS2264     vcf2maf   717    NA

Merge Kronos and Galaxy Data

# Perform an inner join, i.e., rows that exist in both datasets
data <- inner_join(kronos, galaxy, by = c("pipeline", "sample", "step")) %>%
  # Rename times as systems
  rename(galaxy = gtime, kronos = ktime) %>%
  # Re-melt data for plot-friendly data frame
  gather("system", "time", galaxy, kronos)

Plot Runtimes

p <- ggplot(data) +
  # Facet the plotting according to pipelines
  facet_grid(. ~ pipeline, scales = "free_x", space = "free") +
  # Use boxplots for show variance of runtimes
  geom_boxplot(aes(x = step, y = time, fill = system)) +
  # Rotate X-axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("plot.pdf", width = 8.5, height = 11)
p