Load Libraries
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
Load Data
kronos_strelka_raw <- read_tsv("../1-get_kronos_times/strelka_pipeline/times.tsv")
kronos_titan_raw <- read_tsv("../1-get_kronos_times/titan_pipeline/times.tsv")
kronos_sequenza_raw <- read_tsv("../1-get_kronos_times/sequenza_pipeline2/times.tsv")
galaxy_raw <- read_tsv("galaxy.tsv")
Tidy Kronos Data
# Function to melt and tidy
prep <- function(df){
df_m <- df %>%
gather("sample", "ktime", -step) %>%
separate(sample, c("sample", "pipeline", "extra"), sep = "_") %>%
select(pipeline, sample, step, ktime)
return(df_m)
}
# Function for normalize step names
normalize <- function(names) {
change_name <- function(name) {
if (name == "TASK_MUTATIONSEQ") { return("mutationseq") }
if (name == "TASK_VCF2MAF_SNVS_MUTATIONSEQ") { return("vcf2maf") }
if (name == "TASK_CLIPOVERLAP_TUMOUR,TASK_CLIPOVERLAP_NORMAL") { return("preprocess") }
if (name == "TASK_SAMTOOLS_INDEX_TUMOUR,TASK_SAMTOOLS_INDEX_NORMAL") { return("NA") }
if (name == "TASK_STRELKA") { return("strelka") }
if (name == "TASK_VCF2MAF_SNVS,TASK_VCF2MAF_INDELS") { return("vcf2maf") }
if (name == "TASK_CONVERT_VCF2COUNTS") { return("transform_vcf_to_counts") }
if (name == "TASK_READCOUNTER_TUMOUR,TASK_READCOUNTER_NORMAL") { return("read_counter") }
if (name == "TASK_CALC_CORRECTREADS") { return("correct_read_counter") }
if (name == "TASK_TITAN") { return("titan") }
if (name == "TASK_CALC_CNSEGMENTS") { return("calc_cnsegments") }
if (name == "TASK_SEQUENZA_BAM2SEQZ") { return("create_seqz_file") }
if (name == "TASK_SEQUENZA_SEQZBINNING") { return("create_seqz_file") }
if (name == "TASK_SEQUENZA_ANALYSIS") { return("sequenza_pipeline") }
}
names_norm <- sapply(names, change_name)
return(names_norm)
}
# Prepare Kronos data and combine into one df
kronos_strelka <- prep(kronos_strelka_raw)
kronos_titan <- prep(kronos_titan_raw)
kronos_sequenza <- prep(kronos_sequenza_raw)
kronos <- bind_rows(kronos_strelka, kronos_titan, kronos_sequenza)
# Tidy and normalize Kronos data
kronos <- kronos %>%
# Split MutationSeq steps into their own pipeline
mutate(pipeline = ifelse(grepl("strelka", pipeline) & grepl("MUTATIONSEQ", step), "mutationseq", pipeline)) %>%
# Normalize step names (to match Galaxy's names)
mutate(step = normalize(step)) %>%
# Remove any unpaired steps
filter(step != "NA") %>%
# Sum up related steps
group_by(pipeline, sample, step) %>%
summarise(ktime = sum(ktime)) %>%
ungroup() %>%
# Sort
arrange(pipeline, sample, step)
Tidy Galaxy Data
# Tidy Galaxy data
galaxy <- galaxy_raw %>%
# Restrict to sequential data
filter(Mode == "Sequential", !(Tool %in% c("fetch_interval", "merge", "merge_maf_collection"))) %>%
# Select same columns as Kronos
select(Workflow, Sample, Tool, max_time) %>%
# Rename them the same way
rename(pipeline = Workflow, sample = Sample, step = Tool, gtime = max_time) %>%
# Some quick normalizing
mutate(pipeline = tolower(pipeline)) %>%
# Sort
arrange(pipeline, sample, step)
Find Missing Galaxy Data
# Join all rows
missing <- left_join(kronos, galaxy, by = c("pipeline", "sample", "step")) %>%
# Filter for rows with missing Galaxy runtimes
filter(is.na(gtime))
print(missing)
Source: local data frame [4 x 5]
pipeline sample step ktime gtime
(chr) (chr) (chr) (int) (int)
1 mutationseq LS1668 mutationseq 16224 NA
2 mutationseq LS1668 vcf2maf 1288 NA
3 mutationseq LS2264 mutationseq 17394 NA
4 mutationseq LS2264 vcf2maf 717 NA
Merge Kronos and Galaxy Data
# Perform an inner join, i.e., rows that exist in both datasets
data <- inner_join(kronos, galaxy, by = c("pipeline", "sample", "step")) %>%
# Rename times as systems
rename(galaxy = gtime, kronos = ktime) %>%
# Re-melt data for plot-friendly data frame
gather("system", "time", galaxy, kronos)
Plot Runtimes
p <- ggplot(data) +
# Facet the plotting according to pipelines
facet_grid(. ~ pipeline, scales = "free_x", space = "free") +
# Use boxplots for show variance of runtimes
geom_boxplot(aes(x = step, y = time, fill = system)) +
# Rotate X-axis labels
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("plot.pdf", width = 8.5, height = 11)
p
