CellProfiler 2.2.1
parallel
--no-run-if-empty \
--eta \
--results ../../log/${BATCH_ID}/analysis/{/.} \
--joblog ../../log/${BATCH_ID}/analysis.log \
--keep-order \
-a ../../scratch/${BATCH_ID}/cp_docker_commands_analysis.txt
library(glue)
library(magrittr)
library(stringr)
library(tidyverse)
library(corrplot)
# Identifier of the skopy build whose timing log is analysed; it is
# interpolated into the skopy log file name. (Looks like a short commit hash --
# TODO confirm.)
skopy_version <- "31f62607"

# Per-job timing log of the CellProfiler 2.2.1 run (tab-separated).
cp_log <- read_tsv("timing/cp-221-analysis.log")

# Each Command is expected to contain "/status_dir/" right before the
# "<plate>-<well>" key. Stop early on malformed rows instead of silently
# producing empty keys.
stopifnot(all(str_detect(cp_log$Command, "/status_dir/")))

# Derive Metadata_Plate / Metadata_Well from the Command column and keep only
# the join keys plus the per-job runtime.
cp_log %<>%
  mutate(
    # Text after the first "/status_dir/" (and before a second one, if any).
    pws = str_split_fixed(Command, "/status_dir/", n = 3)[, 2],
    # Cut at the first "." so only the "<plate>-<well>" part remains.
    pws = str_split_fixed(pws, "\\.", n = 2)[, 1]
  ) %>%
  separate(pws, into = c("Metadata_Plate", "Metadata_Well"), sep = "-") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)
# Per-job timing log of the skopy run for the build named by `skopy_version`.
skopy_log <- read_tsv(glue("timing/skopy-{version}-analysis.log", version = skopy_version))

# Only the first shell command (everything before the first " && ") carries
# the well key.
skopy_log %<>%
  mutate(first_cmd = str_split_fixed(Command, " && ", n = 2)[, 1])

# The key follows the "features_" marker; stop early on malformed rows instead
# of silently producing empty keys.
stopifnot(all(str_detect(skopy_log$first_cmd, "features_")))

# Derive Metadata_Plate / Metadata_Well and keep only the join keys plus the
# per-job runtime.
skopy_log %<>%
  mutate(
    # Text after the first "features_" (and before a second one, if any).
    pws = str_split_fixed(first_cmd, "features_", n = 3)[, 2]
  ) %>%
  # The first two "_"-separated pieces form the plate name, the third is the
  # well; anything after that is discarded.
  separate(
    pws,
    into = c("Metadata_Plate1", "Metadata_Plate2", "Metadata_Well"),
    sep = "_",
    extra = "drop"
  ) %>%
  unite(Metadata_Plate, Metadata_Plate1, Metadata_Plate2, remove = TRUE, sep = "_") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)
# Pair the two timing tables well by well. Only wells present in both logs
# survive; the runtime columns become JobRuntime_skopy and JobRuntime_cp.
full_log <- skopy_log %>%
  inner_join(
    cp_log,
    by = c("Metadata_Plate", "Metadata_Well"),
    suffix = c("_skopy", "_cp")
  )
# Largest single-job runtime across every non-key column of full_log; used as
# the common upper limit for both axes of the scatter plot.
maxtime <- full_log %>%
  select(-Metadata_Plate, -Metadata_Well) %>%
  unlist() %>%
  max()
# Hex-binned scatter of per-well runtime, skopy (x) against CellProfiler (y).
# Both axes share the range [0, maxtime] and an equal aspect ratio, so the
# dashed red identity line marks wells where the two runtimes are equal.
# The well count in the title is computed from the data rather than hard-coded.
ggplot(full_log, aes(JobRuntime_skopy, JobRuntime_cp)) +
  geom_hex(binwidth = 20) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color = "red", alpha = 0.5) +
  xlim(0, maxtime) +
  ylim(0, maxtime) +
  coord_equal() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Box plot of per-well runtime, one box per software (runtime column).
# The well count in the title is computed from the data rather than hard-coded.
full_log %>%
  gather(sw, time, -Metadata_Plate, -Metadata_Well) %>%
  ggplot(aes(sw, time)) +
  geom_boxplot() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Histogram of the per-well speedup, defined as cp runtime / skopy runtime - 1
# and shown as a percentage in bins of 10 percentage points.
# The well count in the title is computed from the data rather than hard-coded.
ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  geom_histogram(binwidth = .1) +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

# Empirical cumulative distribution of the per-well speedup
# (cp runtime / skopy runtime - 1), shown as a percentage.
# The well count in the title is computed from the data rather than hard-coded.
ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  stat_ecdf() +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

# Total runtime per software: the sum of all per-well runtimes, i.e. the time
# the whole batch would take if jobs ran one after another. Runtimes are in
# seconds, hence the division by 3600 for hours.
# The well count in the title is computed from the data rather than hard-coded.
full_log %>%
  summarise_at(c("JobRuntime_cp", "JobRuntime_skopy"), sum) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle(paste0("Estimated run time on a single core (n = ", nrow(full_log), ")"))

# Load the raw job logs again: the wall-time computation below needs the Seq
# and Starttime columns, which the per-well tables above no longer carry.
cp_log <- "timing/cp-221-analysis.log" %>%
  read_tsv()
skopy_log <- glue("timing/skopy-{version}-analysis.log", version = skopy_version) %>%
  read_tsv()
#' Wall-clock duration of a batch run, computed from its job log.
#'
#' The duration spans from the earliest job start to the latest job finish
#' (`Starttime + JobRuntime`), so the runtime of the jobs still running at the
#' end of the batch is included.
#'
#' @param runlog Data frame with one row per job and numeric columns `Seq`,
#'   `Starttime` and `JobRuntime` (presumably a GNU parallel `--joblog` file --
#'   confirm against the run command).
#' @return A single number: elapsed time in the units of `Starttime`
#'   (callers divide by 3600 to obtain hours, i.e. seconds are expected).
get_wall_time <- function(runlog) {
  stopifnot(all(c("Seq", "Starttime", "JobRuntime") %in% names(runlog)))

  n_jobs <- nrow(runlog)
  stopifnot(n_jobs > 0)

  # The log must be complete: sequence numbers are exactly 1..n with no gaps
  # or duplicates. NAs are kept so that they make the check fail.
  stopifnot(all(sort(runlog[["Seq"]], na.last = TRUE) == seq_len(n_jobs)))

  started <- runlog[["Starttime"]]
  finished <- started + runlog[["JobRuntime"]]
  max(finished) - min(started)
}
# Bar chart of the wall-clock time of each batch run, in hours, as computed by
# get_wall_time() from the raw job logs.
# The title must be added with `+`: as a separate statement it is evaluated on
# its own and never reaches the plot.
tribble(
  ~cp, ~skopy,
  get_wall_time(cp_log), get_wall_time(skopy_log)
) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle("Estimated run time on 16 cores, across all n = 632 wells")
# Single bar showing the overall wall-clock speedup of skopy over CellProfiler,
# defined as cp wall time / skopy wall time - 1 and shown as a percentage.
# Uses tibble() in place of the deprecated data_frame().
# NOTE(review): the y axis is limited to [0, 1]; a speedup outside 0-100% would
# fall outside the scale limits -- confirm this is intended.
tibble(
  wall_time_speedup = (get_wall_time(cp_log) / get_wall_time(skopy_log)) - 1
) %>%
  ggplot(aes("-", wall_time_speedup)) +
  geom_bar(stat = "identity") +
  xlab("") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  ggtitle("Estimated speedup on 16 cores, across all n = 632 wells")
