Configuration

Volume

{
    "AvailabilityZone": "us-east-1a", 
    "Attachments": [
        {
            "AttachTime": "2018-02-07T15:13:07.000Z", 
            "InstanceId": "i-09a31c8d356966248", 
            "VolumeId": "vol-0d8d10c68176aa97b", 
            "State": "attached", 
            "DeleteOnTermination": false, 
            "Device": "/dev/sdf"
        }
    ], 
    "Encrypted": false, 
    "VolumeType": "gp2", 
    "VolumeId": "vol-0d8d10c68176aa97b", 
    "State": "in-use", 
    "Iops": 900, 
    "SnapshotId": "", 
    "CreateTime": "2018-01-19T20:14:49.325Z", 
    "Size": 300
}

ec2 instance

{
  "IamFleetRole": "arn:aws:iam::385009899373:role/aws-ec2-spot-fleet-tagging-role",
  "AllocationStrategy": "lowestPrice",
  "TargetCapacity": 1,
  "SpotPrice": "0.6",
  "ValidFrom": "2018-01-15T18:27:00Z",
  "ValidUntil": "2019-01-15T18:27:00Z",
  "TerminateInstancesWithExpiration": true,
  "LaunchSpecifications": [
    {
      "ImageId": "ami-3480df4e",
      "InstanceType": "c4.4xlarge",
      "SubnetId": "subnet-d2d5c7a4",
      "KeyName": "CellProfiler",
      "SpotPrice": "0.5",
      "IamInstanceProfile": {
        "Arn": "arn:aws:iam::385009899373:instance-profile/s3-imaging-platform-role"
      },
      "BlockDeviceMappings": [
        {
          "DeviceName": "/dev/sda1",
          "Ebs": {
            "DeleteOnTermination": true,
            "VolumeType": "gp2",
            "VolumeSize": 60,
            "SnapshotId": "snap-0d61c49c2c8ecee7a"
          }
        }
      ],
      "SecurityGroups": [
        {
          "GroupId": "sg-2a88ab51"
        },
        {
          "GroupId": "sg-74b99a0f"
        }
      ],
      "TagSpecifications": [
        {
          "ResourceType": "instance",
          "Tags": [
            {
              "Key": "Name",
              "Value": "Shantanu-cytotools"
            }
          ]
        }
      ]
    }
  ],
  "Type": "request"
}

Software

CP 2.2.1 Pipeline

Setup

  • All images were copied to the 300 GB EBS volume and processed, per well, using GNU parallel, across 16 cores.

Parallel config

skopy

parallel \
  --no-run-if-empty \
  --eta \
  --results ../../log/${BATCH_ID}/skopy-analysis/{/.} \
  --joblog ../../log/${BATCH_ID}/skopy-analysis.log \
  --keep-order \
  -a ../../scratch/ljosa_2013/skopy_commands_analysis.txt

CellProfiler 2.2.1

parallel \
  --no-run-if-empty \
  --eta \
  --results ../../log/${BATCH_ID}/analysis/{/.} \
  --joblog ../../log/${BATCH_ID}/analysis.log \
  --keep-order \
  -a ../../scratch/${BATCH_ID}/cp_docker_commands_analysis.txt
library(glue)
library(magrittr)
library(stringr)
library(tidyverse)
library(corrplot)
# Identifier of the skopy revision whose timings are analysed (presumably an
# abbreviated git commit hash). It is interpolated into the file name of the
# skopy timing log that is read further down.
skopy_version <- "31f62607"
# Load the timing log of the CellProfiler run (presumably the GNU parallel
# --joblog output described under "Parallel config").
cp_log <- read_tsv("timing/cp-221-analysis.log")

# The plate/well identifier is embedded in each command line: split the command
# on "/status_dir/", take the second piece and cut it at its first ".". This is
# done for all rows at once; the identifier is then split on "-" into plate and
# well, and only those two keys plus the per-job runtime are kept.
cp_log <-
  cp_log %>%
  mutate(
    pws = Command %>%
      str_split("/status_dir/") %>%
      vapply(extract2, character(1), 2) %>%
      str_split("\\.") %>%
      vapply(extract2, character(1), 1)
  ) %>%
  separate(pws, into = c("Metadata_Plate", "Metadata_Well"), sep = "-") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)
# Load the timing log of the skopy run; the file name carries the skopy version.
skopy_log <- read_tsv(
  glue("timing/skopy-{version}-analysis.log", version = skopy_version)
)

# For every command, keep the part before the first " && " and, within it, the
# text that follows "features_". That text is "_"-separated: its first two
# fields together form the plate name, the third is the well, and any further
# fields are discarded.
skopy_log <-
  skopy_log %>%
  mutate(
    pws = Command %>%
      str_split(" && ") %>%
      vapply(extract2, character(1), 1) %>%
      str_split("features_") %>%
      vapply(extract2, character(1), 2)
  ) %>%
  separate(
    pws,
    into = c("Metadata_Plate1", "Metadata_Plate2", "Metadata_Well"),
    sep = "_",
    extra = "drop"
  ) %>%
  unite(Metadata_Plate, Metadata_Plate1, Metadata_Plate2, remove = TRUE, sep = "_") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)
# Pair the two tools' runtimes per (plate, well). Wells that appear in only one
# of the logs are dropped by the inner join; the two JobRuntime columns are
# told apart by a per-tool suffix.
full_log <-
  skopy_log %>%
  inner_join(
    cp_log,
    by = c("Metadata_Plate", "Metadata_Well"),
    suffix = c("_skopy", "_cp")
  )

# Longest runtime observed for either tool; used as the common upper limit of
# both axes in the runtime scatter plot.
maxtime <- max(full_log$JobRuntime_skopy, full_log$JobRuntime_cp)
# Per-well runtime comparison between skopy and CellProfiler. Every title takes
# its well count from `full_log` instead of a typed-in number, so the reported
# n always matches the rows that are actually plotted.

# skopy vs. CellProfiler runtime per well, hex-binned, with the y = x line
# dashed in red. Both axes run from 0 to `maxtime` at a 1:1 aspect ratio.
ggplot(full_log, aes(JobRuntime_skopy, JobRuntime_cp)) +
  geom_hex(binwidth = 20) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color = "red", alpha = 0.5) +
  xlim(0, maxtime) +
  ylim(0, maxtime) +
  coord_equal() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Distribution of the per-well runtime, one box per tool.
full_log %>%
  gather(sw, time, -Metadata_Plate, -Metadata_Well) %>%
  ggplot(aes(sw, time)) +
  geom_boxplot() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Per-well speedup, defined as CellProfiler runtime / skopy runtime - 1 and
# shown as a percentage: histogram first, empirical CDF second.
ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  geom_histogram(binwidth = .1) +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  stat_ecdf() +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

# Sum of all per-well runtimes per tool, converted from seconds to hours: the
# time the whole batch would take if the wells ran one after another.
full_log %>%
  summarise_at(c("JobRuntime_cp", "JobRuntime_skopy"), sum) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle(paste0("Estimated run time on a single core (n = ", nrow(full_log), ")"))

# Read both timing logs again in full: `cp_log` and `skopy_log` were reduced to
# plate, well and runtime earlier on, but the wall-time estimate also needs the
# `Seq` and `Starttime` columns.
cp_log <- read_tsv(file = "timing/cp-221-analysis.log")
skopy_log <- read_tsv(
  file = glue("timing/skopy-{version}-analysis.log", version = skopy_version)
)
# Estimate the wall-clock duration of a complete run from its job log.
#
# runlog: data frame with one row per job and the columns `Seq` (job sequence
#   number), `Starttime` and `JobRuntime` (both in seconds).
# Returns the number of seconds between the earliest job start and the latest
# job finish. The runtime of the jobs that are still running after the last job
# has been started is therefore included in the estimate.
# Stops with an error if the log is empty or if the sequence numbers are not
# exactly 1..n (e.g. a truncated or concatenated log).
get_wall_time <- function(runlog) {
  n_jobs <- nrow(runlog)
  stopifnot(n_jobs > 0)

  # na.last = TRUE keeps missing sequence numbers so that they fail the check.
  seq_ids <- sort(runlog[["Seq"]], na.last = TRUE)
  stopifnot(all(seq_ids == seq_len(n_jobs)))

  started <- runlog[["Starttime"]]
  finished <- started + runlog[["JobRuntime"]]

  max(finished) - min(started)
}
# Wall-clock duration of each run in hours, one bar per tool.
# Every component, the title included, has to be chained with `+`: a title left
# as a separate expression is not attached to the plot and is merely printed.
tribble(
  ~cp, ~skopy,
  get_wall_time(cp_log), get_wall_time(skopy_log)
) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle("Estimated run time on a 16 cores, across all n = 632 wells")
$title
[1] "Estimated run time on a 16 cores, across all n = 632 wells"

$subtitle
NULL

attr(,"class")
[1] "labels"
# Overall speedup of skopy over CellProfiler, computed from the two wall-clock
# durations as cp / skopy - 1 and drawn as a single bar on a percent scale.
# `tibble()` is used because `data_frame()` is deprecated.
# NOTE(review): the y scale is limited to 0-100 %; a speedup outside that range
# would presumably not be drawn - confirm the limits still fit the data.
tibble(
  wall_time_speedup = get_wall_time(cp_log) / get_wall_time(skopy_log) - 1
) %>%
  ggplot(aes("-", wall_time_speedup)) +
  geom_bar(stat = "identity") +
  xlab("") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  ggtitle("Estimated speedup on 16 cores, across all n = 632 wells")

---
title: "skopy timing on BBBC021"
output: html_notebook
---

# Configuration

## Volume

```
{
    "AvailabilityZone": "us-east-1a", 
    "Attachments": [
        {
            "AttachTime": "2018-02-07T15:13:07.000Z", 
            "InstanceId": "i-09a31c8d356966248", 
            "VolumeId": "vol-0d8d10c68176aa97b", 
            "State": "attached", 
            "DeleteOnTermination": false, 
            "Device": "/dev/sdf"
        }
    ], 
    "Encrypted": false, 
    "VolumeType": "gp2", 
    "VolumeId": "vol-0d8d10c68176aa97b", 
    "State": "in-use", 
    "Iops": 900, 
    "SnapshotId": "", 
    "CreateTime": "2018-01-19T20:14:49.325Z", 
    "Size": 300
}
```

## ec2 instance
```
{
  "IamFleetRole": "arn:aws:iam::385009899373:role/aws-ec2-spot-fleet-tagging-role",
  "AllocationStrategy": "lowestPrice",
  "TargetCapacity": 1,
  "SpotPrice": "0.6",
  "ValidFrom": "2018-01-15T18:27:00Z",
  "ValidUntil": "2019-01-15T18:27:00Z",
  "TerminateInstancesWithExpiration": true,
  "LaunchSpecifications": [
    {
      "ImageId": "ami-3480df4e",
      "InstanceType": "c4.4xlarge",
      "SubnetId": "subnet-d2d5c7a4",
      "KeyName": "CellProfiler",
      "SpotPrice": "0.5",
      "IamInstanceProfile": {
        "Arn": "arn:aws:iam::385009899373:instance-profile/s3-imaging-platform-role"
      },
      "BlockDeviceMappings": [
        {
          "DeviceName": "/dev/sda1",
          "Ebs": {
            "DeleteOnTermination": true,
            "VolumeType": "gp2",
            "VolumeSize": 60,
            "SnapshotId": "snap-0d61c49c2c8ecee7a"
          }
        }
      ],
      "SecurityGroups": [
        {
          "GroupId": "sg-2a88ab51"
        },
        {
          "GroupId": "sg-74b99a0f"
        }
      ],
      "TagSpecifications": [
        {
          "ResourceType": "instance",
          "Tags": [
            {
              "Key": "Name",
              "Value": "Shantanu-cytotools"
            }
          ]
        }
      ]
    }
  ],
  "Type": "request"
}
```

## Software
- docker image of CellProfiler 2.2.1 shntnu/cellprofiler:2.2.1 https://hub.docker.com/r/shntnu/cellprofiler/tags/
- skopy https://github.com/broadinstitute/skopy/tree/31f626078509032f453b338f993a207efa562656/
- GNU parallel v20180122

## CP 2.2.1 Pipeline
- https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/bbbc021_mcf7_20x_imagexpress/analysis.cppipe
Processing was grouped by Metadata_Plate, Metadata_Well. 632 groups in total

## Setup
- All images were copied to the 300 GB EBS volume and processed, per well, using GNU parallel, across 16 cores.

## Parallel config

### skopy 
```
parallel \
  --no-run-if-empty \
  --eta \
  --results ../../log/${BATCH_ID}/skopy-analysis/{/.} \
  --joblog ../../log/${BATCH_ID}/skopy-analysis.log \
  --keep-order \
  -a ../../scratch/ljosa_2013/skopy_commands_analysis.txt
```

### CellProfiler 2.2.1
```
parallel \
  --no-run-if-empty \
  --eta \
  --results ../../log/${BATCH_ID}/analysis/{/.} \
  --joblog ../../log/${BATCH_ID}/analysis.log \
  --keep-order \
  -a ../../scratch/${BATCH_ID}/cp_docker_commands_analysis.txt
```

```{r message=FALSE}
library(glue)
library(magrittr)
library(stringr)
library(tidyverse)
library(corrplot)
```


```{r}
# Identifier of the skopy revision whose timings are analysed (presumably an
# abbreviated git commit hash). It is interpolated into the file name of the
# skopy timing log that is read further down.
skopy_version <- "31f62607"
```


```{r message=FALSE}
# Load the timing log of the CellProfiler run (presumably the GNU parallel
# --joblog output described under "Parallel config").
cp_log <- read_tsv("timing/cp-221-analysis.log")

# The plate/well identifier is embedded in each command line: split the command
# on "/status_dir/", take the second piece and cut it at its first ".". This is
# done for all rows at once; the identifier is then split on "-" into plate and
# well, and only those two keys plus the per-job runtime are kept.
cp_log <-
  cp_log %>%
  mutate(
    pws = Command %>%
      str_split("/status_dir/") %>%
      vapply(extract2, character(1), 2) %>%
      str_split("\\.") %>%
      vapply(extract2, character(1), 1)
  ) %>%
  separate(pws, into = c("Metadata_Plate", "Metadata_Well"), sep = "-") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)

# Load the timing log of the skopy run; the file name carries the skopy version.
skopy_log <- read_tsv(
  glue("timing/skopy-{version}-analysis.log", version = skopy_version)
)

# For every command, keep the part before the first " && " and, within it, the
# text that follows "features_". That text is "_"-separated: its first two
# fields together form the plate name, the third is the well, and any further
# fields are discarded.
skopy_log <-
  skopy_log %>%
  mutate(
    pws = Command %>%
      str_split(" && ") %>%
      vapply(extract2, character(1), 1) %>%
      str_split("features_") %>%
      vapply(extract2, character(1), 2)
  ) %>%
  separate(
    pws,
    into = c("Metadata_Plate1", "Metadata_Plate2", "Metadata_Well"),
    sep = "_",
    extra = "drop"
  ) %>%
  unite(Metadata_Plate, Metadata_Plate1, Metadata_Plate2, remove = TRUE, sep = "_") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)

# Pair the two tools' runtimes per (plate, well). Wells that appear in only one
# of the logs are dropped by the inner join; the two JobRuntime columns are
# told apart by a per-tool suffix.
full_log <-
  skopy_log %>%
  inner_join(
    cp_log,
    by = c("Metadata_Plate", "Metadata_Well"),
    suffix = c("_skopy", "_cp")
  )

# Longest runtime observed for either tool; used as the common upper limit of
# both axes in the runtime scatter plot.
maxtime <- max(full_log$JobRuntime_skopy, full_log$JobRuntime_cp)

# Per-well runtime comparison between skopy and CellProfiler. Every title takes
# its well count from `full_log` instead of a typed-in number, so the reported
# n always matches the rows that are actually plotted.

# skopy vs. CellProfiler runtime per well, hex-binned, with the y = x line
# dashed in red. Both axes run from 0 to `maxtime` at a 1:1 aspect ratio.
ggplot(full_log, aes(JobRuntime_skopy, JobRuntime_cp)) +
  geom_hex(binwidth = 20) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, color = "red", alpha = 0.5) +
  xlim(0, maxtime) +
  ylim(0, maxtime) +
  coord_equal() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Distribution of the per-well runtime, one box per tool.
full_log %>%
  gather(sw, time, -Metadata_Plate, -Metadata_Well) %>%
  ggplot(aes(sw, time)) +
  geom_boxplot() +
  ggtitle(paste0("Run time per well (n = ", nrow(full_log), ")"))

# Per-well speedup, defined as CellProfiler runtime / skopy runtime - 1 and
# shown as a percentage: histogram first, empirical CDF second.
ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  geom_histogram(binwidth = .1) +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

ggplot(full_log, aes(JobRuntime_cp / JobRuntime_skopy - 1)) +
  scale_x_continuous(labels = scales::percent) +
  xlab("speedup") +
  stat_ecdf() +
  ggtitle(paste0("Speedup per well (n = ", nrow(full_log), ")"))

# Sum of all per-well runtimes per tool, converted from seconds to hours: the
# time the whole batch would take if the wells ran one after another.
full_log %>%
  summarise_at(c("JobRuntime_cp", "JobRuntime_skopy"), sum) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle(paste0("Estimated run time on a single core (n = ", nrow(full_log), ")"))

# Read both timing logs again in full: `cp_log` and `skopy_log` were reduced to
# plate, well and runtime earlier on, but the wall-time estimate also needs the
# `Seq` and `Starttime` columns.
cp_log <- read_tsv(file = "timing/cp-221-analysis.log")

skopy_log <- read_tsv(
  file = glue("timing/skopy-{version}-analysis.log", version = skopy_version)
)

# Estimate the wall-clock duration of a complete run from its job log.
#
# runlog: data frame with one row per job and the columns `Seq` (job sequence
#   number), `Starttime` and `JobRuntime` (both in seconds).
# Returns the number of seconds between the earliest job start and the latest
# job finish. The runtime of the jobs that are still running after the last job
# has been started is therefore included in the estimate.
# Stops with an error if the log is empty or if the sequence numbers are not
# exactly 1..n (e.g. a truncated or concatenated log).
get_wall_time <- function(runlog) {
  n_jobs <- nrow(runlog)
  stopifnot(n_jobs > 0)

  # na.last = TRUE keeps missing sequence numbers so that they fail the check.
  seq_ids <- sort(runlog[["Seq"]], na.last = TRUE)
  stopifnot(all(seq_ids == seq_len(n_jobs)))

  started <- runlog[["Starttime"]]
  finished <- started + runlog[["JobRuntime"]]

  max(finished) - min(started)
}

# Wall-clock duration of each run in hours, one bar per tool.
# Every component, the title included, has to be chained with `+`: a title left
# as a separate expression is not attached to the plot and is merely printed.
tribble(
  ~cp, ~skopy,
  get_wall_time(cp_log), get_wall_time(skopy_log)
) %>%
  gather(sw, time) %>%
  ggplot(aes(sw, time / 3600)) +
  ylab("hours") +
  geom_bar(stat = "identity") +
  ggtitle("Estimated run time on a 16 cores, across all n = 632 wells")

# Overall speedup of skopy over CellProfiler, computed from the two wall-clock
# durations as cp / skopy - 1 and drawn as a single bar on a percent scale.
# `tibble()` is used because `data_frame()` is deprecated.
# NOTE(review): the y scale is limited to 0-100 %; a speedup outside that range
# would presumably not be drawn - confirm the limits still fit the data.
tibble(
  wall_time_speedup = get_wall_time(cp_log) / get_wall_time(skopy_log) - 1
) %>%
  ggplot(aes("-", wall_time_speedup)) +
  geom_bar(stat = "identity") +
  xlab("") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  ggtitle("Estimated speedup on 16 cores, across all n = 632 wells")

```

