{
"AvailabilityZone": "us-east-1a",
"Attachments": [
{
"AttachTime": "2018-02-07T15:13:07.000Z",
"InstanceId": "i-09a31c8d356966248",
"VolumeId": "vol-0d8d10c68176aa97b",
"State": "attached",
"DeleteOnTermination": false,
"Device": "/dev/sdf"
}
],
"Encrypted": false,
"VolumeType": "gp2",
"VolumeId": "vol-0d8d10c68176aa97b",
"State": "in-use",
"Iops": 900,
"SnapshotId": "",
"CreateTime": "2018-01-19T20:14:49.325Z",
"Size": 300
}
{
"IamFleetRole": "arn:aws:iam::385009899373:role/aws-ec2-spot-fleet-tagging-role",
"AllocationStrategy": "lowestPrice",
"TargetCapacity": 1,
"SpotPrice": "0.6",
"ValidFrom": "2018-01-15T18:27:00Z",
"ValidUntil": "2019-01-15T18:27:00Z",
"TerminateInstancesWithExpiration": true,
"LaunchSpecifications": [
{
"ImageId": "ami-3480df4e",
"InstanceType": "c4.4xlarge",
"SubnetId": "subnet-d2d5c7a4",
"KeyName": "CellProfiler",
"SpotPrice": "0.5",
"IamInstanceProfile": {
"Arn": "arn:aws:iam::385009899373:instance-profile/s3-imaging-platform-role"
},
"BlockDeviceMappings": [
{
"DeviceName": "/dev/sda1",
"Ebs": {
"DeleteOnTermination": true,
"VolumeType": "gp2",
"VolumeSize": 60,
"SnapshotId": "snap-0d61c49c2c8ecee7a"
}
}
],
"SecurityGroups": [
{
"GroupId": "sg-2a88ab51"
},
{
"GroupId": "sg-74b99a0f"
}
],
"TagSpecifications": [
{
"ResourceType": "instance",
"Tags": [
{
"Key": "Name",
"Value": "Shantanu-cytotools"
}
]
}
]
}
],
"Type": "request"
}
Pipeline: https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/bbbc021_mcf7_20x_imagexpress/analysis.cppipe — processing was grouped by Metadata_Plate and Metadata_Well, giving 632 groups in total.
# Run the analysis jobs with GNU parallel and time the whole batch.
# No command template is given, so each line read from the -a file is executed as the command itself.
#   --no-run-if-empty  skip empty input lines
#   --eta              show an estimate of the remaining time
#   --results          save each job's output under ../../log/${BATCH_ID}/analysis/, named by {/.}
#                      (the input line's basename with its extension removed)
#   --joblog           write one row per job to ../../log/${BATCH_ID}/analysis.log
#   --keep-order       emit job output in input order rather than completion order
time parallel --no-run-if-empty --eta --results ../../log/${BATCH_ID}/analysis/{/.} --joblog ../../log/${BATCH_ID}/analysis.log --keep-order -a ../../scratch/${BATCH_ID}/cp_docker_commands_analysis.txt
real 317m42.417s
user 0m42.691s
sys 1m21.371s
See data/cp-221-analysis.log for details
# Copy the timing columns of every Image.csv below the current directory into a
# mirrored directory tree under /tmp, then stack the copies into a single table.
# Run this from the directory that contains the CellProfiler output tree.

# 1. Record the directory of every Image.csv (relative paths such as ./a/b).
#    Piping find into parallel avoids word-splitting and argument-length limits
#    of backtick substitution.
find . -name "Image.csv" | parallel dirname {} > /tmp/dirs

# 2. Recreate those directories under /tmp. Absolute paths are used instead of
#    `cd /tmp/` so the `find .` in step 3 still searches the original tree.
parallel mkdir -p /tmp/{} :::: /tmp/dirs

# 3. Keep only the plate/well/site metadata, the cell count and the per-module
#    ExecutionTime columns; each reduced file is written to the same relative
#    path under /tmp.
find . -name "Image.csv" | parallel "csvcut -c Metadata_Plate,Metadata_Well,Metadata_Site,Count_Cells,ExecutionTime_01LoadData,ExecutionTime_02CorrectIlluminationApply,ExecutionTime_03IdentifyPrimaryObjects,ExecutionTime_04IdentifySecondaryObjects,ExecutionTime_05IdentifyTertiaryObjects,ExecutionTime_06MeasureObjectSizeShape,ExecutionTime_07MeasureObjectIntensity,ExecutionTime_08MeasureObjectIntensity,ExecutionTime_09MeasureObjectIntensity,ExecutionTime_10MeasureObjectNeighbors,ExecutionTime_11MeasureObjectNeighbors,ExecutionTime_12MeasureObjectNeighbors,ExecutionTime_13MeasureObjectNeighbors,ExecutionTime_14MeasureTexture {} > /tmp/{}"

# 4. Concatenate all reduced files (one shared header) into time.csv in the
#    current directory.
csvstack `find /tmp/ljosa_2013/ -name "Image.csv"` > time.csv
library(magrittr)
library(stringr)
library(tidyverse)
library(corrplot)
# Load the per-image timing table, discard every Metadata_* column, and strip
# the "ExecutionTime_" prefix so columns are named after the module alone.
df <- read_csv("data/cp-221-time.csv")
df <- select(df, -matches("Metadata_"))
names(df) <- names(df) %>% str_replace_all("ExecutionTime_", "")
# Trim extreme timings: in each of 5 passes, find the row holding the maximum
# of every column other than Count_Cells and drop all of those rows from df.
for (i in seq_len(5)) {
  # One row index per column (which.max returns the first maximum on ties).
  # The function is passed directly because funs() is deprecated in dplyr.
  max_rows <-
    df %>%
    summarize_all(which.max) %>%
    select(-Count_Cells) %>%
    unlist(use.names = FALSE)
  # seq_len() stays empty for a 0-row frame, unlike seq(), which gives c(1, 0).
  nonmax <- setdiff(seq_len(nrow(df)), max_rows)
  df <- df[nonmax, ]
}
# Lookup table that marks two modules with is_long = TRUE.
long_module <- tibble(
  module = c("13MeasureObjectNeighbors", "06MeasureObjectSizeShape"),
  is_long = c(TRUE, TRUE)
)
# Reshape to long form: one row per (image row, module) holding that module's
# time, with Count_Cells carried along as an id column.
df2 <- gather(df, module, time, -Count_Cells)
# Natural join on the shared column; modules missing from long_module get
# is_long = NA.
df2 <- left_join(df2, long_module)
Joining, by = "module"
# Rows whose module was not matched by the join with long_module carry
# is_long = NA; turn the column into a plain TRUE/FALSE flag.
df2 %<>% mutate(is_long = !is.na(is_long))
# Box plot of time per module, one facet per is_long value, drawn with the
# module axis flipped. `scales` is spelled out in full rather than relying on
# partial argument matching of `scale`.
p <-
  ggplot(df2, aes(module, time)) +
  geom_boxplot() +
  facet_wrap(~is_long, scales = "free_x", nrow = 2) +
  coord_flip()
p
# Pass the plot explicitly so the saved file does not depend on last_plot().
ggsave("timepermodule.pdf", plot = p, width = 6, height = 6)
# Hex-binned plot of module time against cell count, one panel per module,
# each panel with its own x and y scales.
p <- ggplot(data = df2, mapping = aes(x = Count_Cells, y = time)) +
  geom_hex() +
  facet_wrap(~module, scales = "free")
p
ggsave("scatter.pdf", width = 9, height = 8)
# Write the correlation matrix of all columns of df to a 10 x 10 PDF, shown as
# numbers and ordered by hierarchical clustering.
pdf(file = "cordf.pdf", width = 10, height = 10)
corrplot(corr = cor(x = df), order = "hclust", method = "number")
dev.off()
null device
1
# Same correlation matrix as a 10 x 10 PDF, drawn with ellipses and ordered by
# hierarchical clustering.
pdf(file = "cordf-ellipse.pdf", width = 10, height = 10)
corrplot(corr = cor(x = df), order = "hclust", method = "ellipse")
dev.off()
null device
1
# Draw the ellipse correlation plot on the current graphics device as well.
corrplot(corr = cor(x = df), order = "hclust", method = "ellipse")
# Total module time per well: add the 14 ExecutionTime_* columns for each row
# of the timing table, then sum those totals within each (plate, well).
df1 <-
  read_csv("data/cp-221-time.csv") %>%
  mutate(
    ExecutionTime =
      ExecutionTime_01LoadData +
      ExecutionTime_02CorrectIlluminationApply +
      ExecutionTime_03IdentifyPrimaryObjects +
      ExecutionTime_04IdentifySecondaryObjects +
      ExecutionTime_05IdentifyTertiaryObjects +
      ExecutionTime_06MeasureObjectSizeShape +
      ExecutionTime_07MeasureObjectIntensity +
      ExecutionTime_08MeasureObjectIntensity +
      ExecutionTime_09MeasureObjectIntensity +
      ExecutionTime_10MeasureObjectNeighbors +
      ExecutionTime_11MeasureObjectNeighbors +
      ExecutionTime_12MeasureObjectNeighbors +
      ExecutionTime_13MeasureObjectNeighbors +
      ExecutionTime_14MeasureTexture
  ) %>%
  group_by(Metadata_Plate, Metadata_Well) %>%
  summarise(ExecutionTime = sum(ExecutionTime)) %>%
  ungroup()
# Read the tab-separated job log and recover the plate and well of each job
# from its Command string.
df2 <- read_tsv("data/cp-221-analysis.log")
df2 <-
  df2 %>%
  rowwise() %>%
  mutate(
    # Text after the first "/status_dir/", cut at the first "."; the code below
    # treats it as "<plate>-<well>".
    pws = str_split(
      str_split(Command, "/status_dir/")[[1]][[2]],
      "\\."
    )[[1]][[1]]
  ) %>%
  ungroup() %>%
  separate(pws, into = c("Metadata_Plate", "Metadata_Well"), sep = "-") %>%
  select(Metadata_Plate, Metadata_Well, JobRuntime)
# Pair each well's summed module time (df1) with its logged job runtime (df2).
# The join keys are named explicitly so the result does not depend on which
# column names the two tables happen to share.
df <- inner_join(df1, df2, by = c("Metadata_Plate", "Metadata_Well"))
# Pearson correlation between the two measurements, shown in the plot title.
corval <- with(df, cor(ExecutionTime, JobRuntime))
# Scatter plot with the identity line (slope 1, intercept 0) for reference.
# Both axes are limited to 0-900 with equal scaling; xlim()/ylim() drop points
# outside that range.
ggplot(df, aes(ExecutionTime, JobRuntime)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  xlim(0, 900) +
  ylim(0, 900) +
  coord_equal() +
  ggtitle(paste0("cor = ", corval))
# Linear regression of job runtime on summed module time; print the fit summary.
m <- lm(JobRuntime ~ ExecutionTime, data = df)
summary(m)
Call:
lm(formula = JobRuntime ~ ExecutionTime, data = df)
Residuals:
Min 1Q Median 3Q Max
-59.597 -5.055 0.372 6.346 36.146
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.889e+02 1.606e+00 117.6 <2e-16 ***
ExecutionTime 1.163e+00 6.199e-03 187.6 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.8 on 630 degrees of freedom
Multiple R-squared: 0.9824, Adjusted R-squared: 0.9824
F-statistic: 3.521e+04 on 1 and 630 DF, p-value: < 2.2e-16
# Draw the default diagnostic plots for the fitted linear model m.
plot(m)