This notebook analyzes feature distributions. We categorize features as being multimodal, skewed but unimodel, or symmetric and unimodal. A statistic (Hartigan’s dip test statistic) is used to check for multimodality. Unimodal features with skewness > 2 are categorized as “skewed”. All other features are categorized as symmetric and unimodal. This is reported in a table in the notebook and also available as a CSV file. Additionally, we plot the histogram of each feature, saved as a PNG.

set.seed(42)

Download files

mkdir -p ~/Downloads/BBBC022_workspace

cd ~/Downloads/BBBC022_workspace

wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/backend_BBBC022_20646.tar.gz

wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/metadata_BBBC022.tar.gz

tar xvzf backend_BBBC022_20646.tar.gz

tar xvzf metadata_BBBC022.tar.gz

Load database backend

workspace_dir <- file.path(Sys.getenv("HOME"), "Downloads", "BBBC022_workspace")

batch_id <- "BBBC022_2013"

plate_id <- "20646"

plate_backend <- 
  file.path(workspace_dir, 
            paste0("backend/", batch_id, "/", plate_id, "/", plate_id,".sqlite"))

db <- src_sqlite(path = plate_backend)

Load metadata

barcode_platemap <- 
  suppressMessages(read_csv(file.path(workspace_dir, paste0("metadata/", batch_id, "/barcode_platemap.csv"))))

metadata <- 
paste0(
  file.path(workspace_dir, paste0("metadata/", batch_id,"/platemap/")),
  barcode_platemap$Plate_Map_Name %>% unique(),
  ".txt"
  ) %>% 
  map_df(function(x) suppressMessages(read_tsv(x))) %>%
  rename(Plate_Map_Name = plate_map_name) %>%
  inner_join(barcode_platemap, by = c("Plate_Map_Name")) %>%
  mutate(Plate = Assay_Plate_Barcode,
         Well = well_position) %>%
  mutate(broad_sample = ifelse(is.na(broad_sample), "DMSO", broad_sample))

names(metadata) %<>% str_c("Metadata", ., sep = "_")

if (db_has_table(db$con, table = "metadata")) {
  db$con %>% db_drop_table(table = "metadata")
}
metadata <- dplyr::copy_to(db, metadata)

Sample cells from DMSO wells

frac_cells_per_image <- .8

images_per_well <- 6

# sample images from DMSO wells
sampled_images <- 
  metadata %>%
  filter(Metadata_broad_sample == "DMSO") %>% 
  inner_join(tbl(db, "Image"), by = c("Metadata_Plate" = "Image_Metadata_Plate", 
                         "Metadata_Well" = "Image_Metadata_Well")) %>%
  select(matches("Metadata_|TableNumber|ImageNumber")) %>%
  collect() %>%
  group_by(Metadata_Plate, Metadata_Well) %>%
  sample_n(images_per_well) %>%
  ungroup()

if (db_has_table(db$con, table = "sampled_images")) {
  db$con %>% db_drop_table(table = "sampled_images")
}  

sampled_images <- dplyr::copy_to(db, sampled_images)

# sample cells from the sampled images
sampled_objects <-
  sampled_images %>%
  inner_join(
    tbl(db, "Cells") %>% select(TableNumber, ImageNumber, ObjectNumber),
    by = c("TableNumber", "ImageNumber")) %>%
  collect() %>%
  group_by(TableNumber, ImageNumber) %>%
  sample_frac(frac_cells_per_image) %>%
  ungroup()

if (db_has_table(db$con, table = "sampled_objects")) {
  db$con %>% db_drop_table(table = "sampled_objects")
}  

sampled_objects <- dplyr::copy_to(db, sampled_objects)
  
sampled_objects %<>%
  inner_join(tbl(db, "Cells"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Cytoplasm"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Nuclei"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  collect(n = Inf)

Peform Hartigan’s dip test for multimodality.

diptest_p_values <- 
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  map(function(x) diptest::dip.test(x)[["p.value"]]) %>%
  as_data_frame() %>%
  gather(feature, diptest_p_value)

diptest_p_values$diptest_p_value_adjusted <-
  p.adjust(diptest_p_values$diptest_p_value, method = "BH")

Compute skewness

skewnesses <- 
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  map(e1071::skewness) %>%
  as_data_frame() %>%
  gather(feature, skewness)

Collect, summarize and save statistics

features_statistics <- 
  inner_join(diptest_p_values, skewnesses, by = c("feature")) %>%
  mutate(is_multimodal = diptest_p_value_adjusted < 0.05) %>%
  mutate(is_skewed = skewness > 2) %>%
  mutate(is_skewed = ifelse(is_multimodal, NA, is_skewed))
  
features_statistics %>%
  group_by(is_skewed) %>%
  tally() %>% 
  knitr::kable(caption = "Feature skewness summary", digits = 2)
Feature skewness summary
is_skewed n
FALSE 601
TRUE 203
NA 20
features_statistics %>%
  group_by(is_multimodal) %>%
  tally() %>% 
  knitr::kable(caption = "Feature multimodality summary", digits = 2)
Feature multimodality summary
is_multimodal n
FALSE 808
TRUE 16
features_statistics %>% 
  mutate(neg_log_p_value_adjusted_dip = -log(diptest_p_value_adjusted, base = 10)) %>%
  filter(diptest_p_value_adjusted < 0.25) %>% 
  select(feature, neg_log_p_value_adjusted_dip) %>%
  arrange(-neg_log_p_value_adjusted_dip) %>% 
  knitr::kable(caption = "Feature multimodality table", digits = 3)
Feature multimodality table
feature neg_log_p_value_adjusted_dip
Cells_Intensity_MaxIntensityEdge_DNA Inf
Cells_Intensity_MinIntensityEdge_AGP Inf
Cells_Intensity_UpperQuartileIntensity_DNA Inf
Cells_Neighbors_FirstClosestObjectNumber_5 Inf
Cells_Neighbors_NumberOfNeighbors_5 Inf
Cells_Neighbors_SecondClosestObjectNumber_5 Inf
Cytoplasm_Intensity_MinIntensityEdge_AGP Inf
Nuclei_Intensity_IntegratedIntensity_DNA Inf
Nuclei_Neighbors_FirstClosestObjectNumber_1 Inf
Nuclei_Neighbors_NumberOfNeighbors_1 Inf
Nuclei_Neighbors_PercentTouching_1 Inf
Nuclei_Neighbors_SecondClosestObjectNumber_1 Inf
Cells_Parent_Nuclei 3.765
Cytoplasm_Parent_Cells 3.765
Cytoplasm_Parent_Nuclei 3.765
Nuclei_AreaShape_Orientation 2.316
Cells_AreaShape_FormFactor 0.663
features_statistics %>% 
  select(feature, skewness) %>%
  na.omit() %>%
  filter(abs(skewness) > 2) %>% 
  arrange(-abs(skewness)) %>% 
  knitr::kable(caption = "Feature skewness table", digits = 3)
Feature skewness table
feature skewness
Cells_AreaShape_EulerNumber -172.108
Cytoplasm_AreaShape_EulerNumber -19.323
Nuclei_Texture_AngularSecondMoment_DNA_5 15.635
Nuclei_Texture_AngularSecondMoment_DNA_3 14.804
Cytoplasm_Intensity_MaxIntensity_DNA 13.839
Cytoplasm_Intensity_StdIntensity_DNA 11.171
Cytoplasm_Intensity_MaxIntensityEdge_DNA 9.932
Cells_Intensity_LowerQuartileIntensity_DNA 8.895
Nuclei_Intensity_StdIntensity_AGP 7.923
Nuclei_Intensity_MassDisplacement_AGP 7.878
Nuclei_Intensity_MaxIntensityEdge_Mito 7.680
Nuclei_Intensity_StdIntensity_Mito 7.584
Nuclei_Intensity_MaxIntensity_Mito 7.374
Cells_Intensity_MaxIntensityEdge_Mito 7.200
Cytoplasm_Intensity_MaxIntensityEdge_Mito 7.000
Cytoplasm_Intensity_MaxIntensity_Mito 6.493
Cells_Intensity_MaxIntensity_Mito 6.303
Cells_Intensity_StdIntensity_AGP 6.125
Cells_Texture_AngularSecondMoment_AGP_5 6.037
Nuclei_Intensity_StdIntensityEdge_Mito 5.982
Cells_Intensity_UpperQuartileIntensity_ER 5.899
Nuclei_Intensity_UpperQuartileIntensity_Mito 5.847
Nuclei_Intensity_MassDisplacement_RNA 5.833
Nuclei_Intensity_MedianIntensity_ER 5.758
Nuclei_Intensity_LowerQuartileIntensity_ER 5.700
Cells_Texture_AngularSecondMoment_AGP_3 5.674
Nuclei_Intensity_MaxIntensityEdge_DNA 5.665
Nuclei_Intensity_MedianIntensity_Mito 5.646
Nuclei_Intensity_MassDisplacement_DNA 5.578
Nuclei_Texture_AngularSecondMoment_RNA_5 5.477
Cytoplasm_Texture_AngularSecondMoment_AGP_5 5.365
Nuclei_Intensity_MassDisplacement_Mito 5.355
Cells_Intensity_StdIntensity_Mito 5.352
Nuclei_Intensity_UpperQuartileIntensity_AGP 5.342
Nuclei_Intensity_StdIntensity_RNA 5.340
Nuclei_Intensity_UpperQuartileIntensity_ER 5.338
Cytoplasm_Intensity_StdIntensityEdge_DNA 5.312
Nuclei_Intensity_StdIntensity_DNA 5.259
Nuclei_Intensity_LowerQuartileIntensity_Mito 5.255
Nuclei_Intensity_MeanIntensity_Mito 5.246
Cytoplasm_Intensity_StdIntensity_Mito 5.238
Nuclei_Intensity_StdIntensityEdge_AGP 5.229
Nuclei_Texture_AngularSecondMoment_AGP_5 5.173
Nuclei_Intensity_MedianIntensity_AGP 5.145
Cytoplasm_Texture_AngularSecondMoment_AGP_3 5.135
Nuclei_Texture_AngularSecondMoment_AGP_3 5.047
Cells_Intensity_MaxIntensity_DNA 5.025
Nuclei_Intensity_MaxIntensity_AGP 5.011
Nuclei_Intensity_MaxIntensity_DNA 4.981
Nuclei_Intensity_LowerQuartileIntensity_AGP 4.912
Nuclei_Texture_AngularSecondMoment_RNA_3 4.866
Nuclei_Intensity_MeanIntensity_AGP 4.844
Nuclei_Intensity_MeanIntensity_ER 4.829
Cytoplasm_Intensity_StdIntensityEdge_AGP 4.825
Cells_Intensity_UpperQuartileIntensity_Mito 4.824
Cytoplasm_Intensity_StdIntensity_AGP 4.769
Nuclei_Intensity_MedianIntensity_RNA 4.743
Cells_Intensity_StdIntensityEdge_Mito 4.628
Nuclei_Intensity_UpperQuartileIntensity_RNA 4.585
Nuclei_Intensity_LowerQuartileIntensity_RNA 4.568
Cytoplasm_Intensity_StdIntensityEdge_Mito 4.530
Cells_Intensity_MaxIntensity_AGP 4.417
Cells_Intensity_MedianIntensity_DNA 4.416
Nuclei_Texture_AngularSecondMoment_Mito_5 4.416
Cells_Texture_AngularSecondMoment_Mito_5 4.318
Cells_Intensity_StdIntensity_RNA 4.317
Nuclei_Intensity_MeanIntensity_RNA 4.316
Cells_Intensity_StdIntensity_ER 4.314
Nuclei_Texture_AngularSecondMoment_Mito_3 4.273
Cytoplasm_Intensity_MedianIntensity_DNA 4.243
Cells_Intensity_UpperQuartileIntensity_RNA 4.192
Cytoplasm_Intensity_StdIntensity_ER 4.098
Cells_Texture_AngularSecondMoment_Mito_3 4.073
Nuclei_Intensity_StdIntensity_ER 4.054
Cells_Intensity_UpperQuartileIntensity_AGP 4.044
Cytoplasm_Intensity_MaxIntensity_AGP 4.022
Cells_Intensity_MaxIntensityEdge_ER 3.972
Cells_Intensity_StdIntensityEdge_ER 3.887
Cytoplasm_Intensity_UpperQuartileIntensity_DNA 3.877
Cytoplasm_Intensity_UpperQuartileIntensity_ER 3.841
Cells_Texture_AngularSecondMoment_RNA_5 3.809
Nuclei_Intensity_MaxIntensityEdge_AGP 3.778
Nuclei_Intensity_MeanIntensityEdge_RNA 3.776
Cytoplasm_Intensity_StdIntensityEdge_RNA 3.766
Nuclei_Intensity_MeanIntensityEdge_AGP 3.749
Cells_Intensity_StdIntensityEdge_AGP 3.746
Cells_RadialDistribution_RadialCV_AGP_3of4 3.727
Cells_Intensity_StdIntensityEdge_RNA 3.726
Cytoplasm_Intensity_MaxIntensityEdge_AGP 3.701
Cells_RadialDistribution_RadialCV_AGP_4of4 3.673
Nuclei_Intensity_MaxIntensity_RNA 3.630
Cytoplasm_Intensity_StdIntensity_RNA 3.623
Cells_Intensity_MaxIntensity_RNA 3.602
Nuclei_Intensity_StdIntensityEdge_DNA 3.541
Cells_Intensity_MaxIntensityEdge_AGP 3.498
Cells_Intensity_MeanIntensity_ER 3.468
Cells_Texture_AngularSecondMoment_RNA_3 3.429
Cytoplasm_Intensity_MassDisplacement_DNA 3.398
Cells_RadialDistribution_RadialCV_Mito_1of4 3.393
Cells_RadialDistribution_RadialCV_AGP_2of4 3.380
Cytoplasm_Intensity_StdIntensityEdge_ER 3.352
Nuclei_Intensity_UpperQuartileIntensity_DNA 3.313
Cells_Intensity_MassDisplacement_AGP 3.306
Cytoplasm_Texture_SumAverage_DNA_5 3.303
Cells_RadialDistribution_MeanFrac_AGP_1of4 3.289
Cells_Intensity_MaxIntensityEdge_RNA 3.278
Nuclei_Intensity_IntegratedIntensity_Mito 3.253
Cells_Intensity_StdIntensity_DNA 3.187
Cytoplasm_Texture_Contrast_DNA_3 3.181
Cytoplasm_Texture_AngularSecondMoment_RNA_5 3.167
Cells_RadialDistribution_RadialCV_AGP_1of4 3.144
Cytoplasm_Texture_AngularSecondMoment_Mito_5 3.144
Nuclei_Intensity_StdIntensityEdge_RNA 3.123
Nuclei_Intensity_MeanIntensityEdge_Mito 3.122
Cytoplasm_Texture_SumAverage_DNA_3 3.109
Cells_RadialDistribution_MeanFrac_AGP_4of4 -3.101
Cytoplasm_Texture_Contrast_RNA_5 3.071
Cells_RadialDistribution_MeanFrac_AGP_2of4 3.053
Cells_Texture_Correlation_AGP_3 3.049
Nuclei_Intensity_MassDisplacement_ER 3.036
Cytoplasm_Intensity_MeanIntensity_DNA 3.031
Cytoplasm_Texture_AngularSecondMoment_Mito_3 3.002
Cells_Intensity_MeanIntensity_Mito 3.001
Cytoplasm_Intensity_MeanIntensity_ER 2.943
Nuclei_Texture_SumVariance_ER_5 2.923
Cytoplasm_Texture_AngularSecondMoment_RNA_3 2.920
Nuclei_Intensity_IntegratedIntensity_AGP 2.891
Nuclei_Intensity_MinIntensity_Mito 2.874
Cytoplasm_Intensity_MaxIntensity_RNA 2.873
Cytoplasm_Intensity_MaxIntensityEdge_ER 2.866
Nuclei_AreaShape_Solidity -2.846
Nuclei_Texture_SumVariance_AGP_5 2.844
Cytoplasm_Intensity_MassDisplacement_AGP 2.834
Cells_RadialDistribution_MeanFrac_AGP_3of4 2.813
Nuclei_Texture_Contrast_DNA_5 2.795
Nuclei_Intensity_MinIntensity_AGP 2.793
Cells_Intensity_MeanIntensity_RNA 2.766
Nuclei_Intensity_MinIntensityEdge_AGP 2.753
Nuclei_Intensity_MeanIntensityEdge_ER 2.751
Cytoplasm_Intensity_MaxIntensityEdge_RNA 2.751
Cells_Texture_SumEntropy_Mito_5 -2.744
Cytoplasm_Intensity_LowerQuartileIntensity_DNA 2.737
Nuclei_Intensity_MaxIntensityEdge_RNA 2.723
Cytoplasm_Texture_DifferenceVariance_RNA_5 2.710
Cells_Texture_SumEntropy_Mito_3 -2.706
Cytoplasm_Texture_Contrast_DNA_5 2.705
Nuclei_Intensity_MinIntensityEdge_Mito 2.669
Nuclei_Intensity_IntegratedIntensity_RNA 2.663
Nuclei_Neighbors_PercentTouching_1 2.654
Cells_Intensity_MedianIntensity_AGP 2.650
Cytoplasm_Intensity_UpperQuartileIntensity_RNA 2.646
Cytoplasm_Texture_Contrast_RNA_3 2.643
Cytoplasm_Texture_Gabor_Mito_3 2.639
Nuclei_Intensity_IntegratedIntensityEdge_AGP 2.637
Cells_RadialDistribution_MeanFrac_Mito_1of4 2.626
Cytoplasm_Texture_Gabor_ER_3 2.619
Cells_Texture_Gabor_RNA_3 2.615
Cells_Intensity_MeanIntensity_AGP 2.592
Nuclei_Intensity_MinIntensity_ER 2.535
Cytoplasm_Texture_Contrast_ER_5 2.525
Nuclei_Texture_SumVariance_AGP_3 2.515
Cells_Intensity_MeanIntensityEdge_DNA 2.498
Nuclei_Intensity_IntegratedIntensityEdge_RNA 2.493
Cytoplasm_Intensity_UpperQuartileIntensity_AGP 2.492
Cytoplasm_Texture_Gabor_AGP_3 2.491
Cells_Texture_SumVariance_AGP_5 2.486
Nuclei_Intensity_IntegratedIntensityEdge_Mito 2.480
Cells_Texture_Correlation_DNA_3 -2.476
Nuclei_Intensity_IntegratedIntensity_ER 2.472
Nuclei_Texture_SumVariance_ER_3 2.467
Cells_Texture_InfoMeas2_Mito_3 -2.463
Cells_Texture_SumVariance_AGP_3 2.454
Cytoplasm_Texture_SumVariance_DNA_5 2.445
Cells_Intensity_StdIntensityEdge_DNA 2.441
Cells_RadialDistribution_RadialCV_Mito_2of4 2.411
Nuclei_Intensity_MinIntensity_RNA 2.391
Cells_Intensity_MaxIntensityEdge_DNA 2.384
Cells_Intensity_MedianIntensity_ER 2.379
Cells_Intensity_MaxIntensity_ER 2.369
Nuclei_Intensity_MinIntensityEdge_RNA 2.369
Cells_Texture_DifferenceVariance_RNA_5 2.360
Cytoplasm_Intensity_MaxIntensity_ER 2.355
Cytoplasm_Texture_InfoMeas2_Mito_3 -2.344
Cytoplasm_Texture_Gabor_AGP_5 2.337
Cytoplasm_Texture_Gabor_RNA_3 2.331
Nuclei_Texture_InfoMeas1_AGP_5 -2.329
Cells_RadialDistribution_RadialCV_RNA_4of4 2.324
Nuclei_AreaShape_Area 2.318
Cytoplasm_Texture_Gabor_ER_5 2.314
Cells_Texture_SumEntropy_AGP_3 -2.293
Cells_Texture_Entropy_Mito_5 -2.273
Cytoplasm_Texture_DifferenceVariance_ER_5 2.272
Cytoplasm_Texture_Contrast_ER_3 2.257
Nuclei_Intensity_StdIntensityEdge_ER 2.241
Cells_Texture_Entropy_Mito_3 -2.240
Nuclei_AreaShape_Perimeter 2.216
Cytoplasm_Texture_SumVariance_DNA_3 2.215
Nuclei_Texture_AngularSecondMoment_ER_5 2.212
Cells_RadialDistribution_RadialCV_Mito_4of4 2.189
Cytoplasm_Texture_Gabor_Mito_5 2.182
Nuclei_Intensity_IntegratedIntensity_DNA 2.175
Nuclei_Intensity_MeanIntensity_DNA 2.170
Nuclei_Intensity_IntegratedIntensityEdge_DNA 2.162
Cells_Texture_Gabor_Mito_3 2.158
Nuclei_Intensity_MinIntensityEdge_ER 2.157
Cells_RadialDistribution_RadialCV_Mito_3of4 2.156
Cytoplasm_Texture_DifferenceVariance_RNA_3 2.150
Cells_Intensity_MedianIntensity_Mito 2.139
Nuclei_Intensity_MaxIntensity_ER 2.134
Cytoplasm_Intensity_MedianIntensity_ER 2.126
Cells_Texture_SumEntropy_AGP_5 -2.104
Nuclei_Texture_AngularSecondMoment_ER_3 2.089
Cells_Texture_SumEntropy_RNA_5 -2.088
Cytoplasm_Texture_SumEntropy_Mito_5 -2.086
Cells_Texture_Correlation_AGP_5 2.081
Cytoplasm_Intensity_UpperQuartileIntensity_Mito 2.073
Cytoplasm_Texture_SumEntropy_Mito_3 -2.066
Cells_RadialDistribution_MeanFrac_ER_1of4 2.059
Cells_Texture_Gabor_DNA_3 2.058
Cells_Texture_SumEntropy_RNA_3 -2.057
Cells_Texture_Contrast_RNA_5 2.055
Cytoplasm_Texture_Gabor_RNA_5 2.025
Cytoplasm_Intensity_MeanIntensity_Mito 2.008
Nuclei_Texture_SumVariance_Mito_5 2.007
features_statistics %>% write_csv("feature_statistics.csv")

Plot histograms of features and save as PNGs

if(!dir.exists("feature_histograms")) {
  dir.create("feature_histograms")  
}

feature_names <- names(sampled_objects) %>% 
  str_subset("Cells_|Cytoplasm_|Nuclei_")

for (feature_name in feature_names) {
  g <- ggplot(sampled_objects, aes_string(feature_name)) + 
    geom_histogram(bins = 50)
  
  ggsave(plot = g, 
         filename = sprintf("feature_histograms/%s.png", feature_name), 
         width = 5, height = 5)
}