This notebook reproduces figures from the (Caicedo et al. 2017). This dataset is from (Ljosa 2013) and is part of the BBBC021 dataset:

This image set provides a basis for testing image-based profiling methods wrt. to their ability to predict the mechanisms of action of a compendium of drugs. The image set was collected using a typical set of morphological labels and uses a physiologically relevant p53-wildtype breast-cancer model system (MCF-7) and a mechanistically distinct set of targeted and cancer-relevant cytotoxic compounds that induces a broad range of gross and subtle phenotypes.

The images were analyzed using CellProfiler. The CellProfiler pipelines and complete set of instructions to generate the data are provided in (Ljosa 2013).

This notebooks assumes that the single-cell data https://s3.amazonaws.com/imaging-platform/projects/dp_treatment-classification_az/workspace/backend/ljosa_2013/analysis_batch_stable/ljosa_2013.sqlite has been downloaded into ~/Downloads.

library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)

Load data

First, load the data! The data is contained in 4 tables named Image, Cytoplasm, Cells, and Nuclei. The code below joins these tables to create a single table named object.

backend <- file.path(Sys.getenv("HOME"), "Downloads", "ljosa_2013.sqlite")
db <- src_sqlite(path = backend)
image <- tbl(src = db, "Image")
object <-
  tbl(src = db, "Cells") %>%
  inner_join(tbl(src = db, "Cytoplasm"),
                    by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(src = db, "Nuclei"),
                    by = c("TableNumber", "ImageNumber", "ObjectNumber"))
object %<>% inner_join(image, by = c("TableNumber", "ImageNumber"))

In this table, the measurement columns start with Nuclei_, Cells_, or Cytoplasm_.

variables <-
  colnames(object) %>%
  stringr::str_subset("^Nuclei_|^Cells_|^Cytoplasm_")

How many variables?

print(length(variables))
[1] 516

How many cells?

object %>%
  count() %>%
  knitr::kable(caption = "No. of cells")
n
424927

Sample cells

# sample images 
sampled_images <-
  image %>%
  filter(ImageNumber < 10) %>%
  collect(n = Inf)
if (db_has_table(db$con, table = "sampled_images")) {
  db$con %>% db_drop_table(table = "sampled_images")
}
sampled_images <- dplyr::copy_to(db, sampled_images)
# sample cells from the sampled images
sampled_objects <-
  sampled_images %>%
  inner_join(
    tbl(db, "Cells") %>% select(TableNumber, ImageNumber, ObjectNumber),
    by = c("TableNumber", "ImageNumber")) %>%
  collect(n = Inf) 
if (db_has_table(db$con, table = "sampled_objects")) {
  db$con %>% db_drop_table(table = "sampled_objects")
}
sampled_objects <- dplyr::copy_to(db, sampled_objects)
sampled_objects %<>%
  inner_join(tbl(db, "Cells"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Cytoplasm"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Nuclei"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  collect(n = Inf)
Column `Cells_AreaShape_Compactness`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_AreaShape_Eccentricity`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_AreaShape_FormFactor`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_AreaShape_MajorAxisLength`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_AreaShape_MinorAxisLength`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_AreaShape_Orientation`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_Location_Center_X`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cells_Location_Center_Y`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_Compactness`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_Eccentricity`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_FormFactor`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_MajorAxisLength`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_MinorAxisLength`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_AreaShape_Orientation`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_Location_Center_X`: mixed type, first seen values of type real, coercing other values of type stringColumn `Cytoplasm_Location_Center_Y`: mixed type, first seen values of type real, coercing other values of type string

Plot densities of features

if(!dir.exists("variable_densities")) {
  dir.create("variable_densities")
}
variables <- c("Cells_Texture_InfoMeas2_CorrTub_10_0",
               "Nuclei_AreaShape_Solidity",
               "Nuclei_AreaShape_Orientation",
               "Nuclei_Neighbors_FirstClosestDistance_20",
               "Nuclei_AreaShape_Zernike_9_7",
               "Nuclei_Intensity_MinIntensity_CorrDAPI",
               "Nuclei_Neighbors_NumberOfNeighbors_10")
for (variable in variables) {
  g <- ggplot(sampled_objects, aes_string(variable)) +
    stat_density(adjust = 0.8, alpha = 0.5)
  print(g)
  ggsave(plot = g,
         filename = sprintf("variable_densities/%s.png", variable),
         width = 5, height = 5)
}

LS0tCnRpdGxlOiAiUmVwcm9kdWNlIGZpZ3VyZXMgZnJvbSBDYWljZWRvIGV0IGFsLiAyMDE3IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpUaGlzIG5vdGVib29rIHJlcHJvZHVjZXMgZmlndXJlcyBmcm9tIHRoZSBbKENhaWNlZG8gZXQgYWwuIDIwMTcpXShodHRwczovL3d3dy5uYXR1cmUuY29tL2FydGljbGVzL25tZXRoLjQzOTcpLgpUaGlzIGRhdGFzZXQgaXMgZnJvbSBbKExqb3NhIDIwMTMpXShodHRwOi8vamJ4LnNhZ2VwdWIuY29tL2NvbnRlbnQvMTgvMTAvMTMyMSkgYW5kIGlzIHBhcnQgb2YgdGhlCltCQkJDMDIxXShodHRwczovL2RhdGEuYnJvYWRpbnN0aXR1dGUub3JnL2JiYmMvQkJCQzAyMS8pIGRhdGFzZXQ6CgoqVGhpcyBpbWFnZSBzZXQgcHJvdmlkZXMgYSBiYXNpcyBmb3IgdGVzdGluZyBpbWFnZS1iYXNlZCBwcm9maWxpbmcgbWV0aG9kcyB3cnQuIHRvIHRoZWlyIGFiaWxpdHkgdG8gcHJlZGljdCB0aGUgbWVjaGFuaXNtcyBvZiBhY3Rpb24gb2YgYSBjb21wZW5kaXVtIG9mIGRydWdzLiBUaGUgaW1hZ2Ugc2V0IHdhcyBjb2xsZWN0ZWQgdXNpbmcgYSB0eXBpY2FsIHNldCBvZiBtb3JwaG9sb2dpY2FsIGxhYmVscyBhbmQgdXNlcyBhIHBoeXNpb2xvZ2ljYWxseSByZWxldmFudCBwNTMtd2lsZHR5cGUgYnJlYXN0LWNhbmNlciBtb2RlbCBzeXN0ZW0gKE1DRi03KSBhbmQgYSBtZWNoYW5pc3RpY2FsbHkgZGlzdGluY3Qgc2V0IG9mIHRhcmdldGVkIGFuZCBjYW5jZXItcmVsZXZhbnQgY3l0b3RveGljIGNvbXBvdW5kcyB0aGF0IGluZHVjZXMgYSBicm9hZCByYW5nZSBvZiBncm9zcyBhbmQgc3VidGxlIHBoZW5vdHlwZXMuKgoKVGhlIGltYWdlcyB3ZXJlIGFuYWx5emVkIHVzaW5nIFtDZWxsUHJvZmlsZXJdKGh0dHA6Ly9jZWxscHJvZmlsZXIub3JnKS4gVGhlCkNlbGxQcm9maWxlciBwaXBlbGluZXMgYW5kIGNvbXBsZXRlIHNldCBvZiBpbnN0cnVjdGlvbnMgdG8gZ2VuZXJhdGUgdGhlIGRhdGEKYXJlIHByb3ZpZGVkIGluIFsoTGpvc2EgMjAxMyldKGh0dHA6Ly9qYnguc2FnZXB1Yi5jb20vY29udGVudC8xOC8xMC8xMzIxKS4KClRoaXMgbm90ZWJvb2tzIGFzc3VtZXMgdGhhdCB0aGUgc2luZ2xlLWNlbGwgZGF0YSA8aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2ltYWdpbmctcGxhdGZvcm0vcHJvamVjdHMvZHBfdHJlYXRtZW50LWNsYXNzaWZpY2F0aW9uX2F6L3dvcmtzcGFjZS9iYWNrZW5kL2xqb3NhXzIwMTMvYW5hbHlzaXNfYmF0Y2hfc3RhYmxlL2xqb3NhXzIwMTMuc3FsaXRlPiBoYXMgYmVlbiBkb3dubG9hZGVkIGludG8gYH4vRG93bmxvYWRzYC4KCgpgYGB7ciBsaWJyYXJpZXMsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShtYWdyaXR0cikKbGlicmFyeShzdHJpbmdyKQpgYGAKCgojIExvYWQgZGF0YQoKRmlyc3QsIGxvYWQgdGhlIGRhdGEhIFRoZSBkYXRhIGlzIGNvbnRhaW5lZCBpbiA0IHRhYmxlcyBuYW1lZCBgSW1hZ2VgLApgQ3l0b3BsYXNtYCwgYENlbGxzYCwgYW5kIGBOdWNsZWlgLiBUaGUgY29kZSBiZWxvdyBqb2lucyB0aGVzZSB0YWJsZXMgdG8KY3JlYXRlIGEgc2luZ2xlIHRhYmxlIG5hbWVkIGBvYmplY3RgLgoKCmBgYHtyIGxvYWQsIG1lc3NhZ2U9RkFMU0V9CgpiYWNrZW5kIDwtIGZpbGUucGF0aChTeXMuZ2V0ZW52KCJIT01FIiksICJEb3dubG9hZHMiLCAibGpvc2FfMjAxMy5zcWxpdGUiKQoKZGIgPC0gc3JjX3NxbGl0ZShwYXRoID0gYmFja2VuZCkKCmltYWdlIDwtIHRibChzcmMgPSBkYiwgIkltYWdlIikKCm9iamVjdCA8LQogIHRibChzcmMgPSBkYiwgIkNlbGxzIikgJT4lCiAgaW5uZXJfam9pbih0Ymwoc3JjID0gZGIsICJDeXRvcGxhc20iKSwKICAgICAgICAgICAgICAgICAgICBieSA9IGMoIlRhYmxlTnVtYmVyIiwgIkltYWdlTnVtYmVyIiwgIk9iamVjdE51bWJlciIpKSAlPiUKICBpbm5lcl9qb2luKHRibChzcmMgPSBkYiwgIk51Y2xlaSIpLAogICAgICAgICAgICAgICAgICAgIGJ5ID0gYygiVGFibGVOdW1iZXIiLCAiSW1hZ2VOdW1iZXIiLCAiT2JqZWN0TnVtYmVyIikpCgpvYmplY3QgJTw+JSBpbm5lcl9qb2luKGltYWdlLCBieSA9IGMoIlRhYmxlTnVtYmVyIiwgIkltYWdlTnVtYmVyIikpCgpgYGAKCgpJbiB0aGlzIHRhYmxlLCB0aGUgbWVhc3VyZW1lbnQgY29sdW1ucyBzdGFydCB3aXRoIGBOdWNsZWlfYCwgYENlbGxzX2AsIG9yCmBDeXRvcGxhc21fYC4KCmBgYHtyfQp2YXJpYWJsZXMgPC0KICBjb2xuYW1lcyhvYmplY3QpICU+JQogIHN0cmluZ3I6OnN0cl9zdWJzZXQoIl5OdWNsZWlffF5DZWxsc198XkN5dG9wbGFzbV8iKQpgYGAKCkhvdyBtYW55IHZhcmlhYmxlcz8KCmBgYHtyfQpwcmludChsZW5ndGgodmFyaWFibGVzKSkKYGBgCgpIb3cgbWFueSBjZWxscz8KCmBgYHtyfQpvYmplY3QgJT4lCiAgY291bnQoKSAlPiUKICBrbml0cjo6a2FibGUoY2FwdGlvbiA9ICJOby4gb2YgY2VsbHMiKQpgYGAKCiMjIFNhbXBsZSBjZWxscwoKYGBge3Igc2FtcGxlX2NlbGxzLCBtZXNzYWdlPUZBTFNFfQoKIyBzYW1wbGUgaW1hZ2VzIApzYW1wbGVkX2ltYWdlcyA8LQogIGltYWdlICU+JQogIGZpbHRlcihJbWFnZU51bWJlciA8IDEwKSAlPiUKICBjb2xsZWN0KG4gPSBJbmYpCgppZiAoZGJfaGFzX3RhYmxlKGRiJGNvbiwgdGFibGUgPSAic2FtcGxlZF9pbWFnZXMiKSkgewogIGRiJGNvbiAlPiUgZGJfZHJvcF90YWJsZSh0YWJsZSA9ICJzYW1wbGVkX2ltYWdlcyIpCn0KCnNhbXBsZWRfaW1hZ2VzIDwtIGRwbHlyOjpjb3B5X3RvKGRiLCBzYW1wbGVkX2ltYWdlcykKCiMgc2FtcGxlIGNlbGxzIGZyb20gdGhlIHNhbXBsZWQgaW1hZ2VzCnNhbXBsZWRfb2JqZWN0cyA8LQogIHNhbXBsZWRfaW1hZ2VzICU+JQogIGlubmVyX2pvaW4oCiAgICB0YmwoZGIsICJDZWxscyIpICU+JSBzZWxlY3QoVGFibGVOdW1iZXIsIEltYWdlTnVtYmVyLCBPYmplY3ROdW1iZXIpLAogICAgYnkgPSBjKCJUYWJsZU51bWJlciIsICJJbWFnZU51bWJlciIpKSAlPiUKICBjb2xsZWN0KG4gPSBJbmYpIAoKaWYgKGRiX2hhc190YWJsZShkYiRjb24sIHRhYmxlID0gInNhbXBsZWRfb2JqZWN0cyIpKSB7CiAgZGIkY29uICU+JSBkYl9kcm9wX3RhYmxlKHRhYmxlID0gInNhbXBsZWRfb2JqZWN0cyIpCn0KCnNhbXBsZWRfb2JqZWN0cyA8LSBkcGx5cjo6Y29weV90byhkYiwgc2FtcGxlZF9vYmplY3RzKQoKc2FtcGxlZF9vYmplY3RzICU8PiUKICBpbm5lcl9qb2luKHRibChkYiwgIkNlbGxzIiksIGJ5ID0gYygiVGFibGVOdW1iZXIiLCAiSW1hZ2VOdW1iZXIiLCAiT2JqZWN0TnVtYmVyIikpICU+JQogIGlubmVyX2pvaW4odGJsKGRiLCAiQ3l0b3BsYXNtIiksIGJ5ID0gYygiVGFibGVOdW1iZXIiLCAiSW1hZ2VOdW1iZXIiLCAiT2JqZWN0TnVtYmVyIikpICU+JQogIGlubmVyX2pvaW4odGJsKGRiLCAiTnVjbGVpIiksIGJ5ID0gYygiVGFibGVOdW1iZXIiLCAiSW1hZ2VOdW1iZXIiLCAiT2JqZWN0TnVtYmVyIikpICU+JQogIGNvbGxlY3QobiA9IEluZikKCmBgYAojIyBQbG90IGRlbnNpdGllcyBvZiBmZWF0dXJlcwoKYGBge3IgcGxvdF9kZW5zaXRpZXMsIGV2YWw9VFJVRX0KCmlmKCFkaXIuZXhpc3RzKCJ2YXJpYWJsZV9kZW5zaXRpZXMiKSkgewogIGRpci5jcmVhdGUoInZhcmlhYmxlX2RlbnNpdGllcyIpCn0KCnZhcmlhYmxlcyA8LSBjKCJDZWxsc19UZXh0dXJlX0luZm9NZWFzMl9Db3JyVHViXzEwXzAiLAogICAgICAgICAgICAgICAiTnVjbGVpX0FyZWFTaGFwZV9Tb2xpZGl0eSIsCiAgICAgICAgICAgICAgICJOdWNsZWlfQXJlYVNoYXBlX09yaWVudGF0aW9uIiwKICAgICAgICAgICAgICAgIk51Y2xlaV9OZWlnaGJvcnNfRmlyc3RDbG9zZXN0RGlzdGFuY2VfMjAiLAogICAgICAgICAgICAgICAiTnVjbGVpX0FyZWFTaGFwZV9aZXJuaWtlXzlfNyIsCiAgICAgICAgICAgICAgICJOdWNsZWlfSW50ZW5zaXR5X01pbkludGVuc2l0eV9Db3JyREFQSSIsCiAgICAgICAgICAgICAgICJOdWNsZWlfTmVpZ2hib3JzX051bWJlck9mTmVpZ2hib3JzXzEwIikKCmZvciAodmFyaWFibGUgaW4gdmFyaWFibGVzKSB7CiAgZyA8LSBnZ3Bsb3Qoc2FtcGxlZF9vYmplY3RzLCBhZXNfc3RyaW5nKHZhcmlhYmxlKSkgKwogICAgc3RhdF9kZW5zaXR5KGFkanVzdCA9IDAuOCwgYWxwaGEgPSAwLjUpCgogIHByaW50KGcpCiAgCiAgZ2dzYXZlKHBsb3QgPSBnLAogICAgICAgICBmaWxlbmFtZSA9IHNwcmludGYoInZhcmlhYmxlX2RlbnNpdGllcy8lcy5wbmciLCB2YXJpYWJsZSksCiAgICAgICAgIHdpZHRoID0gNSwgaGVpZ2h0ID0gNSkKfQoKYGBgCgo=