library(magrittr)
library(tidyverse)

1 Read

metadata <- 
  read_tsv("https://raw.githubusercontent.com/broadinstitute/lincs-cell-painting/master/metadata/moa/repurposing_info_external_moa_map_resolved.tsv") 

metadata %<>% select(broad_sample, moa)

2 Tidy

metadata %<>%
  group_by(broad_sample) %>%
  nest() %>%
  mutate(x = map(
    data,
    function(y)
      data.frame(moa = str_split(y$moa, "\\|")[[1]])
  )) %>% 
  select(-data) %>%
  unnest(x) %>%
  distinct() %>%
  na.omit()

3 Report count

Number of MOAs (n_moa) with the specified number of compounds (n_compound)

compound_count_per_moa <- 
  metadata %>% 
  group_by(moa) %>% 
  tally(name = "n_compound") %>% 
  ungroup() %>% 
  group_by(n_compound) %>% 
  tally(name = "n_moa")

compound_count_per_moa

ggplot(compound_count_per_moa, aes(n_compound, n_moa)) + 
  geom_col() +
  ggtitle("Num. of MOAs (n_moa) with the specified num. of compounds (n_compound)")

Number of compounds (n_compound) with the specified number of MOA labels (n_moa)

moa_count_per_compound <- 
  metadata %>% 
  group_by(broad_sample) %>% 
  tally(name = "n_moa") %>% 
  ungroup() %>%
  group_by(n_moa) %>%
  tally(name = "n_compound")

moa_count_per_compound

ggplot(moa_count_per_compound, aes(n_moa, n_compound)) + 
  geom_col() +
  ggtitle("Num. of compounds (n_compound) with the specified num. of MOA labels (n_moa)")

4 Assign

metadata %<>% 
  mutate(train = FALSE, 
         test = FALSE)

metadata %<>%
  mutate(marked = train | test)
metadata_init <- metadata

Get list of MOA sorted in ascending order of class size

moas <- 
  metadata %>% 
  group_by(moa) %>% 
  tally(name = "n_compound") %>%
  arrange(n_compound) %>% 
  extract2("moa")

Assign compounds to train / test ensuring that the assignments are consistent across MOAs

# loop over moa's
for (moa_i in moas) {
  
  moa_current <- metadata %>% filter(moa == moa_i)
  
  # number of compounds in this MOA
  n_compounds <- nrow(moa_current)
  
  # number of train, test to be picked
  
  if (n_compounds == 1) {
    # for singleton MOAs, assign to train
    
    n_test <- 0
    n_train <- 1
    
  } else {
    n_test <- ceiling(n_compounds * 0.2)
    n_train <- floor(n_compounds * 0.8)
  }
  
  # number that are already assigned as train or test
  n_test_marked <- sum(moa_current$test)
  n_train_marked <- sum(moa_current$train)
  
  # number that need to be assigned as train or test
  n_train_needed <- n_train - n_train_marked
  n_test_needed <- n_test - n_test_marked
  
  # split compounds into those that have already been previously marked vs not
  moa_current_notmarked <- moa_current %>% filter(!marked)
  moa_current_marked    <- moa_current %>% filter( marked)
  
  # number of compounds in this MOA that are not marked
  n_compounds_notmarked <- nrow(moa_current_notmarked)
  
  n_compounds_needed <- n_test_needed + n_train_needed
  
  # if you hit this, we have a problem and would need to randomly shuffle the moa list 
  # and start over
  stopifnot(n_compounds_needed == n_compounds_notmarked)
  
  n_test_needed <- max(n_test_needed, 0)
  
  n_train_needed <- max(n_train_needed, 0)
  
  train_flag <- c(rep(TRUE, n_train_needed), rep(FALSE, n_test_needed))
  
  train_flag <- sample(train_flag, n_compounds_needed)
  
  test_flag <- 
    !train_flag
  
  moa_current_notmarked$train <- train_flag
  
  moa_current_notmarked$test <- test_flag
  
  moa_current_notmarked$marked <- TRUE
  
  moa_current <-
    bind_rows(
      moa_current_notmarked,
      moa_current_marked
    )
  
  moa_other <- metadata %>% filter(moa != moa_i)
  
  moa_other_marked    <- metadata %>% filter((moa != moa_i) &  (marked))
  
  moa_other_notmarked <- metadata %>% filter((moa != moa_i) & (!marked))
  
  moa_other_notmarked %<>% 
    select(broad_sample, moa) %>% 
    left_join(
      moa_current_notmarked %>% select(-moa), 
      by = "broad_sample") %>%
    replace_na(list(train = FALSE, test = FALSE, marked = FALSE))
  
  metadata <- 
    bind_rows(
      moa_current,
      moa_other_marked,
      moa_other_notmarked
    )
  
  #message(sprintf("%s:%d", moa_i, nrow(metadata)))
}
metadata %>% 
  group_by(moa) %>%
  summarise(n_train = sum(train), n_test = sum(test)) %>% 
  mutate(n = n_train + n_test) %>%
  arrange(desc(n))

Make sure that a compounds is either in train or in test (we need to check this because a compound can occur in more than one MOA)

consistent <- function(x) all(x) || all(!x)

# should have zero rows if consistent
metadata %>% 
  group_by(broad_sample) %>%
  summarise(train = consistent(train),
            test = consistent(test)) %>%
  filter(!train | !test)
metadata %>% 
  group_by(broad_sample) %>%
  summarise(train = all(train),
            test = all(test)) %>%
  ungroup() %>%
  summarize(train = sum(train),
            test = sum(test)) %>%
  mutate(total = train + test)

metadata %>% 
  group_by(moa) %>%
  summarize(train = sum(train),
            test = sum(test)) %>%
  mutate(test_ratio = test/(train + test)) %>% 
  filter(test_ratio == 0 & train > 1) %>%
  arrange(desc(train))

metadata %>%
  ungroup() %>%
  distinct(broad_sample) %>%
  count()
metadata %>% write_csv("~/Desktop/lincs_split.csv")
LS0tCnRpdGxlOiAiU3BsaXQgTElOQ1MgQ2VsbCBQYWludGluZyBQaWxvdCAxIGRhdGEgaW50byB0cmFpbiAvIHRlc3QiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazoKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCiAgICB0b2NfZGVwdGg6IDMKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogICAgdGhlbWU6IGx1bWVuCi0tLQoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeShtYWdyaXR0cikKbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKIyBSZWFkCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQptZXRhZGF0YSA8LSAKICByZWFkX3RzdigiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL2Jyb2FkaW5zdGl0dXRlL2xpbmNzLWNlbGwtcGFpbnRpbmcvbWFzdGVyL21ldGFkYXRhL21vYS9yZXB1cnBvc2luZ19pbmZvX2V4dGVybmFsX21vYV9tYXBfcmVzb2x2ZWQudHN2IikgCgptZXRhZGF0YSAlPD4lIHNlbGVjdChicm9hZF9zYW1wbGUsIG1vYSkKYGBgCgojIFRpZHkKCmBgYHtyfQptZXRhZGF0YSAlPD4lCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBuZXN0KCkgJT4lCiAgbXV0YXRlKHggPSBtYXAoCiAgICBkYXRhLAogICAgZnVuY3Rpb24oeSkKICAgICAgZGF0YS5mcmFtZShtb2EgPSBzdHJfc3BsaXQoeSRtb2EsICJcXHwiKVtbMV1dKQogICkpICU+JSAKICBzZWxlY3QoLWRhdGEpICU+JQogIHVubmVzdCh4KSAlPiUKICBkaXN0aW5jdCgpICU+JQogIG5hLm9taXQoKQpgYGAKCiMgUmVwb3J0IGNvdW50CgpOdW1iZXIgb2YgTU9BcyAoYG5fbW9hYCkgd2l0aCB0aGUgc3BlY2lmaWVkIG51bWJlciBvZiBjb21wb3VuZHMgKGBuX2NvbXBvdW5kYCkKCmBgYHtyfQpjb21wb3VuZF9jb3VudF9wZXJfbW9hIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShtb2EpICU+JSAKICB0YWxseShuYW1lID0gIm5fY29tcG91bmQiKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBncm91cF9ieShuX2NvbXBvdW5kKSAlPiUgCiAgdGFsbHkobmFtZSA9ICJuX21vYSIpCgpjb21wb3VuZF9jb3VudF9wZXJfbW9hCgpnZ3Bsb3QoY29tcG91bmRfY291bnRfcGVyX21vYSwgYWVzKG5fY29tcG91bmQsIG5fbW9hKSkgKyAKICBnZW9tX2NvbCgpICsKICBnZ3RpdGxlKCJOdW0uIG9mIE1PQXMgKG5fbW9hKSB3aXRoIHRoZSBzcGVjaWZpZWQgbnVtLiBvZiBjb21wb3VuZHMgKG5fY29tcG91bmQpIikKYGBgCgpOdW1iZXIgb2YgY29tcG91bmRzIChgbl9jb21wb3VuZGApIHdpdGggdGhlIHNwZWNpZmllZCBudW1iZXIgb2YgTU9BIGxhYmVscyAoYG5fbW9hYCkKCmBgYHtyfQptb2FfY291bnRfcGVyX2NvbXBvdW5kIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShicm9hZF9zYW1wbGUpICU+JSAKICB0YWxseShuYW1lID0gIm5fbW9hIikgJT4lIAogIHVuZ3JvdXAoKSAlPiUKICBncm91cF9ieShuX21vYSkgJT4lCiAgdGFsbHkobmFtZSA9ICJuX2NvbXBvdW5kIikKCm1vYV9jb3VudF9wZXJfY29tcG91bmQKCmdncGxvdChtb2FfY291bnRfcGVyX2NvbXBvdW5kLCBhZXMobl9tb2EsIG5fY29tcG91bmQpKSArIAogIGdlb21fY29sKCkgKwogIGdndGl0bGUoIk51bS4gb2YgY29tcG91bmRzIChuX2NvbXBvdW5kKSB3aXRoIHRoZSBzcGVjaWZpZWQgbnVtLiBvZiBNT0EgbGFiZWxzIChuX21vYSkiKQoKYGBgCgojIEFzc2lnbgoKYGBge3J9Cm1ldGFkYXRhICU8PiUgCiAgbXV0YXRlKHRyYWluID0gRkFMU0UsIAogICAgICAgICB0ZXN0ID0gRkFMU0UpCgptZXRhZGF0YSAlPD4lCiAgbXV0YXRlKG1hcmtlZCA9IHRyYWluIHwgdGVzdCkKYGBgCgpgYGB7cn0KbWV0YWRhdGFfaW5pdCA8LSBtZXRhZGF0YQpgYGAKCkdldCBsaXN0IG9mIE1PQSBzb3J0ZWQgaW4gYXNjZW5kaW5nIG9yZGVyIG9mIGNsYXNzIHNpemUKCmBgYHtyfQptb2FzIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShtb2EpICU+JSAKICB0YWxseShuYW1lID0gIm5fY29tcG91bmQiKSAlPiUKICBhcnJhbmdlKG5fY29tcG91bmQpICU+JSAKICBleHRyYWN0MigibW9hIikKYGBgCgpBc3NpZ24gY29tcG91bmRzIHRvIHRyYWluIC8gdGVzdCBlbnN1cmluZyB0aGF0IHRoZSBhc3NpZ25tZW50cyBhcmUgY29uc2lzdGVudCBhY3Jvc3MgTU9BcwoKYGBge3J9CiMgbG9vcCBvdmVyIG1vYSdzCmZvciAobW9hX2kgaW4gbW9hcykgewogIAogIG1vYV9jdXJyZW50IDwtIG1ldGFkYXRhICU+JSBmaWx0ZXIobW9hID09IG1vYV9pKQogIAogICMgbnVtYmVyIG9mIGNvbXBvdW5kcyBpbiB0aGlzIE1PQQogIG5fY29tcG91bmRzIDwtIG5yb3cobW9hX2N1cnJlbnQpCiAgCiAgIyBudW1iZXIgb2YgdHJhaW4sIHRlc3QgdG8gYmUgcGlja2VkCiAgCiAgaWYgKG5fY29tcG91bmRzID09IDEpIHsKICAgICMgZm9yIHNpbmdsZXRvbiBNT0FzLCBhc3NpZ24gdG8gdHJhaW4KICAgIAogICAgbl90ZXN0IDwtIDAKICAgIG5fdHJhaW4gPC0gMQogICAgCiAgfSBlbHNlIHsKICAgIG5fdGVzdCA8LSBjZWlsaW5nKG5fY29tcG91bmRzICogMC4yKQogICAgbl90cmFpbiA8LSBmbG9vcihuX2NvbXBvdW5kcyAqIDAuOCkKICB9CiAgCiAgIyBudW1iZXIgdGhhdCBhcmUgYWxyZWFkeSBhc3NpZ25lZCBhcyB0cmFpbiBvciB0ZXN0CiAgbl90ZXN0X21hcmtlZCA8LSBzdW0obW9hX2N1cnJlbnQkdGVzdCkKICBuX3RyYWluX21hcmtlZCA8LSBzdW0obW9hX2N1cnJlbnQkdHJhaW4pCiAgCiAgIyBudW1iZXIgdGhhdCBuZWVkIHRvIGJlIGFzc2lnbmVkIGFzIHRyYWluIG9yIHRlc3QKICBuX3RyYWluX25lZWRlZCA8LSBuX3RyYWluIC0gbl90cmFpbl9tYXJrZWQKICBuX3Rlc3RfbmVlZGVkIDwtIG5fdGVzdCAtIG5fdGVzdF9tYXJrZWQKICAKICAjIHNwbGl0IGNvbXBvdW5kcyBpbnRvIHRob3NlIHRoYXQgaGF2ZSBhbHJlYWR5IGJlZW4gcHJldmlvdXNseSBtYXJrZWQgdnMgbm90CiAgbW9hX2N1cnJlbnRfbm90bWFya2VkIDwtIG1vYV9jdXJyZW50ICU+JSBmaWx0ZXIoIW1hcmtlZCkKICBtb2FfY3VycmVudF9tYXJrZWQgICAgPC0gbW9hX2N1cnJlbnQgJT4lIGZpbHRlciggbWFya2VkKQogIAogICMgbnVtYmVyIG9mIGNvbXBvdW5kcyBpbiB0aGlzIE1PQSB0aGF0IGFyZSBub3QgbWFya2VkCiAgbl9jb21wb3VuZHNfbm90bWFya2VkIDwtIG5yb3cobW9hX2N1cnJlbnRfbm90bWFya2VkKQogIAogIG5fY29tcG91bmRzX25lZWRlZCA8LSBuX3Rlc3RfbmVlZGVkICsgbl90cmFpbl9uZWVkZWQKICAKICAjIGlmIHlvdSBoaXQgdGhpcywgd2UgaGF2ZSBhIHByb2JsZW0gYW5kIHdvdWxkIG5lZWQgdG8gcmFuZG9tbHkgc2h1ZmZsZSB0aGUgbW9hIGxpc3QgCiAgIyBhbmQgc3RhcnQgb3ZlcgogIHN0b3BpZm5vdChuX2NvbXBvdW5kc19uZWVkZWQgPT0gbl9jb21wb3VuZHNfbm90bWFya2VkKQogIAogIG5fdGVzdF9uZWVkZWQgPC0gbWF4KG5fdGVzdF9uZWVkZWQsIDApCiAgCiAgbl90cmFpbl9uZWVkZWQgPC0gbWF4KG5fdHJhaW5fbmVlZGVkLCAwKQogIAogIHRyYWluX2ZsYWcgPC0gYyhyZXAoVFJVRSwgbl90cmFpbl9uZWVkZWQpLCByZXAoRkFMU0UsIG5fdGVzdF9uZWVkZWQpKQogIAogIHRyYWluX2ZsYWcgPC0gc2FtcGxlKHRyYWluX2ZsYWcsIG5fY29tcG91bmRzX25lZWRlZCkKICAKICB0ZXN0X2ZsYWcgPC0gCiAgICAhdHJhaW5fZmxhZwogIAogIG1vYV9jdXJyZW50X25vdG1hcmtlZCR0cmFpbiA8LSB0cmFpbl9mbGFnCiAgCiAgbW9hX2N1cnJlbnRfbm90bWFya2VkJHRlc3QgPC0gdGVzdF9mbGFnCiAgCiAgbW9hX2N1cnJlbnRfbm90bWFya2VkJG1hcmtlZCA8LSBUUlVFCiAgCiAgbW9hX2N1cnJlbnQgPC0KICAgIGJpbmRfcm93cygKICAgICAgbW9hX2N1cnJlbnRfbm90bWFya2VkLAogICAgICBtb2FfY3VycmVudF9tYXJrZWQKICAgICkKICAKICBtb2Ffb3RoZXIgPC0gbWV0YWRhdGEgJT4lIGZpbHRlcihtb2EgIT0gbW9hX2kpCiAgCiAgbW9hX290aGVyX21hcmtlZCAgICA8LSBtZXRhZGF0YSAlPiUgZmlsdGVyKChtb2EgIT0gbW9hX2kpICYgIChtYXJrZWQpKQogIAogIG1vYV9vdGhlcl9ub3RtYXJrZWQgPC0gbWV0YWRhdGEgJT4lIGZpbHRlcigobW9hICE9IG1vYV9pKSAmICghbWFya2VkKSkKICAKICBtb2Ffb3RoZXJfbm90bWFya2VkICU8PiUgCiAgICBzZWxlY3QoYnJvYWRfc2FtcGxlLCBtb2EpICU+JSAKICAgIGxlZnRfam9pbigKICAgICAgbW9hX2N1cnJlbnRfbm90bWFya2VkICU+JSBzZWxlY3QoLW1vYSksIAogICAgICBieSA9ICJicm9hZF9zYW1wbGUiKSAlPiUKICAgIHJlcGxhY2VfbmEobGlzdCh0cmFpbiA9IEZBTFNFLCB0ZXN0ID0gRkFMU0UsIG1hcmtlZCA9IEZBTFNFKSkKICAKICBtZXRhZGF0YSA8LSAKICAgIGJpbmRfcm93cygKICAgICAgbW9hX2N1cnJlbnQsCiAgICAgIG1vYV9vdGhlcl9tYXJrZWQsCiAgICAgIG1vYV9vdGhlcl9ub3RtYXJrZWQKICAgICkKICAKICAjbWVzc2FnZShzcHJpbnRmKCIlczolZCIsIG1vYV9pLCBucm93KG1ldGFkYXRhKSkpCn0KYGBgCgpgYGB7cn0KbWV0YWRhdGEgJT4lIAogIGdyb3VwX2J5KG1vYSkgJT4lCiAgc3VtbWFyaXNlKG5fdHJhaW4gPSBzdW0odHJhaW4pLCBuX3Rlc3QgPSBzdW0odGVzdCkpICU+JSAKICBtdXRhdGUobiA9IG5fdHJhaW4gKyBuX3Rlc3QpICU+JQogIGFycmFuZ2UoZGVzYyhuKSkKYGBgCgpNYWtlIHN1cmUgdGhhdCBhIGNvbXBvdW5kcyBpcyBlaXRoZXIgaW4gdHJhaW4gb3IgaW4gdGVzdCAod2UgbmVlZCB0byBjaGVjayB0aGlzIGJlY2F1c2UgYSBjb21wb3VuZCBjYW4gb2NjdXIgaW4gbW9yZSB0aGFuIG9uZSBNT0EpCgpgYGB7cn0KY29uc2lzdGVudCA8LSBmdW5jdGlvbih4KSBhbGwoeCkgfHwgYWxsKCF4KQoKIyBzaG91bGQgaGF2ZSB6ZXJvIHJvd3MgaWYgY29uc2lzdGVudAptZXRhZGF0YSAlPiUgCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBzdW1tYXJpc2UodHJhaW4gPSBjb25zaXN0ZW50KHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IGNvbnNpc3RlbnQodGVzdCkpICU+JQogIGZpbHRlcighdHJhaW4gfCAhdGVzdCkKYGBgCmBgYHtyfQptZXRhZGF0YSAlPiUgCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBzdW1tYXJpc2UodHJhaW4gPSBhbGwodHJhaW4pLAogICAgICAgICAgICB0ZXN0ID0gYWxsKHRlc3QpKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgc3VtbWFyaXplKHRyYWluID0gc3VtKHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IHN1bSh0ZXN0KSkgJT4lCiAgbXV0YXRlKHRvdGFsID0gdHJhaW4gKyB0ZXN0KQoKbWV0YWRhdGEgJT4lIAogIGdyb3VwX2J5KG1vYSkgJT4lCiAgc3VtbWFyaXplKHRyYWluID0gc3VtKHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IHN1bSh0ZXN0KSkgJT4lCiAgbXV0YXRlKHRlc3RfcmF0aW8gPSB0ZXN0Lyh0cmFpbiArIHRlc3QpKSAlPiUgCiAgZmlsdGVyKHRlc3RfcmF0aW8gPT0gMCAmIHRyYWluID4gMSkgJT4lCiAgYXJyYW5nZShkZXNjKHRyYWluKSkKCm1ldGFkYXRhICU+JQogIHVuZ3JvdXAoKSAlPiUKICBkaXN0aW5jdChicm9hZF9zYW1wbGUpICU+JQogIGNvdW50KCkKYGBgCgpgYGB7cn0KbWV0YWRhdGEgJT4lIHdyaXRlX2Nzdigifi9EZXNrdG9wL2xpbmNzX3NwbGl0LmNzdiIpCmBgYAo=