library(magrittr)
library(tidyverse)
Read
metadata <-
read_tsv("https://raw.githubusercontent.com/broadinstitute/lincs-cell-painting/master/metadata/moa/repurposing_info_external_moa_map_resolved.tsv")
metadata %<>% select(broad_sample, moa)
Tidy
metadata %<>%
group_by(broad_sample) %>%
nest() %>%
mutate(x = map(
data,
function(y)
data.frame(moa = str_split(y$moa, "\\|")[[1]])
)) %>%
select(-data) %>%
unnest(x) %>%
distinct() %>%
na.omit()
Report count
Number of MOAs (n_moa) with the specified number of compounds (n_compound)
compound_count_per_moa <-
metadata %>%
group_by(moa) %>%
tally(name = "n_compound") %>%
ungroup() %>%
group_by(n_compound) %>%
tally(name = "n_moa")
compound_count_per_moa
ggplot(compound_count_per_moa, aes(n_compound, n_moa)) +
geom_col() +
ggtitle("Num. of MOAs (n_moa) with the specified num. of compounds (n_compound)")

Number of compounds (n_compound) with the specified number of MOA labels (n_moa)
moa_count_per_compound <-
metadata %>%
group_by(broad_sample) %>%
tally(name = "n_moa") %>%
ungroup() %>%
group_by(n_moa) %>%
tally(name = "n_compound")
moa_count_per_compound
ggplot(moa_count_per_compound, aes(n_moa, n_compound)) +
geom_col() +
ggtitle("Num. of compounds (n_compound) with the specified num. of MOA labels (n_moa)")

Assign
metadata %<>%
mutate(train = FALSE,
test = FALSE)
metadata %<>%
mutate(marked = train | test)
metadata_init <- metadata
Get list of MOA sorted in ascending order of class size
moas <-
metadata %>%
group_by(moa) %>%
tally(name = "n_compound") %>%
arrange(n_compound) %>%
extract2("moa")
Assign compounds to train / test ensuring that the assignments are consistent across MOAs
# loop over moa's
for (moa_i in moas) {
moa_current <- metadata %>% filter(moa == moa_i)
# number of compounds in this MOA
n_compounds <- nrow(moa_current)
# number of train, test to be picked
if (n_compounds == 1) {
# for singleton MOAs, assign to train
n_test <- 0
n_train <- 1
} else {
n_test <- ceiling(n_compounds * 0.2)
n_train <- floor(n_compounds * 0.8)
}
# number that are already assigned as train or test
n_test_marked <- sum(moa_current$test)
n_train_marked <- sum(moa_current$train)
# number that need to be assigned as train or test
n_train_needed <- n_train - n_train_marked
n_test_needed <- n_test - n_test_marked
# split compounds into those that have already been previously marked vs not
moa_current_notmarked <- moa_current %>% filter(!marked)
moa_current_marked <- moa_current %>% filter( marked)
# number of compounds in this MOA that are not marked
n_compounds_notmarked <- nrow(moa_current_notmarked)
n_compounds_needed <- n_test_needed + n_train_needed
# if you hit this, we have a problem and would need to randomly shuffle the moa list
# and start over
stopifnot(n_compounds_needed == n_compounds_notmarked)
n_test_needed <- max(n_test_needed, 0)
n_train_needed <- max(n_train_needed, 0)
train_flag <- c(rep(TRUE, n_train_needed), rep(FALSE, n_test_needed))
train_flag <- sample(train_flag, n_compounds_needed)
test_flag <-
!train_flag
moa_current_notmarked$train <- train_flag
moa_current_notmarked$test <- test_flag
moa_current_notmarked$marked <- TRUE
moa_current <-
bind_rows(
moa_current_notmarked,
moa_current_marked
)
moa_other <- metadata %>% filter(moa != moa_i)
moa_other_marked <- metadata %>% filter((moa != moa_i) & (marked))
moa_other_notmarked <- metadata %>% filter((moa != moa_i) & (!marked))
moa_other_notmarked %<>%
select(broad_sample, moa) %>%
left_join(
moa_current_notmarked %>% select(-moa),
by = "broad_sample") %>%
replace_na(list(train = FALSE, test = FALSE, marked = FALSE))
metadata <-
bind_rows(
moa_current,
moa_other_marked,
moa_other_notmarked
)
#message(sprintf("%s:%d", moa_i, nrow(metadata)))
}
metadata %>%
group_by(moa) %>%
summarise(n_train = sum(train), n_test = sum(test)) %>%
mutate(n = n_train + n_test) %>%
arrange(desc(n))
Make sure that a compounds is either in train or in test (we need to check this because a compound can occur in more than one MOA)
consistent <- function(x) all(x) || all(!x)
# should have zero rows if consistent
metadata %>%
group_by(broad_sample) %>%
summarise(train = consistent(train),
test = consistent(test)) %>%
filter(!train | !test)
metadata %>%
group_by(broad_sample) %>%
summarise(train = all(train),
test = all(test)) %>%
ungroup() %>%
summarize(train = sum(train),
test = sum(test)) %>%
mutate(total = train + test)
metadata %>%
group_by(moa) %>%
summarize(train = sum(train),
test = sum(test)) %>%
mutate(test_ratio = test/(train + test)) %>%
filter(test_ratio == 0 & train > 1) %>%
arrange(desc(train))
metadata %>%
ungroup() %>%
distinct(broad_sample) %>%
count()
metadata %>% write_csv("~/Desktop/lincs_split.csv")
LS0tCnRpdGxlOiAiU3BsaXQgTElOQ1MgQ2VsbCBQYWludGluZyBQaWxvdCAxIGRhdGEgaW50byB0cmFpbiAvIHRlc3QiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazoKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCiAgICB0b2NfZGVwdGg6IDMKICAgIG51bWJlcl9zZWN0aW9uczogdHJ1ZQogICAgdGhlbWU6IGx1bWVuCi0tLQoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeShtYWdyaXR0cikKbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKIyBSZWFkCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQptZXRhZGF0YSA8LSAKICByZWFkX3RzdigiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL2Jyb2FkaW5zdGl0dXRlL2xpbmNzLWNlbGwtcGFpbnRpbmcvbWFzdGVyL21ldGFkYXRhL21vYS9yZXB1cnBvc2luZ19pbmZvX2V4dGVybmFsX21vYV9tYXBfcmVzb2x2ZWQudHN2IikgCgptZXRhZGF0YSAlPD4lIHNlbGVjdChicm9hZF9zYW1wbGUsIG1vYSkKYGBgCgojIFRpZHkKCmBgYHtyfQptZXRhZGF0YSAlPD4lCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBuZXN0KCkgJT4lCiAgbXV0YXRlKHggPSBtYXAoCiAgICBkYXRhLAogICAgZnVuY3Rpb24oeSkKICAgICAgZGF0YS5mcmFtZShtb2EgPSBzdHJfc3BsaXQoeSRtb2EsICJcXHwiKVtbMV1dKQogICkpICU+JSAKICBzZWxlY3QoLWRhdGEpICU+JQogIHVubmVzdCh4KSAlPiUKICBkaXN0aW5jdCgpICU+JQogIG5hLm9taXQoKQpgYGAKCiMgUmVwb3J0IGNvdW50CgpOdW1iZXIgb2YgTU9BcyAoYG5fbW9hYCkgd2l0aCB0aGUgc3BlY2lmaWVkIG51bWJlciBvZiBjb21wb3VuZHMgKGBuX2NvbXBvdW5kYCkKCmBgYHtyfQpjb21wb3VuZF9jb3VudF9wZXJfbW9hIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShtb2EpICU+JSAKICB0YWxseShuYW1lID0gIm5fY29tcG91bmQiKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBncm91cF9ieShuX2NvbXBvdW5kKSAlPiUgCiAgdGFsbHkobmFtZSA9ICJuX21vYSIpCgpjb21wb3VuZF9jb3VudF9wZXJfbW9hCgpnZ3Bsb3QoY29tcG91bmRfY291bnRfcGVyX21vYSwgYWVzKG5fY29tcG91bmQsIG5fbW9hKSkgKyAKICBnZW9tX2NvbCgpICsKICBnZ3RpdGxlKCJOdW0uIG9mIE1PQXMgKG5fbW9hKSB3aXRoIHRoZSBzcGVjaWZpZWQgbnVtLiBvZiBjb21wb3VuZHMgKG5fY29tcG91bmQpIikKYGBgCgpOdW1iZXIgb2YgY29tcG91bmRzIChgbl9jb21wb3VuZGApIHdpdGggdGhlIHNwZWNpZmllZCBudW1iZXIgb2YgTU9BIGxhYmVscyAoYG5fbW9hYCkKCmBgYHtyfQptb2FfY291bnRfcGVyX2NvbXBvdW5kIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShicm9hZF9zYW1wbGUpICU+JSAKICB0YWxseShuYW1lID0gIm5fbW9hIikgJT4lIAogIHVuZ3JvdXAoKSAlPiUKICBncm91cF9ieShuX21vYSkgJT4lCiAgdGFsbHkobmFtZSA9ICJuX2NvbXBvdW5kIikKCm1vYV9jb3VudF9wZXJfY29tcG91bmQKCmdncGxvdChtb2FfY291bnRfcGVyX2NvbXBvdW5kLCBhZXMobl9tb2EsIG5fY29tcG91bmQpKSArIAogIGdlb21fY29sKCkgKwogIGdndGl0bGUoIk51bS4gb2YgY29tcG91bmRzIChuX2NvbXBvdW5kKSB3aXRoIHRoZSBzcGVjaWZpZWQgbnVtLiBvZiBNT0EgbGFiZWxzIChuX21vYSkiKQoKYGBgCgojIEFzc2lnbgoKYGBge3J9Cm1ldGFkYXRhICU8PiUgCiAgbXV0YXRlKHRyYWluID0gRkFMU0UsIAogICAgICAgICB0ZXN0ID0gRkFMU0UpCgptZXRhZGF0YSAlPD4lCiAgbXV0YXRlKG1hcmtlZCA9IHRyYWluIHwgdGVzdCkKYGBgCgpgYGB7cn0KbWV0YWRhdGFfaW5pdCA8LSBtZXRhZGF0YQpgYGAKCkdldCBsaXN0IG9mIE1PQSBzb3J0ZWQgaW4gYXNjZW5kaW5nIG9yZGVyIG9mIGNsYXNzIHNpemUKCmBgYHtyfQptb2FzIDwtIAogIG1ldGFkYXRhICU+JSAKICBncm91cF9ieShtb2EpICU+JSAKICB0YWxseShuYW1lID0gIm5fY29tcG91bmQiKSAlPiUKICBhcnJhbmdlKG5fY29tcG91bmQpICU+JSAKICBleHRyYWN0MigibW9hIikKYGBgCgpBc3NpZ24gY29tcG91bmRzIHRvIHRyYWluIC8gdGVzdCBlbnN1cmluZyB0aGF0IHRoZSBhc3NpZ25tZW50cyBhcmUgY29uc2lzdGVudCBhY3Jvc3MgTU9BcwoKYGBge3J9CiMgbG9vcCBvdmVyIG1vYSdzCmZvciAobW9hX2kgaW4gbW9hcykgewogIAogIG1vYV9jdXJyZW50IDwtIG1ldGFkYXRhICU+JSBmaWx0ZXIobW9hID09IG1vYV9pKQogIAogICMgbnVtYmVyIG9mIGNvbXBvdW5kcyBpbiB0aGlzIE1PQQogIG5fY29tcG91bmRzIDwtIG5yb3cobW9hX2N1cnJlbnQpCiAgCiAgIyBudW1iZXIgb2YgdHJhaW4sIHRlc3QgdG8gYmUgcGlja2VkCiAgCiAgaWYgKG5fY29tcG91bmRzID09IDEpIHsKICAgICMgZm9yIHNpbmdsZXRvbiBNT0FzLCBhc3NpZ24gdG8gdHJhaW4KICAgIAogICAgbl90ZXN0IDwtIDAKICAgIG5fdHJhaW4gPC0gMQogICAgCiAgfSBlbHNlIHsKICAgIG5fdGVzdCA8LSBjZWlsaW5nKG5fY29tcG91bmRzICogMC4yKQogICAgbl90cmFpbiA8LSBmbG9vcihuX2NvbXBvdW5kcyAqIDAuOCkKICB9CiAgCiAgIyBudW1iZXIgdGhhdCBhcmUgYWxyZWFkeSBhc3NpZ25lZCBhcyB0cmFpbiBvciB0ZXN0CiAgbl90ZXN0X21hcmtlZCA8LSBzdW0obW9hX2N1cnJlbnQkdGVzdCkKICBuX3RyYWluX21hcmtlZCA8LSBzdW0obW9hX2N1cnJlbnQkdHJhaW4pCiAgCiAgIyBudW1iZXIgdGhhdCBuZWVkIHRvIGJlIGFzc2lnbmVkIGFzIHRyYWluIG9yIHRlc3QKICBuX3RyYWluX25lZWRlZCA8LSBuX3RyYWluIC0gbl90cmFpbl9tYXJrZWQKICBuX3Rlc3RfbmVlZGVkIDwtIG5fdGVzdCAtIG5fdGVzdF9tYXJrZWQKICAKICAjIHNwbGl0IGNvbXBvdW5kcyBpbnRvIHRob3NlIHRoYXQgaGF2ZSBhbHJlYWR5IGJlZW4gcHJldmlvdXNseSBtYXJrZWQgdnMgbm90CiAgbW9hX2N1cnJlbnRfbm90bWFya2VkIDwtIG1vYV9jdXJyZW50ICU+JSBmaWx0ZXIoIW1hcmtlZCkKICBtb2FfY3VycmVudF9tYXJrZWQgICAgPC0gbW9hX2N1cnJlbnQgJT4lIGZpbHRlciggbWFya2VkKQogIAogICMgbnVtYmVyIG9mIGNvbXBvdW5kcyBpbiB0aGlzIE1PQSB0aGF0IGFyZSBub3QgbWFya2VkCiAgbl9jb21wb3VuZHNfbm90bWFya2VkIDwtIG5yb3cobW9hX2N1cnJlbnRfbm90bWFya2VkKQogIAogIG5fY29tcG91bmRzX25lZWRlZCA8LSBuX3Rlc3RfbmVlZGVkICsgbl90cmFpbl9uZWVkZWQKICAKICAjIGlmIHlvdSBoaXQgdGhpcywgd2UgaGF2ZSBhIHByb2JsZW0gYW5kIHdvdWxkIG5lZWQgdG8gcmFuZG9tbHkgc2h1ZmZsZSB0aGUgbW9hIGxpc3QgCiAgIyBhbmQgc3RhcnQgb3ZlcgogIHN0b3BpZm5vdChuX2NvbXBvdW5kc19uZWVkZWQgPT0gbl9jb21wb3VuZHNfbm90bWFya2VkKQogIAogIG5fdGVzdF9uZWVkZWQgPC0gbWF4KG5fdGVzdF9uZWVkZWQsIDApCiAgCiAgbl90cmFpbl9uZWVkZWQgPC0gbWF4KG5fdHJhaW5fbmVlZGVkLCAwKQogIAogIHRyYWluX2ZsYWcgPC0gYyhyZXAoVFJVRSwgbl90cmFpbl9uZWVkZWQpLCByZXAoRkFMU0UsIG5fdGVzdF9uZWVkZWQpKQogIAogIHRyYWluX2ZsYWcgPC0gc2FtcGxlKHRyYWluX2ZsYWcsIG5fY29tcG91bmRzX25lZWRlZCkKICAKICB0ZXN0X2ZsYWcgPC0gCiAgICAhdHJhaW5fZmxhZwogIAogIG1vYV9jdXJyZW50X25vdG1hcmtlZCR0cmFpbiA8LSB0cmFpbl9mbGFnCiAgCiAgbW9hX2N1cnJlbnRfbm90bWFya2VkJHRlc3QgPC0gdGVzdF9mbGFnCiAgCiAgbW9hX2N1cnJlbnRfbm90bWFya2VkJG1hcmtlZCA8LSBUUlVFCiAgCiAgbW9hX2N1cnJlbnQgPC0KICAgIGJpbmRfcm93cygKICAgICAgbW9hX2N1cnJlbnRfbm90bWFya2VkLAogICAgICBtb2FfY3VycmVudF9tYXJrZWQKICAgICkKICAKICBtb2Ffb3RoZXIgPC0gbWV0YWRhdGEgJT4lIGZpbHRlcihtb2EgIT0gbW9hX2kpCiAgCiAgbW9hX290aGVyX21hcmtlZCAgICA8LSBtZXRhZGF0YSAlPiUgZmlsdGVyKChtb2EgIT0gbW9hX2kpICYgIChtYXJrZWQpKQogIAogIG1vYV9vdGhlcl9ub3RtYXJrZWQgPC0gbWV0YWRhdGEgJT4lIGZpbHRlcigobW9hICE9IG1vYV9pKSAmICghbWFya2VkKSkKICAKICBtb2Ffb3RoZXJfbm90bWFya2VkICU8PiUgCiAgICBzZWxlY3QoYnJvYWRfc2FtcGxlLCBtb2EpICU+JSAKICAgIGxlZnRfam9pbigKICAgICAgbW9hX2N1cnJlbnRfbm90bWFya2VkICU+JSBzZWxlY3QoLW1vYSksIAogICAgICBieSA9ICJicm9hZF9zYW1wbGUiKSAlPiUKICAgIHJlcGxhY2VfbmEobGlzdCh0cmFpbiA9IEZBTFNFLCB0ZXN0ID0gRkFMU0UsIG1hcmtlZCA9IEZBTFNFKSkKICAKICBtZXRhZGF0YSA8LSAKICAgIGJpbmRfcm93cygKICAgICAgbW9hX2N1cnJlbnQsCiAgICAgIG1vYV9vdGhlcl9tYXJrZWQsCiAgICAgIG1vYV9vdGhlcl9ub3RtYXJrZWQKICAgICkKICAKICAjbWVzc2FnZShzcHJpbnRmKCIlczolZCIsIG1vYV9pLCBucm93KG1ldGFkYXRhKSkpCn0KYGBgCgpgYGB7cn0KbWV0YWRhdGEgJT4lIAogIGdyb3VwX2J5KG1vYSkgJT4lCiAgc3VtbWFyaXNlKG5fdHJhaW4gPSBzdW0odHJhaW4pLCBuX3Rlc3QgPSBzdW0odGVzdCkpICU+JSAKICBtdXRhdGUobiA9IG5fdHJhaW4gKyBuX3Rlc3QpICU+JQogIGFycmFuZ2UoZGVzYyhuKSkKYGBgCgpNYWtlIHN1cmUgdGhhdCBhIGNvbXBvdW5kcyBpcyBlaXRoZXIgaW4gdHJhaW4gb3IgaW4gdGVzdCAod2UgbmVlZCB0byBjaGVjayB0aGlzIGJlY2F1c2UgYSBjb21wb3VuZCBjYW4gb2NjdXIgaW4gbW9yZSB0aGFuIG9uZSBNT0EpCgpgYGB7cn0KY29uc2lzdGVudCA8LSBmdW5jdGlvbih4KSBhbGwoeCkgfHwgYWxsKCF4KQoKIyBzaG91bGQgaGF2ZSB6ZXJvIHJvd3MgaWYgY29uc2lzdGVudAptZXRhZGF0YSAlPiUgCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBzdW1tYXJpc2UodHJhaW4gPSBjb25zaXN0ZW50KHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IGNvbnNpc3RlbnQodGVzdCkpICU+JQogIGZpbHRlcighdHJhaW4gfCAhdGVzdCkKYGBgCmBgYHtyfQptZXRhZGF0YSAlPiUgCiAgZ3JvdXBfYnkoYnJvYWRfc2FtcGxlKSAlPiUKICBzdW1tYXJpc2UodHJhaW4gPSBhbGwodHJhaW4pLAogICAgICAgICAgICB0ZXN0ID0gYWxsKHRlc3QpKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgc3VtbWFyaXplKHRyYWluID0gc3VtKHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IHN1bSh0ZXN0KSkgJT4lCiAgbXV0YXRlKHRvdGFsID0gdHJhaW4gKyB0ZXN0KQoKbWV0YWRhdGEgJT4lIAogIGdyb3VwX2J5KG1vYSkgJT4lCiAgc3VtbWFyaXplKHRyYWluID0gc3VtKHRyYWluKSwKICAgICAgICAgICAgdGVzdCA9IHN1bSh0ZXN0KSkgJT4lCiAgbXV0YXRlKHRlc3RfcmF0aW8gPSB0ZXN0Lyh0cmFpbiArIHRlc3QpKSAlPiUgCiAgZmlsdGVyKHRlc3RfcmF0aW8gPT0gMCAmIHRyYWluID4gMSkgJT4lCiAgYXJyYW5nZShkZXNjKHRyYWluKSkKCm1ldGFkYXRhICU+JQogIHVuZ3JvdXAoKSAlPiUKICBkaXN0aW5jdChicm9hZF9zYW1wbGUpICU+JQogIGNvdW50KCkKYGBgCgpgYGB7cn0KbWV0YWRhdGEgJT4lIHdyaXRlX2Nzdigifi9EZXNrdG9wL2xpbmNzX3NwbGl0LmNzdiIpCmBgYAo=