Load sample information (updated 9/7/2018). Contains physical sample-level information including Broad Institute ID, compound name, QC confirmation, purity, vendor catalog number, vendor name, expected mass, SMILES, InChIKey, and PubChem ID.
# Read the sample-level table (text after "!" is treated as a comment by
# read_tsv; column types are guessed from up to 20000 rows), derive pert_id
# from the first 13 characters of broad_id, drop the listed sample-specific
# columns, and keep only the distinct remaining rows.
repurposing_samples <-
  read_tsv(
    file = "https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_samples_20180907.txt",
    comment = "!",
    guess_max = 20000
  ) %>%
  mutate(pert_id = str_sub(broad_id, start = 1, end = 13)) %>%
  select(
    -c(
      broad_id, purity, deprecated_broad_id, vendor,
      vendor_name, catalog_no, qc_incompatible
    )
  ) %>%
  distinct()
How many pert_ids have more than one entry?
# Count the rows for each pert_id, keep the pert_ids that occur more than
# once, and list the most frequent ones first.
duplicate_pert_ids <-
  repurposing_samples %>%
  count(pert_id) %>%
  filter(n > 1) %>%
  arrange(desc(n))

# Show the duplicated pert_ids with their counts.
duplicate_pert_ids

# Number of pert_ids that have more than one row.
nrow(duplicate_pert_ids)
[1] 16
Do the duplicates differ on any column other than pert_iname? Zero rows means no.
# Check whether rows sharing a duplicated pert_id disagree on anything besides
# pert_iname: drop pert_iname (and the earlier count n), de-duplicate, then
# re-count rows per pert_id. Any pert_id still counted more than once differs
# on some other column; an empty result means they differ only by name.
duplicate_pert_ids %>%
  # Name the join key explicitly so it cannot silently change if the two
  # tables ever come to share another column name (e.g. `n`).
  inner_join(repurposing_samples, by = "pert_id") %>%
  select(-pert_iname, -n) %>%
  distinct() %>%
  group_by(pert_id) %>%
  tally() %>%
  arrange(desc(n)) %>%
  filter(n > 1)
Load drug information (updated 9/7/2018). Contains drug annotation-level information including compound name, clinical phase, mechanism of action, and protein target.
# Read the drug-level annotation table; text after "!" is treated as a
# comment by read_tsv.
repurposing_annotations <-
  read_tsv(
    file = "https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20180907.txt",
    comment = "!"
  )
Make a table of all duplicates
# For every duplicated pert_id, collect the distinct (pert_id, pert_iname)
# pairs from the sample table and attach the drug annotations for each name.
# Both joins use the columns the tables have in common.
duplicate_annotations <-
  duplicate_pert_ids %>%
  inner_join(repurposing_samples) %>%
  distinct(pert_id, pert_iname) %>%
  inner_join(repurposing_annotations)
Joining, by = "pert_id"
Joining, by = "pert_iname"
# Display the annotation rows for the duplicated pert_ids.
duplicate_annotations
Merge the annotations for each pert_id
# Merge the values of one column into a single "|"-delimited string.
#
# The values are pasted together with "|", split again on "|" (so values that
# already contain several "|"-separated entries are broken apart), sorted,
# de-duplicated, stripped of the token "NA" (which is what missing values
# become when pasted), and finally re-joined with "|".
#
# column: a vector of values to merge.
# Returns a single character string.
merge_col <- function(column) {
  joined <- paste(column, collapse = "|")
  tokens <- unlist(str_split(joined, "\\|"))
  distinct_tokens <- unique(sort(tokens))
  kept_tokens <- distinct_tokens[distinct_tokens != "NA"]
  paste(kept_tokens, collapse = "|")
}
# Collapse the annotations to one row per pert_id by applying merge_col to
# every non-grouping column. The function is passed directly because funs()
# is deprecated as of dplyr 0.8.0; output column names are unchanged.
duplicate_annotations %>%
  group_by(pert_id) %>%
  summarise_all(merge_col)
LS0tCnRpdGxlOiAiQW5hbHl6aW5nIGFubm90YXRpb25zIGluIHRoZSBCcm9hZCBEcnVnIFJlcHVycG9zaW5nIGNvbGxlY3Rpb24iCmF1dGhvcjogIlNoYW50YW51IFNpbmdoIgpkYXRlOiAiSnVseSAyMDE5IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgoKTG9hZCBzYW1wbGUgaW5mb3JtYXRpb24gKHVwZGF0ZWQgOS83LzIwMTgpLgpDb250YWlucyBwaHlzaWNhbCBzYW1wbGUtbGV2ZWwgaW5mb3JtYXRpb24gaW5jbHVkaW5nIEJyb2FkIEluc3RpdHV0ZSBJRCwgY29tcG91bmQgbmFtZSwgUUMgY29uZmlybWF0aW9uLCBwdXJpdHksIHZlbmRvciBjYXRhbG9nIG51bWJlciwgdmVuZG9yIG5hbWUsIGV4cGVjdGVkIG1hc3MsIFNNSUxFUywgaW5DaGlLZXksIGFuZCBQdWJjaGVtIElELgoKYGBge3IgbWVzc2FnZT1GQUxTRX0KcmVwdXJwb3Npbmdfc2FtcGxlcyA8LSAKICByZWFkX3RzdigiaHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2RhdGEuY2x1ZS5pby9yZXB1cnBvc2luZy9kb3dubG9hZHMvcmVwdXJwb3Npbmdfc2FtcGxlc18yMDE4MDkwNy50eHQiLCBjb21tZW50ID0gIiEiLCBndWVzc19tYXggPSAyMDAwMCkgJT4lCiAgbXV0YXRlKHBlcnRfaWQgPSBzdHJfc3ViKGJyb2FkX2lkLCAxLCAxMykpICU+JQogIHNlbGVjdCgtYnJvYWRfaWQsIC1wdXJpdHksIC1kZXByZWNhdGVkX2Jyb2FkX2lkLCAtdmVuZG9yLCAtdmVuZG9yX25hbWUsIC1jYXRhbG9nX25vLCAtcWNfaW5jb21wYXRpYmxlKSAlPiUKICBkaXN0aW5jdCgpCmBgYAoKSG93IG1hbnkgZW50cmllcyB3aXRoIGR1cGxpY2F0ZSBgcGVydF9pZGBzPwoKYGBge3IgbWVzc2FnZT1GQUxTRX0KZHVwbGljYXRlX3BlcnRfaWRzIDwtIAogIHJlcHVycG9zaW5nX3NhbXBsZXMgJT4lCiAgZ3JvdXBfYnkocGVydF9pZCkgJT4lCiAgdGFsbHkoKSAlPiUKICBhcnJhbmdlKGRlc2MobikpICU+JQogIGZpbHRlcihuID4gMSkgCgpkdXBsaWNhdGVfcGVydF9pZHMKCm5yb3coZHVwbGljYXRlX3BlcnRfaWRzKQpgYGAKCkRvIHRoZSBkdXBsaWNhdGVzIGRpZmZlciBvbiBhbnkgb3RoZXIgcGFyYW1ldGVyIG90aGVyIHRoYW4gYHBlcnRfaW5hbWVgPwpaZXJvIHJvd3MgbWVhbnMgbm8uCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQpkdXBsaWNhdGVfcGVydF9pZHMgJT4lCiAgaW5uZXJfam9pbihyZXB1cnBvc2luZ19zYW1wbGVzKSAlPiUKICBzZWxlY3QoLXBlcnRfaW5hbWUsIC1uKSAlPiUKICBkaXN0aW5jdCgpICU+JQogIGdyb3VwX2J5KHBlcnRfaWQpICU+JQogIHRhbGx5KCkgJT4lCiAgYXJyYW5nZShkZXNjKG4pKSAlPiUKICBmaWx0ZXIobiA+IDEpCmBgYAoKTG9hZCBkcnVnIGluZm9ybWF0aW9uICh1cGRhdGVkIDkvNy8yMDE4KS4KQ29udGFpbnMgZHJ1ZyBhbm5vdGF0aW9uLWxldmVsIGluZm9ybWF0aW9uIGluY2x1ZGluZyBjb21wb3VuZCBuYW1lLCBjbGluaWNhbCBwaGFzZSwgbWVjaGFuaXNtIG9mIGFjdGlvbiwgYW5kIHByb3RlaW4gdGFyZ2V0LiAgCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQpyZXB1
cnBvc2luZ19hbm5vdGF0aW9ucyA8LSAKICByZWFkX3RzdigiaHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2RhdGEuY2x1ZS5pby9yZXB1cnBvc2luZy9kb3dubG9hZHMvcmVwdXJwb3NpbmdfZHJ1Z3NfMjAxODA5MDcudHh0IiwgY29tbWVudCA9ICIhIikKYGBgCgpNYWtlIGEgdGFibGUgb2YgYWxsIGR1cGxpY2F0ZXMKCmBgYHtyfQpkdXBsaWNhdGVfYW5ub3RhdGlvbnMgPC0gCiAgZHVwbGljYXRlX3BlcnRfaWRzICU+JQogIGlubmVyX2pvaW4ocmVwdXJwb3Npbmdfc2FtcGxlcykgJT4lCiAgc2VsZWN0KHBlcnRfaWQsIHBlcnRfaW5hbWUpICU+JQogIGRpc3RpbmN0KCkgJT4lCiAgaW5uZXJfam9pbihyZXB1cnBvc2luZ19hbm5vdGF0aW9ucykKCmR1cGxpY2F0ZV9hbm5vdGF0aW9ucwpgYGAKCk1lcmdlIHRoZSBhbm5vdGF0aW9ucyBmb3IgZWFjaCBgcGVydF9pZGAgCgpgYGB7cn0KbWVyZ2VfY29sIDwtIGZ1bmN0aW9uKGNvbHVtbikgewogIHBhc3RlKGNvbHVtbiwgY29sbGFwc2UgPSAifCIpICU+JQogICAgc3RyX3NwbGl0KCJcXHwiKSAlPiUKICAgIHVubGlzdCAlPiUKICAgIHNvcnQgJT4lCiAgICB1bmlxdWUgJT4lCiAgICAuWy4gIT0gIk5BIl0gJT4lCiAgICBwYXN0ZShjb2xsYXBzZSA9ICJ8IikKfQoKZHVwbGljYXRlX2Fubm90YXRpb25zICU+JQogIGdyb3VwX2J5KHBlcnRfaWQpICU+JQogIHN1bW1hcmlzZV9hbGwoZnVucyhtZXJnZV9jb2wpKQpgYGAKCg==