library(tidyverse)
library(magrittr)

I created *_metadata.csv by selecting just the Metadata columns

# Read the four metadata CSVs (two datasets x two splits). Each table gets two
# constant columns: Metadata_group records the split and Metadata_dataset
# records the source dataset.
bbbc022_test <- mutate(
  read_csv("../../metadata_day2/bbbc022_test_metadata.csv"),
  Metadata_group = "test",
  Metadata_dataset = "bbbc022"
)
bbbc022_train <- mutate(
  read_csv("../../metadata_day2/bbbc022_train_metadata.csv"),
  Metadata_group = "train",
  Metadata_dataset = "bbbc022"
)
bbbc036_test <- mutate(
  read_csv("../../metadata_day2/bbbc036_test_metadata.csv"),
  Metadata_group = "test",
  Metadata_dataset = "bbbc036"
)
bbbc036_train <- mutate(
  read_csv("../../metadata_day2/bbbc036_train_metadata.csv"),
  Metadata_group = "train",
  Metadata_dataset = "bbbc036"
)
# Stack the four tagged tables row-wise into a single combined table.
dataset <-
  list(bbbc022_test, bbbc022_train, bbbc036_test, bbbc036_train) %>%
  bind_rows()
# Keep only the columns of interest, normalise two of them, and save the
# result as a CSV in the working directory.
dataset %>%
  select(
    Metadata_Plate,
    Metadata_Well,
    Metadata_Plate_Map_Name,
    Metadata_pert_id,
    Metadata_broad_sample_type,
    Metadata_group,
    Metadata_dataset
  ) %>%
  mutate(
    # Upper-case the well identifiers.
    Metadata_Well = toupper(Metadata_Well),
    # Keep only the first 10 characters of the platemap name.
    Metadata_Plate_Map_Name = str_sub(Metadata_Plate_Map_Name, 1, 10)
  ) %>%
  write_csv("compound_dataset_metadata.csv")
# Replace the in-memory table with the trimmed version stored in the CSV.
dataset <- read_csv(file = "compound_dataset_metadata.csv")
Parsed with column specification:
cols(
  Metadata_Plate = col_integer(),
  Metadata_Well = col_character(),
  Metadata_Plate_Map_Name = col_character(),
  Metadata_pert_id = col_character(),
  Metadata_broad_sample_type = col_character(),
  Metadata_group = col_character(),
  Metadata_dataset = col_character()
)

How many wells?

# Total number of rows in the metadata table (one-row tibble with column `n`).
count(dataset)

How many wells, grouped by plate?

# Number of rows for each (dataset, plate) combination.
# count() with explicit columns yields the same rows as group_by() + count()
# but returns an ungrouped tibble, so no grouping leaks into later steps.
dataset %>%
  count(Metadata_dataset, Metadata_Plate)

How many replicate plates per platemap, split by dataset?

# Number of distinct plates for each (dataset, platemap) combination,
# ordered by dataset and then by ascending plate count.
# distinct() first reduces the table to one row per (dataset, platemap, plate);
# count() with explicit columns then tallies those rows and returns an
# ungrouped tibble (unlike group_by() + count(), which stays grouped).
dataset %>%
  distinct(Metadata_dataset, Metadata_Plate_Map_Name, Metadata_Plate) %>%
  count(Metadata_dataset, Metadata_Plate_Map_Name) %>%
  arrange(Metadata_dataset, n)

How are the platemaps distributed across the two datasets? As seen below, all platemaps except H-BIOA-001 and H-BIOA-007 are in both datasets.

# Number of distinct datasets in which each platemap appears, smallest first.
# distinct() leaves one row per (dataset, platemap) pair, so `n` is the number
# of datasets containing that platemap. Using count() with the column directly
# returns an ungrouped tibble instead of leaving the result grouped.
dataset %>%
  distinct(Metadata_dataset, Metadata_Plate_Map_Name) %>%
  count(Metadata_Plate_Map_Name) %>%
  arrange(n)

How are the train and test sets split? As seen below, one platemap is held out for testing.

# For each (split, platemap) pair, the number of distinct datasets it occurs
# in, ordered by split.
# distinct() leaves one row per (split, dataset, platemap) triple; count()
# with explicit columns tallies them and returns an ungrouped tibble rather
# than the grouped result that group_by() + count() would leave behind.
dataset %>%
  distinct(Metadata_group, Metadata_dataset, Metadata_Plate_Map_Name) %>%
  count(Metadata_group, Metadata_Plate_Map_Name) %>%
  arrange(Metadata_group)
LS0tCnRpdGxlOiAiQ29tcG91bmQgRGF0YXNldCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkobWFncml0dHIpCmBgYAoKSSBjcmVhdGVkICpfbWV0YWRhdGEuY3N2IGJ5IHNlbGVjdGluZyBqdXN0IHRoZSBNZXRhZGF0YSBjb2x1bW5zCgpgYGB7ciBtZXNzYWdlPUZBTFNFfQpiYmJjMDIyX3Rlc3QgIDwtIHJlYWRfY3N2KCIuLi8uLi9tZXRhZGF0YV9kYXkyL2JiYmMwMjJfdGVzdF9tZXRhZGF0YS5jc3YiKSAgJT4lIG11dGF0ZShNZXRhZGF0YV9ncm91cCA9ICJ0ZXN0IiwgTWV0YWRhdGFfZGF0YXNldCA9ICJiYmJjMDIyIikKYmJiYzAyMl90cmFpbiA8LSByZWFkX2NzdigiLi4vLi4vbWV0YWRhdGFfZGF5Mi9iYmJjMDIyX3RyYWluX21ldGFkYXRhLmNzdiIpICU+JSBtdXRhdGUoTWV0YWRhdGFfZ3JvdXAgPSAidHJhaW4iLCBNZXRhZGF0YV9kYXRhc2V0ID0gImJiYmMwMjIiKQpiYmJjMDM2X3Rlc3QgIDwtIHJlYWRfY3N2KCIuLi8uLi9tZXRhZGF0YV9kYXkyL2JiYmMwMzZfdGVzdF9tZXRhZGF0YS5jc3YiKSAgJT4lIG11dGF0ZShNZXRhZGF0YV9ncm91cCA9ICJ0ZXN0IiwgTWV0YWRhdGFfZGF0YXNldCA9ICJiYmJjMDM2IikKYmJiYzAzNl90cmFpbiA8LSByZWFkX2NzdigiLi4vLi4vbWV0YWRhdGFfZGF5Mi9iYmJjMDM2X3RyYWluX21ldGFkYXRhLmNzdiIpICU+JSBtdXRhdGUoTWV0YWRhdGFfZ3JvdXAgPSAidHJhaW4iLCBNZXRhZGF0YV9kYXRhc2V0ID0gImJiYmMwMzYiKQoKZGF0YXNldCA8LSAKICBiaW5kX3Jvd3MoCiAgICBiYmJjMDIyX3Rlc3QsCiAgICBiYmJjMDIyX3RyYWluLAogICAgYmJiYzAzNl90ZXN0LAogICAgYmJiYzAzNl90cmFpbikKCmRhdGFzZXQgJT4lCiAgc2VsZWN0KE1ldGFkYXRhX1BsYXRlLCBNZXRhZGF0YV9XZWxsLCBNZXRhZGF0YV9QbGF0ZV9NYXBfTmFtZSwgTWV0YWRhdGFfcGVydF9pZCwgTWV0YWRhdGFfYnJvYWRfc2FtcGxlX3R5cGUsCiAgICAgICAgIE1ldGFkYXRhX2dyb3VwLCBNZXRhZGF0YV9kYXRhc2V0KSAlPiUKICBtdXRhdGUoTWV0YWRhdGFfV2VsbCA9IHRvdXBwZXIoTWV0YWRhdGFfV2VsbCkpICU+JQogIG11dGF0ZShNZXRhZGF0YV9QbGF0ZV9NYXBfTmFtZSA9IHN0cl9zdWIoTWV0YWRhdGFfUGxhdGVfTWFwX05hbWUsIDEsIDEwKSkgJT4lCiAgd3JpdGVfY3N2KCJjb21wb3VuZF9kYXRhc2V0X21ldGFkYXRhLmNzdiIpCmBgYAoKCmBgYHtyfQpkYXRhc2V0IDwtIHJlYWRfY3N2KCJjb21wb3VuZF9kYXRhc2V0X21ldGFkYXRhLmNzdiIpCgpgYGAKCkhvdyBtYW55IHdlbGxzPwoKYGBge3J9CmRhdGFzZXQgJT4lIGNvdW50KCkKYGBgCkhvdyBtYW55IHdlbGxzLCBncm91cGVkIGJ5IHBsYXRlPwoKYGBge3J9CmRhdGFzZXQgJT4lCiAgZ3JvdXBfYnkoTWV0YWRhdGFfZGF0YXNldCwgTWV0YWRhdGFfUGxhdGUpICU+JQogIGNvdW50KCkKYGBgCgpIb3cgcmVwbGljYXRlIHBsYXRlcyBwZXIgcGxhdGVtYXAsIHNw
bGl0IGJ5IGRhdGFzZXQ/CgpgYGB7cn0KZGF0YXNldCAlPiUKICBkaXN0aW5jdChNZXRhZGF0YV9kYXRhc2V0LCBNZXRhZGF0YV9QbGF0ZV9NYXBfTmFtZSwgTWV0YWRhdGFfUGxhdGUpICU+JQogIGdyb3VwX2J5KE1ldGFkYXRhX2RhdGFzZXQsIE1ldGFkYXRhX1BsYXRlX01hcF9OYW1lKSAlPiUKICBjb3VudCgpICU+JSAKICBhcnJhbmdlKE1ldGFkYXRhX2RhdGFzZXQsIG4pCmBgYAoKSG93IGFyZSB0aGUgcGxhdGVtYXBzIGRpc3RyaWJ1dGVkIGFjcm9zcyB0aGUgdHdvIGRhdGFzZXRzPwpBcyBzZWVuIGJlbG93LCBhbGwgcGxhdGVtYXBzIGV4Y2VwdCBgSC1CSU9BLTAwMWAgYW5kIGBILUJJT0EtMDA3YCBhcmUgYm90aCBkYXRhc2V0cy4KCmBgYHtyfQpkYXRhc2V0ICU+JQogIGRpc3RpbmN0KE1ldGFkYXRhX2RhdGFzZXQsIE1ldGFkYXRhX1BsYXRlX01hcF9OYW1lKSAlPiUKICBncm91cF9ieShNZXRhZGF0YV9QbGF0ZV9NYXBfTmFtZSkgJT4lCiAgY291bnQoKSAlPiUKICBhcnJhbmdlKG4pCmBgYAoKSG93IGFyZSB0aGUgdHJhaW4gYW5kIHRlc3Qgc3BsaXQ/CkFzIHNlZW4gYmVsb3csIG9uZSBwbGF0ZW1hcCBpcyBoZWxkIG91dCBmb3IgdGVzdGluZy4KCmBgYHtyfQpkYXRhc2V0ICU+JQogIGRpc3RpbmN0KE1ldGFkYXRhX2dyb3VwLCBNZXRhZGF0YV9kYXRhc2V0LCBNZXRhZGF0YV9QbGF0ZV9NYXBfTmFtZSkgJT4lCiAgZ3JvdXBfYnkoTWV0YWRhdGFfZ3JvdXAsIE1ldGFkYXRhX1BsYXRlX01hcF9OYW1lKSAlPiUKICBjb3VudCgpICU+JQogIGFycmFuZ2UoTWV0YWRhdGFfZ3JvdXApCmBgYAoK