output: 
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: lumen
output: github_document

1 Boilerplate

If running interactively in RStudio,

  • change output in the header of this markdown to html_notebook and
  • change to eval=TRUE below

When knitting for pushing to GitHub,

  • change output in the header of this markdown to github_document and
  • change to eval=FALSE below
show_table <- knitr::kable
knitr::opts_chunk$set(fig.width = 6, fig.asp = 2/3)

2 Libraries

library(magrittr)
library(tidyverse)
library(arrow)
source("../../R/cytominer-eval.R")

3 Load data

# read these features
# https://github.com/broadinstitute/grit-benchmark/blob/31d9812e0c267d3535c85a72ef6bd0104d1a2130/1.calculate-metrics/cell-health/0.calculate-grit.ipynb
# (See chunk 4)

profiles <- read_csv("data/cell_health_merged_feature_select.csv.gz")

metadata <- get_annotation(profiles)
if (file.exists("data/sim.parquet")) {
  message("Loading existing similarity file...")
  sim_df <- arrow::read_parquet("data/sim.parquet")

} else {
  sim_df <- sim_calculate(profiles)
  sim_df %>% arrow::write_parquet("data/sim.parquet")
}
Loading existing similarity file...

4 Compute similarity sets


# ---- 0. Filter out some rows ----

drop_group <-
  data.frame(Metadata_gene_name = "EMPTY")

# ---- 1. Similarity to reference ----

# Fetch similarities between
# a. all rows (except, optionally those containing `reference`)
# and
# b. all rows containing `reference`
# Do so only for those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_ref`
    
reference <-
  data.frame(Metadata_gene_name = c("Chr2", "Luc", "LacZ"))

# all_same_cols_ref <-
#   c("Metadata_cell_line")

all_same_cols_ref <-
  c("Metadata_cell_line",
    "Metadata_Plate")

# ---- 2. Similarity to replicates (no references) ----

# Fetch similarities between
# a. all rows except `reference` rows
# and
# b. all rows except `reference` rows (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 3. Similarity to replicates (only references) ----

# Fetch similarities between
# a. all rows containing `reference`
# to
# b. all rows containing `reference` (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep_ref`.
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep_ref <-
  c(
    "Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_Plate"
  )

# all_same_cols_rep_ref <-
#   c(
#     "Metadata_cell_line",
#     "Metadata_gene_name",
#     "Metadata_pert_name"
#   )
# ---- 4. Similarity to non-replicates ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# to
# b. all rows except `reference` rows
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_non_rep`
# - have *different* values in *all* columns `all_different_cols_non_rep`
# - have *different* values in *at least one* column of `any_different_cols_non_rep`
#
# Keep, both, (a, b) and (b, a)

any_different_cols_non_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

all_same_cols_non_rep <-
  c("Metadata_cell_line", 
    "Metadata_Plate")

all_different_cols_non_rep <-
  c("Metadata_gene_name")

# ---- 5. Similarity to group ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# and
# b. all rows (except, optionally, `reference` rows)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_group`
# - have *different* values in *at least one* column of `any_different_cols_group`
#
# Keep, both, (a, b) and (b, a)

all_same_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name")

any_different_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 6. Combine 1-5 and annotate the similarity matrix ----

annotation_cols <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

munged_sim <-
  sim_munge(
    sim_df,
    metadata,
    reference,
    all_same_cols_rep = all_same_cols_rep,
    all_same_cols_rep_ref = all_same_cols_rep_ref,
    all_same_cols_ref = all_same_cols_ref,
    any_different_cols_non_rep = any_different_cols_non_rep,
    all_same_cols_non_rep = all_same_cols_non_rep,
    all_different_cols_non_rep = all_different_cols_non_rep,
    any_different_cols_group = any_different_cols_group,
    all_same_cols_group = all_same_cols_group,
    annotation_cols = annotation_cols,
    drop_group = drop_group
  )

5 Compute metrics

norm_non_rep_metrics <- 
  sim_metrics(munged_sim, "non_rep", calculate_grouped = TRUE)

norm_ref_metrics <- 
  sim_metrics(munged_sim, "ref", calculate_grouped = TRUE)

per_row_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_row"]],
    norm_ref_metrics[["per_row"]],
    by = c("id1", all_same_cols_rep, "sim_mean_agg")
  )

per_set_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set"]],
    norm_ref_metrics[["per_set"]],
    by = c(all_same_cols_rep, "sim_mean_agg_mean_agg", "sim_mean_agg_median_agg")
  )

per_set_group_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set_group"]],
    norm_ref_metrics[["per_set_group"]],
    by = c(all_same_cols_rep, "sim_mean_agg", "sim_median_agg")
  )
per_set_all_metrics <-
  per_set_group_metrics %>%
  rename_with( ~ paste0(., "_group"), starts_with("sim")) %>%
  inner_join(
    per_set_metrics %>%
      rename_with( ~ paste0(., "_indiv"), starts_with("sim")),
    by = c(all_same_cols_rep)
  ) 

grit <-
  per_set_all_metrics %>%
  mutate(
    grit_group = sim_scaled_mean_agg_ref_group,
    grit_indiv = sim_scaled_mean_agg_ref_mean_agg_indiv
  ) %>%
  select(all_of(all_same_cols_rep), grit_group, grit_indiv)

6 Plot distributions

target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-1")

sister_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-2")

cutting_controls <- data.frame(
  Metadata_cell_line = c("A549", "A549", "A549"),
  Metadata_gene_name = c("Chr2", "Luc", "LacZ")
)

metadata <- 
  profiles %>%
  select(matches("Metadata")) %>% 
  select(-Metadata_WellCol, -Metadata_WellRow)

features <- function(df) t(as.matrix(df %>% select(-matches("^Metadata_"))))

plate <- function(df) df$Metadata_Plate

p_target_guide <- profiles %>% inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
p_sister_guide <- profiles %>% inner_join(sister_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
p_cutting_controls <- profiles %>% inner_join(cutting_controls)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name")
p_target_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
p_sister_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
p_cutting_controls %>% group_by(across(all_of(all_same_cols_ref))) %>% tally()
all_sim_counts <-
  munged_sim %>%
  group_by(across(all_of(c("id1", all_same_cols_rep, "type")))) %>%
  tally() %>%
  arrange(across(all_of(all_same_cols_rep))) %>%
  pivot_wider(names_from = "type",
              names_prefix = "n_",
              values_from = n,
              values_fill = 0)
all_sim_counts %>%
  inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
target_distributions <-
  munged_sim %>%
  inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
target_distributions %>%
  group_by(type) %>%
  tally()
target_distributions %>%
  group_by(type) %>%
  tally() %>%
  ggplot(aes(type, n)) + geom_col()

grit %>%
  inner_join(target_guide, by = colnames(target_guide))

target_distributions %>%
  filter(type %in% c("ref", "rep_group", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

target_distributions %>%
  filter(type %in% c("ref", "rep", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  


grit %>%
  inner_join(target_guide, by = colnames(target_guide)) 
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-2")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  


grit %>%
  inner_join(target_guide, by = colnames(target_guide))
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "STAT3",
  Metadata_pert_name = "STAT3-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  


grit %>%
  inner_join(target_guide, by = colnames(target_guide))
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "MYC",
  Metadata_pert_name = "MYC-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  


grit %>%
  inner_join(target_guide, by = colnames(target_guide))
---
title: "Explain Grit"
author: "Shantanu Singh"
output: 
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: lumen
---

```
output: 
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: lumen
```

```
output: github_document
```

# Boilerplate 

```{r echo=FALSE}
show_table <- print
```

If running interactively in RStudio, 

- change `output` in the header of this markdown to `html_notebook` and
- change to `eval=TRUE` below

When knitting for pushing to GitHub,

- change `output` in the header of this markdown to `github_document` and
- change to `eval=FALSE` below

```{r eval=FALSE}
show_table <- knitr::kable
knitr::opts_chunk$set(fig.width = 6, fig.asp = 2/3)
```

# Libraries

```{r message=FALSE}
library(magrittr)
library(tidyverse)
library(arrow)
```


```{r}
source("../../R/cytominer-eval.R")
```

# Load data

```{r message=FALSE}
# read these features
# https://github.com/broadinstitute/grit-benchmark/blob/31d9812e0c267d3535c85a72ef6bd0104d1a2130/1.calculate-metrics/cell-health/0.calculate-grit.ipynb
# (See chunk 4)

profiles <- read_csv("data/cell_health_merged_feature_select.csv.gz")

metadata <- get_annotation(profiles)
```


```{r}
if (file.exists("data/sim.parquet")) {
  message("Loading existing similarity file...")
  sim_df <- arrow::read_parquet("data/sim.parquet")

} else {
  sim_df <- sim_calculate(profiles)
  sim_df %>% arrow::write_parquet("data/sim.parquet")
}
```

# Compute similarity sets

```{r}

# ---- 0. Filter out some rows ----

drop_group <-
  data.frame(Metadata_gene_name = "EMPTY")

# ---- 1. Similarity to reference ----

# Fetch similarities between
# a. all rows (except, optionally those containing `reference`)
# and
# b. all rows containing `reference`
# Do so only for those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_ref`
    
reference <-
  data.frame(Metadata_gene_name = c("Chr2", "Luc", "LacZ"))

# all_same_cols_ref <-
#   c("Metadata_cell_line")

all_same_cols_ref <-
  c("Metadata_cell_line",
    "Metadata_Plate")

# ---- 2. Similarity to replicates (no references) ----

# Fetch similarities between
# a. all rows except `reference` rows
# and
# b. all rows except `reference` rows (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 3. Similarity to replicates (only references) ----

# Fetch similarities between
# a. all rows containing `reference`
# to
# b. all rows containing `reference` (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep_ref`.
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep_ref <-
  c(
    "Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_Plate"
  )

# all_same_cols_rep_ref <-
#   c(
#     "Metadata_cell_line",
#     "Metadata_gene_name",
#     "Metadata_pert_name"
#   )
# ---- 4. Similarity to non-replicates ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# to
# b. all rows except `reference` rows
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_non_rep`
# - have *different* values in *all* columns `all_different_cols_non_rep`
# - have *different* values in *at least one* column of `any_different_cols_non_rep`
#
# Keep, both, (a, b) and (b, a)

any_different_cols_non_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

all_same_cols_non_rep <-
  c("Metadata_cell_line", 
    "Metadata_Plate")

all_different_cols_non_rep <-
  c("Metadata_gene_name")

# ---- 5. Similarity to group ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# and
# b. all rows (except, optionally, `reference` rows)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_group`
# - have *different* values in *at least one* column of `any_different_cols_group`
#
# Keep, both, (a, b) and (b, a)

all_same_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name")

any_different_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 6. Combine 1-5 and annotate the similarity matrix ----

annotation_cols <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

munged_sim <-
  sim_munge(
    sim_df,
    metadata,
    reference,
    all_same_cols_rep = all_same_cols_rep,
    all_same_cols_rep_ref = all_same_cols_rep_ref,
    all_same_cols_ref = all_same_cols_ref,
    any_different_cols_non_rep = any_different_cols_non_rep,
    all_same_cols_non_rep = all_same_cols_non_rep,
    all_different_cols_non_rep = all_different_cols_non_rep,
    any_different_cols_group = any_different_cols_group,
    all_same_cols_group = all_same_cols_group,
    annotation_cols = annotation_cols,
    drop_group = drop_group
  )

```

# Compute metrics

```{r}
norm_non_rep_metrics <- 
  sim_metrics(munged_sim, "non_rep", calculate_grouped = TRUE)

norm_ref_metrics <- 
  sim_metrics(munged_sim, "ref", calculate_grouped = TRUE)

per_row_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_row"]],
    norm_ref_metrics[["per_row"]],
    by = c("id1", all_same_cols_rep, "sim_mean_agg")
  )

per_set_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set"]],
    norm_ref_metrics[["per_set"]],
    by = c(all_same_cols_rep, "sim_mean_agg_mean_agg", "sim_mean_agg_median_agg")
  )

per_set_group_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set_group"]],
    norm_ref_metrics[["per_set_group"]],
    by = c(all_same_cols_rep, "sim_mean_agg", "sim_median_agg")
  )
```


```{r}
per_set_all_metrics <-
  per_set_group_metrics %>%
  rename_with( ~ paste0(., "_group"), starts_with("sim")) %>%
  inner_join(
    per_set_metrics %>%
      rename_with( ~ paste0(., "_indiv"), starts_with("sim")),
    by = c(all_same_cols_rep)
  ) 

grit <-
  per_set_all_metrics %>%
  mutate(
    grit_group = sim_scaled_mean_agg_ref_group,
    grit_indiv = sim_scaled_mean_agg_ref_mean_agg_indiv
  ) %>%
  select(all_of(all_same_cols_rep), grit_group, grit_indiv)
```

# Plot distributions

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-1")

sister_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-2")

cutting_controls <- data.frame(
  Metadata_cell_line = c("A549", "A549", "A549"),
  Metadata_gene_name = c("Chr2", "Luc", "LacZ")
)

metadata <- 
  profiles %>%
  select(matches("Metadata")) %>% 
  select(-Metadata_WellCol, -Metadata_WellRow)

features <- function(df) t(as.matrix(df %>% select(-matches("^Metadata_"))))

plate <- function(df) df$Metadata_Plate

p_target_guide <- profiles %>% inner_join(target_guide)

p_sister_guide <- profiles %>% inner_join(sister_guide)

p_cutting_controls <- profiles %>% inner_join(cutting_controls)
```

```{r}
p_target_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
```


```{r}
p_sister_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
```


```{r}
p_cutting_controls %>% group_by(across(all_of(all_same_cols_ref))) %>% tally()
```


```{r}
all_sim_counts <-
  munged_sim %>%
  group_by(across(all_of(c("id1", all_same_cols_rep, "type")))) %>%
  tally() %>%
  arrange(across(all_of(all_same_cols_rep))) %>%
  pivot_wider(names_from = "type",
              names_prefix = "n_",
              values_from = n,
              values_fill = 0)
```

```{r}
all_sim_counts %>%
  inner_join(target_guide)
```
```{r}
target_distributions <-
  munged_sim %>%
  inner_join(target_guide)

target_distributions %>%
  group_by(type) %>%
  tally()
```

```{r}
target_distributions %>%
  group_by(type) %>%
  tally() %>%
  ggplot(aes(type, n)) + geom_col()
```

```{r}
grit %>%
  inner_join(target_guide, by = colnames(target_guide))

target_distributions %>%
  filter(type %in% c("ref", "rep_group", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)
```


```{r}
target_distributions %>%
  filter(type %in% c("ref", "rep", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)
```

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide)) 
```
```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-2")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "STAT3",
  Metadata_pert_name = "STAT3-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```
```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "MYC",
  Metadata_pert_name = "MYC-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```

