output:
html_notebook:
toc: true
toc_float: true
toc_depth: 3
number_sections: true
theme: lumen
output: github_document
Boilerplate
If running interactively in RStudio,
- change
output in the header of this markdown to html_notebook and
- change to
eval=TRUE below
When knitting for pushing to GitHub,
- change
output in the header of this markdown to github_document and
- change to
eval=FALSE below
show_table <- knitr::kable
knitr::opts_chunk$set(fig.width = 6, fig.asp = 2/3)
Libraries
library(magrittr)
library(tidyverse)
library(arrow)
source("../../R/cytominer-eval.R")
Load data
# read these features
# https://github.com/broadinstitute/grit-benchmark/blob/31d9812e0c267d3535c85a72ef6bd0104d1a2130/1.calculate-metrics/cell-health/0.calculate-grit.ipynb
# (See chunk 4)
profiles <- read_csv("data/cell_health_merged_feature_select.csv.gz")
metadata <- get_annotation(profiles)
if (file.exists("data/sim.parquet")) {
message("Loading existing similarity file...")
sim_df <- arrow::read_parquet("data/sim.parquet")
} else {
sim_df <- sim_calculate(profiles)
sim_df %>% arrow::write_parquet("data/sim.parquet")
}
Loading existing similarity file...
Compute similarity sets
# ---- 0. Filter out some rows ----
drop_group <-
data.frame(Metadata_gene_name = "EMPTY")
# ---- 1. Similarity to reference ----
# Fetch similarities between
# a. all rows (except, optionally those containing `reference`)
# and
# b. all rows containing `reference`
# Do so only for those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_ref`
reference <-
data.frame(Metadata_gene_name = c("Chr2", "Luc", "LacZ"))
# all_same_cols_ref <-
# c("Metadata_cell_line")
all_same_cols_ref <-
c("Metadata_cell_line",
"Metadata_Plate")
# ---- 2. Similarity to replicates (no references) ----
# Fetch similarities between
# a. all rows except `reference` rows
# and
# b. all rows except `reference` rows (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep
#
# Keep, both, (a, b) and (b, a)
all_same_cols_rep <-
c("Metadata_cell_line",
"Metadata_gene_name",
"Metadata_pert_name")
# ---- 3. Similarity to replicates (only references) ----
# Fetch similarities between
# a. all rows containing `reference`
# to
# b. all rows containing `reference` (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep_ref`.
#
# Keep, both, (a, b) and (b, a)
all_same_cols_rep_ref <-
c(
"Metadata_cell_line",
"Metadata_gene_name",
"Metadata_pert_name",
"Metadata_Plate"
)
# all_same_cols_rep_ref <-
# c(
# "Metadata_cell_line",
# "Metadata_gene_name",
# "Metadata_pert_name"
# )
# ---- 4. Similarity to non-replicates ----
# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# to
# b. all rows except `reference` rows
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_non_rep`
# - have *different* values in *all* columns `all_different_cols_non_rep`
# - have *different* values in *at least one* column of `any_different_cols_non_rep`
#
# Keep, both, (a, b) and (b, a)
any_different_cols_non_rep <-
c("Metadata_cell_line",
"Metadata_gene_name",
"Metadata_pert_name")
all_same_cols_non_rep <-
c("Metadata_cell_line",
"Metadata_Plate")
all_different_cols_non_rep <-
c("Metadata_gene_name")
# ---- 5. Similarity to group ----
# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# and
# b. all rows (except, optionally, `reference` rows)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_group`
# - have *different* values in *at least one* column of `any_different_cols_group`
#
# Keep, both, (a, b) and (b, a)
all_same_cols_group <-
c("Metadata_cell_line",
"Metadata_gene_name")
any_different_cols_group <-
c("Metadata_cell_line",
"Metadata_gene_name",
"Metadata_pert_name")
# ---- 6. Combine 1-5 and annotate the similarity matrix ----
annotation_cols <-
c("Metadata_cell_line",
"Metadata_gene_name",
"Metadata_pert_name")
munged_sim <-
sim_munge(
sim_df,
metadata,
reference,
all_same_cols_rep = all_same_cols_rep,
all_same_cols_rep_ref = all_same_cols_rep_ref,
all_same_cols_ref = all_same_cols_ref,
any_different_cols_non_rep = any_different_cols_non_rep,
all_same_cols_non_rep = all_same_cols_non_rep,
all_different_cols_non_rep = all_different_cols_non_rep,
any_different_cols_group = any_different_cols_group,
all_same_cols_group = all_same_cols_group,
annotation_cols = annotation_cols,
drop_group = drop_group
)
Compute metrics
norm_non_rep_metrics <-
sim_metrics(munged_sim, "non_rep", calculate_grouped = TRUE)
norm_ref_metrics <-
sim_metrics(munged_sim, "ref", calculate_grouped = TRUE)
per_row_metrics <-
inner_join(
norm_non_rep_metrics[["per_row"]],
norm_ref_metrics[["per_row"]],
by = c("id1", all_same_cols_rep, "sim_mean_agg")
)
per_set_metrics <-
inner_join(
norm_non_rep_metrics[["per_set"]],
norm_ref_metrics[["per_set"]],
by = c(all_same_cols_rep, "sim_mean_agg_mean_agg", "sim_mean_agg_median_agg")
)
per_set_group_metrics <-
inner_join(
norm_non_rep_metrics[["per_set_group"]],
norm_ref_metrics[["per_set_group"]],
by = c(all_same_cols_rep, "sim_mean_agg", "sim_median_agg")
)
per_set_all_metrics <-
per_set_group_metrics %>%
rename_with( ~ paste0(., "_group"), starts_with("sim")) %>%
inner_join(
per_set_metrics %>%
rename_with( ~ paste0(., "_indiv"), starts_with("sim")),
by = c(all_same_cols_rep)
)
grit <-
per_set_all_metrics %>%
mutate(
grit_group = sim_scaled_mean_agg_ref_group,
grit_indiv = sim_scaled_mean_agg_ref_mean_agg_indiv
) %>%
select(all_of(all_same_cols_rep), grit_group, grit_indiv)
Plot distributions
target_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "ITGAV",
Metadata_pert_name = "ITGAV-1")
sister_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "ITGAV",
Metadata_pert_name = "ITGAV-2")
cutting_controls <- data.frame(
Metadata_cell_line = c("A549", "A549", "A549"),
Metadata_gene_name = c("Chr2", "Luc", "LacZ")
)
metadata <-
profiles %>%
select(matches("Metadata")) %>%
select(-Metadata_WellCol, -Metadata_WellRow)
features <- function(df) t(as.matrix(df %>% select(-matches("^Metadata_"))))
plate <- function(df) df$Metadata_Plate
p_target_guide <- profiles %>% inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
p_sister_guide <- profiles %>% inner_join(sister_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
p_cutting_controls <- profiles %>% inner_join(cutting_controls)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name")
p_target_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
p_sister_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
p_cutting_controls %>% group_by(across(all_of(all_same_cols_ref))) %>% tally()
all_sim_counts <-
munged_sim %>%
group_by(across(all_of(c("id1", all_same_cols_rep, "type")))) %>%
tally() %>%
arrange(across(all_of(all_same_cols_rep))) %>%
pivot_wider(names_from = "type",
names_prefix = "n_",
values_from = n,
values_fill = 0)
all_sim_counts %>%
inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
target_distributions <-
munged_sim %>%
inner_join(target_guide)
Joining, by = c("Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name")
target_distributions %>%
group_by(type) %>%
tally()
target_distributions %>%
group_by(type) %>%
tally() %>%
ggplot(aes(type, n)) + geom_col()

grit %>%
inner_join(target_guide, by = colnames(target_guide))
target_distributions %>%
filter(type %in% c("ref", "rep_group", "non_rep")) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

target_distributions %>%
filter(type %in% c("ref", "rep", "non_rep")) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

target_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "CCND1",
Metadata_pert_name = "CCND1-1")
munged_sim %>%
inner_join(target_guide, by = colnames(target_guide)) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

grit %>%
inner_join(target_guide, by = colnames(target_guide))
target_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "CCND1",
Metadata_pert_name = "CCND1-2")
munged_sim %>%
inner_join(target_guide, by = colnames(target_guide)) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

grit %>%
inner_join(target_guide, by = colnames(target_guide))
target_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "STAT3",
Metadata_pert_name = "STAT3-1")
munged_sim %>%
inner_join(target_guide, by = colnames(target_guide)) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

grit %>%
inner_join(target_guide, by = colnames(target_guide))
target_guide <- data.frame(
Metadata_cell_line = "A549",
Metadata_gene_name = "MYC",
Metadata_pert_name = "MYC-1")
munged_sim %>%
inner_join(target_guide, by = colnames(target_guide)) %>%
ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)

grit %>%
inner_join(target_guide, by = colnames(target_guide))
---
title: "Explain Grit"
author: "Shantanu Singh"
output: 
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: lumen
---

```
output: 
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: lumen
```

```
output: github_document
```

# Boilerplate 

```{r echo=FALSE}
show_table <- print
```

If running interactively in RStudio, 

- change `output` in the header of this markdown to `html_notebook` and
- change to `eval=TRUE` below

When knitting for pushing to GitHub,

- change `output` in the header of this markdown to `github_document` and
- change to `eval=FALSE` below

```{r eval=FALSE}
show_table <- knitr::kable
knitr::opts_chunk$set(fig.width = 6, fig.asp = 2/3)
```

# Libraries

```{r message=FALSE}
library(magrittr)
library(tidyverse)
library(arrow)
```


```{r}
source("../../R/cytominer-eval.R")
```

# Load data

```{r message=FALSE}
# read these features
# https://github.com/broadinstitute/grit-benchmark/blob/31d9812e0c267d3535c85a72ef6bd0104d1a2130/1.calculate-metrics/cell-health/0.calculate-grit.ipynb
# (See chunk 4)

profiles <- read_csv("data/cell_health_merged_feature_select.csv.gz")

metadata <- get_annotation(profiles)
```


```{r}
if (file.exists("data/sim.parquet")) {
  message("Loading existing similarity file...")
  sim_df <- arrow::read_parquet("data/sim.parquet")

} else {
  sim_df <- sim_calculate(profiles)
  sim_df %>% arrow::write_parquet("data/sim.parquet")
}
```

# Compute similarity sets

```{r}

# ---- 0. Filter out some rows ----

drop_group <-
  data.frame(Metadata_gene_name = "EMPTY")

# ---- 1. Similarity to reference ----

# Fetch similarities between
# a. all rows (except, optionally those containing `reference`)
# and
# b. all rows containing `reference`
# Do so only for those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_ref`
    
reference <-
  data.frame(Metadata_gene_name = c("Chr2", "Luc", "LacZ"))

# all_same_cols_ref <-
#   c("Metadata_cell_line")

all_same_cols_ref <-
  c("Metadata_cell_line",
    "Metadata_Plate")

# ---- 2. Similarity to replicates (no references) ----

# Fetch similarities between
# a. all rows except `reference` rows
# and
# b. all rows except `reference` rows (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 3. Similarity to replicates (only references) ----

# Fetch similarities between
# a. all rows containing `reference`
# to
# b. all rows containing `reference` (i.e. to each other)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_rep_ref`.
#
# Keep, both, (a, b) and (b, a)

all_same_cols_rep_ref <-
  c(
    "Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name",
    "Metadata_Plate"
  )

# all_same_cols_rep_ref <-
#   c(
#     "Metadata_cell_line",
#     "Metadata_gene_name",
#     "Metadata_pert_name"
#   )
# ---- 4. Similarity to non-replicates ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# to
# b. all rows except `reference` rows
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_non_rep`
# - have *different* values in *all* columns `all_different_cols_non_rep`
# - have *different* values in *at least one* column of `any_different_cols_non_rep`
#
# Keep, both, (a, b) and (b, a)

any_different_cols_non_rep <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

all_same_cols_non_rep <-
  c("Metadata_cell_line", 
    "Metadata_Plate")

all_different_cols_non_rep <-
  c("Metadata_gene_name")

# ---- 5. Similarity to group ----

# Fetch similarities between
# a. all rows (except, optionally, `reference` rows)
# and
# b. all rows (except, optionally, `reference` rows)
#
# Do so for only those (a, b) pairs that
# - have *same* values in *all* columns of `all_same_cols_group`
# - have *different* values in *at least one* column of `any_different_cols_group`
#
# Keep, both, (a, b) and (b, a)

all_same_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name")

any_different_cols_group <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

# ---- 6. Combine 1-5 and annotate the similarity matrix ----

annotation_cols <-
  c("Metadata_cell_line",
    "Metadata_gene_name",
    "Metadata_pert_name")

munged_sim <-
  sim_munge(
    sim_df,
    metadata,
    reference,
    all_same_cols_rep = all_same_cols_rep,
    all_same_cols_rep_ref = all_same_cols_rep_ref,
    all_same_cols_ref = all_same_cols_ref,
    any_different_cols_non_rep = any_different_cols_non_rep,
    all_same_cols_non_rep = all_same_cols_non_rep,
    all_different_cols_non_rep = all_different_cols_non_rep,
    any_different_cols_group = any_different_cols_group,
    all_same_cols_group = all_same_cols_group,
    annotation_cols = annotation_cols,
    drop_group = drop_group
  )

```

# Compute metrics

```{r}
norm_non_rep_metrics <- 
  sim_metrics(munged_sim, "non_rep", calculate_grouped = TRUE)

norm_ref_metrics <- 
  sim_metrics(munged_sim, "ref", calculate_grouped = TRUE)

per_row_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_row"]],
    norm_ref_metrics[["per_row"]],
    by = c("id1", all_same_cols_rep, "sim_mean_agg")
  )

per_set_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set"]],
    norm_ref_metrics[["per_set"]],
    by = c(all_same_cols_rep, "sim_mean_agg_mean_agg", "sim_mean_agg_median_agg")
  )

per_set_group_metrics <-
  inner_join(
    norm_non_rep_metrics[["per_set_group"]],
    norm_ref_metrics[["per_set_group"]],
    by = c(all_same_cols_rep, "sim_mean_agg", "sim_median_agg")
  )
```


```{r}
per_set_all_metrics <-
  per_set_group_metrics %>%
  rename_with( ~ paste0(., "_group"), starts_with("sim")) %>%
  inner_join(
    per_set_metrics %>%
      rename_with( ~ paste0(., "_indiv"), starts_with("sim")),
    by = c(all_same_cols_rep)
  ) 

grit <-
  per_set_all_metrics %>%
  mutate(
    grit_group = sim_scaled_mean_agg_ref_group,
    grit_indiv = sim_scaled_mean_agg_ref_mean_agg_indiv
  ) %>%
  select(all_of(all_same_cols_rep), grit_group, grit_indiv)
```

# Plot distributions

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-1")

sister_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "ITGAV",
  Metadata_pert_name = "ITGAV-2")

cutting_controls <- data.frame(
  Metadata_cell_line = c("A549", "A549", "A549"),
  Metadata_gene_name = c("Chr2", "Luc", "LacZ")
)

metadata <- 
  profiles %>%
  select(matches("Metadata")) %>% 
  select(-Metadata_WellCol, -Metadata_WellRow)

features <- function(df) t(as.matrix(df %>% select(-matches("^Metadata_"))))

plate <- function(df) df$Metadata_Plate

p_target_guide <- profiles %>% inner_join(target_guide)

p_sister_guide <- profiles %>% inner_join(sister_guide)

p_cutting_controls <- profiles %>% inner_join(cutting_controls)
```

```{r}
p_target_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
```


```{r}
p_sister_guide %>% group_by(across(all_of(unique(c(all_same_cols_rep, all_same_cols_ref))))) %>% tally()
```


```{r}
p_cutting_controls %>% group_by(across(all_of(all_same_cols_ref))) %>% tally()
```


```{r}
all_sim_counts <-
  munged_sim %>%
  group_by(across(all_of(c("id1", all_same_cols_rep, "type")))) %>%
  tally() %>%
  arrange(across(all_of(all_same_cols_rep))) %>%
  pivot_wider(names_from = "type",
              names_prefix = "n_",
              values_from = n,
              values_fill = 0)
```

```{r}
all_sim_counts %>%
  inner_join(target_guide)
```
```{r}
target_distributions <-
  munged_sim %>%
  inner_join(target_guide)

target_distributions %>%
  group_by(type) %>%
  tally()
```

```{r}
target_distributions %>%
  group_by(type) %>%
  tally() %>%
  ggplot(aes(type, n)) + geom_col()
```

```{r}
grit %>%
  inner_join(target_guide, by = colnames(target_guide))

target_distributions %>%
  filter(type %in% c("ref", "rep_group", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)
```


```{r}
target_distributions %>%
  filter(type %in% c("ref", "rep", "non_rep")) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)
```

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide)) 
```
```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "CCND1",
  Metadata_pert_name = "CCND1-2")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```

```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "STAT3",
  Metadata_pert_name = "STAT3-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```
```{r}
target_guide <- data.frame(
  Metadata_cell_line = "A549",
  Metadata_gene_name = "MYC",
  Metadata_pert_name = "MYC-1")

munged_sim %>%
  inner_join(target_guide, by = colnames(target_guide)) %>%
  ggplot(aes(sim)) + geom_histogram(binwidth = .02) + facet_wrap(~type, ncol = 1)  

grit %>%
  inner_join(target_guide, by = colnames(target_guide))
```

