Functions
Utils
Get feature matrix
feats <- function(X, pattern = "Cells_AreaShape_Zernike", ...) {
X %>% dplyr::select(matches(pattern))
}
Statistics
Difference between means
Compute difference between mean of specified group and everything else
mean_diff <-
function(X, label_i, label_col = "Metadata_pert_name", ...) {
A <- X %>% filter(.data[[label_col]] == label_i)
B <- X %>% filter(.data[[label_col]] != label_i)
sqrt(sum((colMeans(feats(
A, ...
)) - colMeans(feats(
B, ...
))) ** 2))
}
Inv of mean pairwise distance
invmpd <-
function(X, label_i, label_col = "Metadata_pert_name", ...) {
A <- X %>% filter(.data[[label_col]] == label_i)
1 / mean(dist(as.matrix(feats(A, ...))))
}
Trace of precision matrix
Compute trace of precision matrix of specified group (indicated by label_i)
trace_precision <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
A <- X %>% filter(.data[[label_col]] == label_i)
sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d
sum(1/(sv**2))
}
Inverse of major axis length
pc1 <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
A <- X %>% filter(.data[[label_col]] == label_i)
sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d
1/sv[1]
}
Precision
beta <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
A <- X %>% filter(.data[[label_col]] == label_i)
1 / sd(as.matrix(feats(A, ...)))
}
Testing
Permute label
permute <- function(X, label_col = "Metadata_pert_name") {
X[[label_col]] <- sample(X[[label_col]])
X
}
Compute null distribution i.e. the sampling distribution of the test statistic under the null hypothesis
generate_null <-
function(X,
label,
test_statistic = mean_diff,
n = 100,
label_col = "Metadata_pert_name",
...) {
test_statistic(permute(X, label_col), label, ...)
times(n) %dopar% test_statistic(permute(X, label_col), label, ...)
}
Run permutation test and plot
run_test <- function(test_statistic, ...) {
null_distribution <- generate_null(test_statistic = test_statistic, ...)
test_statistic_value <- test_statistic(...)
p_value <- sum(null_distribution > test_statistic_value) / length(null_distribution)
p <-
data.frame(x = null_distribution) %>%
ggplot(aes(x)) + geom_histogram(bins = 30) +
geom_vline(xintercept = test_statistic_value, color = "red") +
labs(caption = glue("p-value = {round(p_value, 2)}"))
p
}
Load
Load cell health data
url <-
"https://github.com/broadinstitute/grit-benchmark/raw/main/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"
cellhealth <-
read_csv(
url,
col_types = cols(
.default = col_double(),
Metadata_Plate = col_character(),
Metadata_Well = col_character(),
Metadata_WellRow = col_character(),
Metadata_WellCol = col_character(),
Metadata_cell_line = col_character(),
Metadata_gene_name = col_character(),
Metadata_pert_name = col_character()
)
)
cellhealth <-
cellhealth %>%
filter(Metadata_cell_line == "A549") %>%
filter(!(Metadata_gene_name %in% c("EMPTY", "Chr2")))
cellhealth %>%
count(Metadata_pert_name, name = "n_replicates") %>%
count(n_replicates)
Run
X <- cellhealth
n <- 300
label_col <- "Metadata_pert_name"
pattern <- "Cells_Intensity"
Difference between means
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
run_test(
X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col
) +
xlab(test_statistic_name) +
ggtitle(
glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
)
)

label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
run_test(
X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col
) +
xlab(test_statistic_name) +
ggtitle(
glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
)
)

Inv of mean pairwise distance
label <- "LacZ-3"
test_statistic <- invmpd
test_statistic_name <- "invmpd"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

label <- "CDK4-2"
test_statistic <- invmpd
test_statistic_name <- "invmpd"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

Trace of precision matrix
label <- "LacZ-3"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

label <- "CDK4-2"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

Inverse of major axis length
label <- "LacZ-3"
test_statistic <- pc1
test_statistic_name <- "pc1"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

label <- "CDK4-2"
test_statistic <- pc1
test_statistic_name <- "pc1"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

Precision - single variable
label <- "LacZ-3"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col,
pattern = pattern) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

label <- "CDK4-2"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col,
pattern = pattern) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

Difference between means - single variable
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col,
pattern = pattern) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"
run_test(X = X,
label = label,
test_statistic = test_statistic,
n = n,
label_col = label_col,
pattern = pattern) +
xlab(test_statistic_name) +
ggtitle(glue(
"{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
))

---
title: "Percent replicating through the lens of permutation testing"
output: html_notebook
---

# Setup

```{r}
set.seed(42)
library(tidyverse)
library(foreach)
library(glue)
```


```{r}
doParallel::registerDoParallel(cores = 8)
```

# Functions

## Utils

Get feature matrix

```{r}
feats <- function(X, pattern = "Cells_AreaShape_Zernike", ...) {
  X %>% dplyr::select(matches(pattern))
}
```

## Statistics

### Difference between means

Compute difference between mean of specified group and everything else

```{r}
mean_diff <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    A <- X %>% filter(.data[[label_col]] == label_i)
    B <- X %>% filter(.data[[label_col]] != label_i)
    
    sqrt(sum((colMeans(feats(
      A, ...
    )) - colMeans(feats(
      B, ...
    ))) ** 2))
  }
```

### Median replicate correlation

Compute median replicate correlation of specified group (indicated by `label_i`)

```{r}
mrc <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  corvec <- function(m) {
    cm <- cor(m)
    cm[upper.tri(cm)]
  }
  
  median(corvec(t(as.matrix(feats(
    A, ...
  )))))
}
```

### Inv of mean pairwise distance

```{r}
invmpd <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    
    A <- X %>% filter(.data[[label_col]] == label_i)
    
    1 / mean(dist(as.matrix(feats(A, ...))))
    
  }
```

### Trace of precision matrix

Compute trace of precision matrix of specified group (indicated by `label_i`)

```{r}
trace_precision <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  sum(1/(sv**2))
  
}
```

### Inverse of major axis length

```{r}
pc1 <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  1/sv[1]
  
}
```

### Precision

```{r}
beta <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  1 / sd(as.matrix(feats(A, ...)))

}
```

## Testing

Permute label

```{r}
permute <- function(X, label_col = "Metadata_pert_name") {
  X[[label_col]] <- sample(X[[label_col]])
  X
}
```

Compute null distribution i.e. the sampling distribution of the test statistic under the null hypothesis

```{r}
generate_null <-
  function(X,
           label,
           test_statistic = mean_diff,
           n = 100,
           label_col = "Metadata_pert_name",
           ...) {
    
    test_statistic(permute(X, label_col), label, ...)
    
    times(n) %dopar% test_statistic(permute(X, label_col), label, ...)
    
  }

```

Run permutation test and plot

```{r}
run_test <- function(test_statistic, ...) {

  null_distribution <- generate_null(test_statistic = test_statistic, ...)
  
  test_statistic_value <- test_statistic(...)
  
  p_value <- sum(null_distribution > test_statistic_value) / length(null_distribution)
  
  p <- 
    data.frame(x = null_distribution) %>%
    ggplot(aes(x)) + geom_histogram(bins = 30) + 
    geom_vline(xintercept = test_statistic_value, color = "red") + 
    labs(caption = glue("p-value = {round(p_value, 2)}"))
  
  p
}
```


# Load 

Load cell health data

```{r}
url <-
  "https://github.com/broadinstitute/grit-benchmark/raw/main/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"

cellhealth <-
  read_csv(
    url,
    col_types = cols(
      .default = col_double(),
      Metadata_Plate = col_character(),
      Metadata_Well = col_character(),
      Metadata_WellRow = col_character(),
      Metadata_WellCol = col_character(),
      Metadata_cell_line = col_character(),
      Metadata_gene_name = col_character(),
      Metadata_pert_name = col_character()
    )
  )

cellhealth <-
  cellhealth %>%
  filter(Metadata_cell_line == "A549") %>%
  filter(!(Metadata_gene_name %in% c("EMPTY", "Chr2")))
```


```{r}
cellhealth %>%
  count(Metadata_pert_name, name = "n_replicates") %>% 
  count(n_replicates)
```

# Run

```{r}
X <- cellhealth
n <- 300
label_col <- "Metadata_pert_name"
pattern <- "Cells_Intensity"
```

## Difference between means

```{r}
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )
```


```{r}
label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )
```

## Median replicate correlation

```{r}
label <- "LacZ-3"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))
```
## Inv of mean pairwise distance


```{r}
label <- "LacZ-3"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

## Trace of precision matrix

```{r}
label <- "LacZ-3"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

## Inverse of major axis length

```{r}
label <- "LacZ-3"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```
## Precision - single variable

```{r}
label <- "LacZ-3"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```
## Difference between means - single variable

```{r}
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

