1 Setup

doParallel::registerDoParallel(cores = 8)

2 Functions

2.1 Utils

Get feature matrix

feats <- function(X, pattern = "Cells_AreaShape_Zernike", ...) {
  X %>% dplyr::select(matches(pattern))
}

2.2 Statistics

2.2.1 Difference between means

Compute difference between mean of specified group and everything else

mean_diff <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    A <- X %>% filter(.data[[label_col]] == label_i)
    B <- X %>% filter(.data[[label_col]] != label_i)
    
    sqrt(sum((colMeans(feats(
      A, ...
    )) - colMeans(feats(
      B, ...
    ))) ** 2))
  }

2.2.2 Median replicate correlation

Compute median replicate correlation of specified group (indicated by label_i)

mrc <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  corvec <- function(m) {
    cm <- cor(m)
    cm[upper.tri(cm)]
  }
  
  median(corvec(t(as.matrix(feats(
    A, ...
  )))))
}

2.2.3 Inv of mean pairwise distance

invmpd <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    
    A <- X %>% filter(.data[[label_col]] == label_i)
    
    1 / mean(dist(as.matrix(feats(A, ...))))
    
  }

2.2.4 Trace of precision matrix

Compute trace of precision matrix of specified group (indicated by label_i)

trace_precision <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  sum(1/(sv**2))
  
}

2.2.5 Inverse of major axis length

pc1 <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  1/sv[1]
  
}

2.2.6 Precision

beta <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  1 / sd(as.matrix(feats(A, ...)))

}

2.3 Testing

Permute label

permute <- function(X, label_col = "Metadata_pert_name") {
  X[[label_col]] <- sample(X[[label_col]])
  X
}

Compute null distribution i.e. the sampling distribution of the test statistic under the null hypothesis

generate_null <-
  function(X,
           label,
           test_statistic = mean_diff,
           n = 100,
           label_col = "Metadata_pert_name",
           ...) {
    
    test_statistic(permute(X, label_col), label, ...)
    
    times(n) %dopar% test_statistic(permute(X, label_col), label, ...)
    
  }

Run permutation test and plot

run_test <- function(test_statistic, ...) {

  null_distribution <- generate_null(test_statistic = test_statistic, ...)
  
  test_statistic_value <- test_statistic(...)
  
  p_value <- sum(null_distribution > test_statistic_value) / length(null_distribution)
  
  p <- 
    data.frame(x = null_distribution) %>%
    ggplot(aes(x)) + geom_histogram(bins = 30) + 
    geom_vline(xintercept = test_statistic_value, color = "red") + 
    labs(caption = glue("p-value = {round(p_value, 2)}"))
  
  p
}

3 Load

Load cell health data

url <-
  "https://github.com/broadinstitute/grit-benchmark/raw/main/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"

cellhealth <-
  read_csv(
    url,
    col_types = cols(
      .default = col_double(),
      Metadata_Plate = col_character(),
      Metadata_Well = col_character(),
      Metadata_WellRow = col_character(),
      Metadata_WellCol = col_character(),
      Metadata_cell_line = col_character(),
      Metadata_gene_name = col_character(),
      Metadata_pert_name = col_character()
    )
  )

cellhealth <-
  cellhealth %>%
  filter(Metadata_cell_line == "A549") %>%
  filter(!(Metadata_gene_name %in% c("EMPTY", "Chr2")))
cellhealth %>%
  count(Metadata_pert_name, name = "n_replicates") %>% 
  count(n_replicates)

4 Run

X <- cellhealth
n <- 300
label_col <- "Metadata_pert_name"
pattern <- "Cells_Intensity"

4.1 Difference between means

label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )

label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )

4.2 Median replicate correlation

label <- "LacZ-3"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))

label <- "CDK4-2"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))

4.3 Inv of mean pairwise distance

label <- "LacZ-3"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

label <- "CDK4-2"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

4.4 Trace of precision matrix

label <- "LacZ-3"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

label <- "CDK4-2"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

4.5 Inverse of major axis length

label <- "LacZ-3"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

label <- "CDK4-2"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

4.6 Precision - single variable

label <- "LacZ-3"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

label <- "CDK4-2"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

4.7 Difference between means - single variable

label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))

---
title: "Percent replicating through the lens of permutation testing"
output: html_notebook
---

# Setup

```{r}
set.seed(42)
library(tidyverse)
library(foreach)
library(glue)
```


```{r}
doParallel::registerDoParallel(cores = 8)
```

# Functions

## Utils

Get feature matrix

```{r}
feats <- function(X, pattern = "Cells_AreaShape_Zernike", ...) {
  X %>% dplyr::select(matches(pattern))
}
```

## Statistics

### Difference between means

Compute difference between mean of specified group and everything else

```{r}
mean_diff <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    A <- X %>% filter(.data[[label_col]] == label_i)
    B <- X %>% filter(.data[[label_col]] != label_i)
    
    sqrt(sum((colMeans(feats(
      A, ...
    )) - colMeans(feats(
      B, ...
    ))) ** 2))
  }
```

### Median replicate correlation

Compute median replicate correlation of specified group (indicated by `label_i`)

```{r}
mrc <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  corvec <- function(m) {
    cm <- cor(m)
    cm[upper.tri(cm)]
  }
  
  median(corvec(t(as.matrix(feats(
    A, ...
  )))))
}
```

### Inv of mean pairwise distance

```{r}
invmpd <-
  function(X, label_i, label_col = "Metadata_pert_name", ...) {
    
    A <- X %>% filter(.data[[label_col]] == label_i)
    
    1 / mean(dist(as.matrix(feats(A, ...))))
    
  }
```

### Trace of precision matrix

Compute trace of precision matrix of specified group (indicated by `label_i`)

```{r}
trace_precision <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  sum(1/(sv**2))
  
}
```

### Inverse of major axis length

```{r}
pc1 <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  sv <- svd(as.matrix(feats(A, ...)), nu = 0, nv = 0)$d

  1/sv[1]
  
}
```

### Precision

```{r}
beta <- function(X, label_i, label_col = "Metadata_pert_name", ...) {
  
  A <- X %>% filter(.data[[label_col]] == label_i)
  
  1 / sd(as.matrix(feats(A, ...)))

}
```

## Testing

Permute label

```{r}
permute <- function(X, label_col = "Metadata_pert_name") {
  X[[label_col]] <- sample(X[[label_col]])
  X
}
```

Compute null distribution i.e. the sampling distribution of the test statistic under the null hypothesis

```{r}
generate_null <-
  function(X,
           label,
           test_statistic = mean_diff,
           n = 100,
           label_col = "Metadata_pert_name",
           ...) {
    
    test_statistic(permute(X, label_col), label, ...)
    
    times(n) %dopar% test_statistic(permute(X, label_col), label, ...)
    
  }

```

Run permutation test and plot

```{r}
run_test <- function(test_statistic, ...) {

  null_distribution <- generate_null(test_statistic = test_statistic, ...)
  
  test_statistic_value <- test_statistic(...)
  
  p_value <- sum(null_distribution > test_statistic_value) / length(null_distribution)
  
  p <- 
    data.frame(x = null_distribution) %>%
    ggplot(aes(x)) + geom_histogram(bins = 30) + 
    geom_vline(xintercept = test_statistic_value, color = "red") + 
    labs(caption = glue("p-value = {round(p_value, 2)}"))
  
  p
}
```


# Load 

Load cell health data

```{r}
url <-
  "https://github.com/broadinstitute/grit-benchmark/raw/main/1.calculate-metrics/cell-health/data/cell_health_merged_feature_select.csv.gz"

cellhealth <-
  read_csv(
    url,
    col_types = cols(
      .default = col_double(),
      Metadata_Plate = col_character(),
      Metadata_Well = col_character(),
      Metadata_WellRow = col_character(),
      Metadata_WellCol = col_character(),
      Metadata_cell_line = col_character(),
      Metadata_gene_name = col_character(),
      Metadata_pert_name = col_character()
    )
  )

cellhealth <-
  cellhealth %>%
  filter(Metadata_cell_line == "A549") %>%
  filter(!(Metadata_gene_name %in% c("EMPTY", "Chr2")))
```


```{r}
cellhealth %>%
  count(Metadata_pert_name, name = "n_replicates") %>% 
  count(n_replicates)
```

# Run

```{r}
X <- cellhealth
n <- 300
label_col <- "Metadata_pert_name"
pattern <- "Cells_Intensity"
```

## Difference between means

```{r}
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )
```


```{r}
label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"

run_test(
  X = X,
  label = label,
  test_statistic = test_statistic,
  n = n,
  label_col = label_col
) +
  xlab(test_statistic_name) +
  ggtitle(
    glue(
      "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    )
  )
```

## Median replicate correlation

```{r}
label <- "LacZ-3"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- mrc
test_statistic_name <- "mrc"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 2)}"
    ))
```
## Inv of mean pairwise distance


```{r}
label <- "LacZ-3"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- invmpd
test_statistic_name <- "invmpd"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

## Trace of precision matrix

```{r}
label <- "LacZ-3"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- trace_precision
test_statistic_name <- "trace_precision"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

## Inverse of major axis length

```{r}
label <- "LacZ-3"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- pc1
test_statistic_name <- "pc1"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```
## Precision - single variable

```{r}
label <- "LacZ-3"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- beta
test_statistic_name <- "beta"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```
## Difference between means - single variable

```{r}
label <- "LacZ-3"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```


```{r}
label <- "CDK4-2"
test_statistic <- mean_diff
test_statistic_name <- "mean_diff"
pattern <- "Cells_AreaShape_Zernike_0_0"

run_test(X = X,
         label = label,
         test_statistic = test_statistic,
         n = n,
         label_col = label_col,
         pattern = pattern) +
  xlab(test_statistic_name) +
  ggtitle(glue(
    "{label_col} = {label}\n{test_statistic_name} = {round(test_statistic(X, label), 3)}"
    ))
```

