Setup
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(MASS))
future::plan(future::multisession, workers = 8)
Specify parameters
#set.seed(42)
d <- 100 # dimensions
n_null_reps <- 10 # number of times to repeat shuffling to create the null distribution
n_means <- 20 # number of centers (components)
n_repeats <- 9 # number of times to repeat the sampling
n_samples_per_mean_l <- seq(3, 20) # number of samples per component (iterate over each)
null_threshold_pc <- 0.95 # threshold above which a component is considered to be replicating
sds <- seq(2, 4, length.out = n_means) ** 3 # s.d. of each component
#sds <- 1 + abs(rnorm(n_means)) / 10
means <- matrix(rnorm(d * n_means), n_means, d) # generate centers
Functions
Elements of upper triangular matrix
corvec <- function(m) {cm <- cor(m); cm[upper.tri(cm)]}
Compute median replicate correlation (mrc)
f_mrc <- function(data_x, n_means, n_samples_per_mean) {
sapply(1:n_means,
function(i) {
start <- (i - 1) * n_samples_per_mean + 1
end <- i * n_samples_per_mean
data_i <- data_x[start:end,]
mean(corvec(t(data_i)))
})
}
Sample the data and compute signal and null distributions of mrc
f_signal_null <-
function(n_means,
n_samples_per_mean,
d,
n_null_reps) {
data <- lapply(1:n_means,
function(i)
mvrnorm(n = n_samples_per_mean,
means[i, ],
sds[i] * diag(rep(1, d))))
data <- do.call(rbind, data)
signal <- f_mrc(data,
n_means,
n_samples_per_mean)
null <-
as.vector(sapply(1:n_null_reps,
function(i)
f_mrc(data[sample(n_samples_per_mean * n_means),],
n_means,
n_samples_per_mean)))
df <- tibble(
label =
c(rep("signal", length(signal)),
rep("null", length(null))),
class =
c(sds,
rep(NA, length(null))),
sim = c(signal, null)
)
df
}
Generate data
Generate mrc of signal and null, n_repeats times.
df <- furrr::future_map_dfr(1:n_repeats,
function(repeat_id)
map_dfr(n_samples_per_mean_l,
function(n_samples_per_mean_i)
f_signal_null(
n_means = n_means,
n_samples_per_mean = n_samples_per_mean_i,
d = d,
n_null_reps = n_null_reps
) %>%
mutate(n_samples_per_mean =
n_samples_per_mean_i,
repeat_id = repeat_id)),
.options = furrr::furrr_options(seed = TRUE))
Compute percent replicating, for each n_samples_per_mean (i.e. replicate cardinality) and each repeat_id (i.e. each shuffling)
dfn <- df %>%
group_by(n_samples_per_mean, repeat_id) %>%
nest() %>%
mutate(percent_rep = map_dbl(data, function(signal_null_df) {
null_threshold <-
signal_null_df %>%
filter(label == "null") %>%
pull("sim") %>%
quantile(null_threshold_pc, names = FALSE)
replicating <-
signal_null_df %>%
filter(label == "signal") %>%
mutate(is_replicating = sim > null_threshold)
percent_rep_i <-
(replicating %>% filter(is_replicating) %>% nrow()) /
(replicating %>% nrow())
percent_rep_i
})) %>%
dplyr::select(-data) %>%
ungroup() %>%
group_by(n_samples_per_mean)
Plot
Dependence of MRC on replicate cardinality
Combine data across repeats
df %>%
ggplot(aes(sim, fill = label)) +
stat_density(position = "identity", alpha = 0.6) +
facet_wrap(~ n_samples_per_mean)

dfn %>%
ggplot(aes(n_samples_per_mean, percent_rep, group = n_samples_per_mean)) +
geom_boxplot(notch=FALSE, outlier.shape=NA, fill="red", alpha=0.2) +
stat_summary(fun=median, geom="line", aes(group=1)) +
ylab("Percent replicating") +
xlab("Number of replicates") +
cowplot::theme_cowplot()

#df %>% count(n_samples_per_mean, repeat_id, label) %>% count(n_samples_per_mean, label, n)
Visualize data using PCA
n_samples_per_mean <- n_samples_per_mean_l[length(n_samples_per_mean_l)]
n_samples_per_mean <- n_samples_per_mean * 10
data <- lapply(1:n_means,
function(i)
mvrnorm(n = n_samples_per_mean,
means[i,],
sds[i] * diag(rep(1, d))))
data <- do.call(rbind, data)
m <- prcomp(data)
data_pc <- as_tibble(m$x[,1:2])
data_pc$sdv <- rep(sds, each = n_samples_per_mean)
round_it <- function(x) round(as.numeric(x), 1)
data_pc %>%
ggplot(aes(PC1, PC2)) +
geom_hex() +
theme_void() +
theme(legend.position = "none") +
facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
coord_equal()

Plot spread in PC1 wrt s.d.
data_pc %>%
group_by(sdv) %>%
summarise(across(everything(), list(sd = sd))) %>%
ggplot(aes(sdv, PC1_sd)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ x)

MRC vs cardinality across different dispersion levels
df %>%
filter(label == "signal") %>%
group_by(n_samples_per_mean, class) %>%
summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
ggplot(aes(n_samples_per_mean, sim_mean, color = class, group = class)) +
geom_line(alpha = 0.5) #+

df %>%
filter(label == "signal") %>%
group_by(n_samples_per_mean, class) %>%
summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
ggplot(aes(n_samples_per_mean, sim_sd, color = class, group = class)) +
geom_line(alpha = 0.5)

custom.config = umap::umap.defaults
custom.config$random_state = 123
umap_embedding <-
umap::umap(data,
config = custom.config,
n_neighbors = 20)
umap_embedding <-
as.data.frame(umap_embedding$layout)
umap_embedding$sdv <- rep(sds, each = n_samples_per_mean)
round_it <- function(x) round(as.numeric(x), 1)
umap_embedding %>%
ggplot(aes(V1,V2)) +
geom_hex() +
theme_void() +
theme(legend.position = "none") +
facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
coord_equal()
umap_embedding %>%
ggplot(aes(V1, V2, color = as.factor(sdv))) +
geom_point() +
theme_void() +
theme(legend.position = "none") +
coord_equal()
---
title: "Percent replicating"
output: html_notebook
---

# Setup

```{r}
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(MASS))
```


```{r}
future::plan(future::multisession, workers = 8)
```

# Specify parameters

```{r}
#set.seed(42)
d <- 100 # dimensions
n_null_reps <- 10 # number of times to repeat shuffling to create the null distribution
n_means <- 20 # number of centers (components)
n_repeats <- 9 # number of times to repeat the sampling
n_samples_per_mean_l <- seq(3, 20) # number of samples per component (iterate over each)
null_threshold_pc <- 0.95 # threshold above which a component is considered to be replicating
sds <- seq(2, 4, length.out = n_means) ** 3  # s.d. of each component
#sds <- 1 + abs(rnorm(n_means)) / 10
```


```{r}
means <- matrix(rnorm(d * n_means), n_means, d) # generate centers
```

# Functions

Elements of upper triangular matrix

```{r}
corvec <- function(m) {cm <- cor(m); cm[upper.tri(cm)]}
```

Compute median replicate correlation (mrc)

```{r}
f_mrc <- function(data_x, n_means, n_samples_per_mean) {
  sapply(1:n_means,
         function(i) {
           start <- (i - 1) * n_samples_per_mean + 1
           end <- i * n_samples_per_mean
           data_i <- data_x[start:end,]
           mean(corvec(t(data_i)))
         })
}
```

Sample the data and compute signal and null distributions of mrc

```{r}
f_signal_null <-
  function(n_means,
           n_samples_per_mean,
           d,
           n_null_reps) {
    data <- lapply(1:n_means,
                   function(i)
                     mvrnorm(n = n_samples_per_mean,
                             means[i, ],
                             sds[i] * diag(rep(1, d))))
    
    data <- do.call(rbind, data)
    
    signal <- f_mrc(data,
                    n_means,
                    n_samples_per_mean)
    
    null <-
      as.vector(sapply(1:n_null_reps,
                       function(i)
                         f_mrc(data[sample(n_samples_per_mean * n_means),],
                               n_means,
                               n_samples_per_mean)))
    
    df <- tibble(
      label =
        c(rep("signal", length(signal)),
          rep("null", length(null))),
      class =
        c(sds,
          rep(NA, length(null))),
      sim = c(signal, null)
    )
    
    df
  }
```

# Generate data

Generate mrc of signal and null, `n_repeats` times.

```{r}
df <- furrr::future_map_dfr(1:n_repeats,
                            function(repeat_id)
                              map_dfr(n_samples_per_mean_l,
                                      function(n_samples_per_mean_i)
                                        f_signal_null(
                                          n_means = n_means,
                                          n_samples_per_mean = n_samples_per_mean_i,
                                          d = d,
                                          n_null_reps = n_null_reps
                                        ) %>%
                                        mutate(n_samples_per_mean =
                                                 n_samples_per_mean_i,
                                               repeat_id = repeat_id)),
                            .options = furrr::furrr_options(seed = TRUE))
```

Compute percent replicating, for each `n_samples_per_mean` (i.e. replicate cardinality) and each `repeat_id` (i.e. each shuffling)

```{r}
dfn <- df %>%
  group_by(n_samples_per_mean, repeat_id) %>%
  nest() %>%
  mutate(percent_rep = map_dbl(data, function(signal_null_df) {
    null_threshold <-
      signal_null_df %>%
      filter(label == "null") %>%
      pull("sim") %>%
      quantile(null_threshold_pc, names = FALSE)
    
    replicating <-
      signal_null_df %>%
      filter(label == "signal") %>%
      mutate(is_replicating = sim > null_threshold)
    
    percent_rep_i <-
      (replicating %>% filter(is_replicating) %>% nrow()) /
      (replicating %>% nrow())
    
    percent_rep_i
    
  })) %>%
  dplyr::select(-data) %>%
  ungroup() %>%
  group_by(n_samples_per_mean)
```


# Plot

## Dependence of MRC on replicate cardinality 

Combine data across repeats

```{r}
df %>%
 ggplot(aes(sim, fill = label)) + 
  stat_density(position = "identity", alpha = 0.6) +
  facet_wrap(~ n_samples_per_mean)
```

# 

```{r}
dfn %>%
  ggplot(aes(n_samples_per_mean, percent_rep, group = n_samples_per_mean)) + 
  geom_boxplot(notch=FALSE, outlier.shape=NA, fill="red", alpha=0.2) + 
  stat_summary(fun=median, geom="line", aes(group=1)) +
  ylab("Percent replicating") +
  xlab("Number of replicates") +
  cowplot::theme_cowplot()
```


```{r}
#df %>% count(n_samples_per_mean, repeat_id, label) %>% count(n_samples_per_mean, label, n)
```

## Visualize data using PCA

```{r}
n_samples_per_mean <- n_samples_per_mean_l[length(n_samples_per_mean_l)]

n_samples_per_mean <- n_samples_per_mean * 10

data <- lapply(1:n_means,
               function(i)
                 mvrnorm(n = n_samples_per_mean,
                         means[i,],
                         sds[i] * diag(rep(1, d))))

data <- do.call(rbind, data)

m <- prcomp(data)

data_pc <- as_tibble(m$x[,1:2])

data_pc$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)
```


```{r}
data_pc %>%
  ggplot(aes(PC1, PC2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()
```

## Plot spread in PC1 wrt s.d.

```{r}
data_pc %>% 
  group_by(sdv) %>%
  summarise(across(everything(), list(sd = sd))) %>%
  ggplot(aes(sdv, PC1_sd)) + 
  geom_point() + 
  geom_smooth(method = lm, formula = y ~ x)
```

## MRC vs cardinality across different dispersion levels

```{r}
df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_mean, color = class, group = class)) + 
  geom_line(alpha = 0.5) #+ 

df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_sd, color = class, group = class)) + 
  geom_line(alpha = 0.5)
```


```{r eval=FALSE}
custom.config = umap::umap.defaults

custom.config$random_state = 123

umap_embedding <-
  umap::umap(data,
             config = custom.config,
             n_neighbors = 20)

umap_embedding <-
  as.data.frame(umap_embedding$layout)

umap_embedding$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)

umap_embedding %>%
  ggplot(aes(V1,V2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()

umap_embedding %>%
  ggplot(aes(V1, V2, color = as.factor(sdv))) + 
  geom_point() +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
```

