Setup

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(MASS))
future::plan(future::multisession, workers = 8)

Specify parameters

#set.seed(42)
d <- 100 # dimensions
n_null_reps <- 10 # number of times to repeat shuffling to create the null distribution
n_means <- 20 # number of centers (components)
n_repeats <- 9 # number of times to repeat the sampling
n_samples_per_mean_l <- seq(3, 20) # number of samples per component (iterate over each)
null_threshold_pc <- 0.95 # threshold above which a component is considered to be replicating
sds <- seq(2, 4, length.out = n_means) ** 3  # s.d. of each component
#sds <- 1 + abs(rnorm(n_means)) / 10
means <- matrix(rnorm(d * n_means), n_means, d) # generate centers

Functions

Elements of upper triangular matrix

corvec <- function(m) {cm <- cor(m); cm[upper.tri(cm)]}

Compute median replicate correlation (mrc)

f_mrc <- function(data_x, n_means, n_samples_per_mean) {
  sapply(1:n_means,
         function(i) {
           start <- (i - 1) * n_samples_per_mean + 1
           end <- i * n_samples_per_mean
           data_i <- data_x[start:end,]
           mean(corvec(t(data_i)))
         })
}

Sample the data and compute signal and null distributions of mrc

f_signal_null <-
  function(n_means,
           n_samples_per_mean,
           d,
           n_null_reps) {
    data <- lapply(1:n_means,
                   function(i)
                     mvrnorm(n = n_samples_per_mean,
                             means[i, ],
                             sds[i] * diag(rep(1, d))))
    
    data <- do.call(rbind, data)
    
    signal <- f_mrc(data,
                    n_means,
                    n_samples_per_mean)
    
    null <-
      as.vector(sapply(1:n_null_reps,
                       function(i)
                         f_mrc(data[sample(n_samples_per_mean * n_means),],
                               n_means,
                               n_samples_per_mean)))
    
    df <- tibble(
      label =
        c(rep("signal", length(signal)),
          rep("null", length(null))),
      class =
        c(sds,
          rep(NA, length(null))),
      sim = c(signal, null)
    )
    
    df
  }

Generate data

Generate mrc of signal and null, n_repeats times.

df <- furrr::future_map_dfr(1:n_repeats,
                            function(repeat_id)
                              map_dfr(n_samples_per_mean_l,
                                      function(n_samples_per_mean_i)
                                        f_signal_null(
                                          n_means = n_means,
                                          n_samples_per_mean = n_samples_per_mean_i,
                                          d = d,
                                          n_null_reps = n_null_reps
                                        ) %>%
                                        mutate(n_samples_per_mean =
                                                 n_samples_per_mean_i,
                                               repeat_id = repeat_id)),
                            .options = furrr::furrr_options(seed = TRUE))

Compute percent replicating, for each n_samples_per_mean (i.e. replicate cardinality) and each repeat_id (i.e. each shuffling)

dfn <- df %>%
  group_by(n_samples_per_mean, repeat_id) %>%
  nest() %>%
  mutate(percent_rep = map_dbl(data, function(signal_null_df) {
    null_threshold <-
      signal_null_df %>%
      filter(label == "null") %>%
      pull("sim") %>%
      quantile(null_threshold_pc, names = FALSE)
    
    replicating <-
      signal_null_df %>%
      filter(label == "signal") %>%
      mutate(is_replicating = sim > null_threshold)
    
    percent_rep_i <-
      (replicating %>% filter(is_replicating) %>% nrow()) /
      (replicating %>% nrow())
    
    percent_rep_i
    
  })) %>%
  dplyr::select(-data) %>%
  ungroup() %>%
  group_by(n_samples_per_mean)

Plot

Dependence of MRC on replicate cardinality

Combine data across repeats

df %>%
 ggplot(aes(sim, fill = label)) + 
  stat_density(position = "identity", alpha = 0.6) +
  facet_wrap(~ n_samples_per_mean)

dfn %>%
  ggplot(aes(n_samples_per_mean, percent_rep, group = n_samples_per_mean)) + 
  geom_boxplot(notch=FALSE, outlier.shape=NA, fill="red", alpha=0.2) + 
  stat_summary(fun=median, geom="line", aes(group=1)) +
  ylab("Percent replicating") +
  xlab("Number of replicates") +
  cowplot::theme_cowplot()

#df %>% count(n_samples_per_mean, repeat_id, label) %>% count(n_samples_per_mean, label, n)

Visualize data using PCA

n_samples_per_mean <- n_samples_per_mean_l[length(n_samples_per_mean_l)]

n_samples_per_mean <- n_samples_per_mean * 10

data <- lapply(1:n_means,
               function(i)
                 mvrnorm(n = n_samples_per_mean,
                         means[i,],
                         sds[i] * diag(rep(1, d))))

data <- do.call(rbind, data)

m <- prcomp(data)

data_pc <- as_tibble(m$x[,1:2])

data_pc$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)
data_pc %>%
  ggplot(aes(PC1, PC2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()

Plot spread in PC1 wrt s.d.

data_pc %>% 
  group_by(sdv) %>%
  summarise(across(everything(), list(sd = sd))) %>%
  ggplot(aes(sdv, PC1_sd)) + 
  geom_point() + 
  geom_smooth(method = lm, formula = y ~ x)

MRC vs cardinality across different dispersion levels

df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_mean, color = class, group = class)) + 
  geom_line(alpha = 0.5) #+ 


df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_sd, color = class, group = class)) + 
  geom_line(alpha = 0.5)

custom.config = umap::umap.defaults

custom.config$random_state = 123

umap_embedding <-
  umap::umap(data,
             config = custom.config,
             n_neighbors = 20)

umap_embedding <-
  as.data.frame(umap_embedding$layout)

umap_embedding$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)

umap_embedding %>%
  ggplot(aes(V1,V2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()

umap_embedding %>%
  ggplot(aes(V1, V2, color = as.factor(sdv))) + 
  geom_point() +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
---
title: "Percent replicating"
output: html_notebook
---

# Setup

```{r}
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(MASS))
```


```{r}
future::plan(future::multisession, workers = 8)
```

# Specify parameters

```{r}
#set.seed(42)
d <- 100 # dimensions
n_null_reps <- 10 # number of times to repeat shuffling to create the null distribution
n_means <- 20 # number of centers (components)
n_repeats <- 9 # number of times to repeat the sampling
n_samples_per_mean_l <- seq(3, 20) # number of samples per component (iterate over each)
null_threshold_pc <- 0.95 # threshold above which a component is considered to be replicating
sds <- seq(2, 4, length.out = n_means) ** 3  # s.d. of each component
#sds <- 1 + abs(rnorm(n_means)) / 10
```


```{r}
means <- matrix(rnorm(d * n_means), n_means, d) # generate centers
```

# Functions

Elements of upper triangular matrix

```{r}
corvec <- function(m) {cm <- cor(m); cm[upper.tri(cm)]}
```

Compute median replicate correlation (mrc)

```{r}
f_mrc <- function(data_x, n_means, n_samples_per_mean) {
  sapply(1:n_means,
         function(i) {
           start <- (i - 1) * n_samples_per_mean + 1
           end <- i * n_samples_per_mean
           data_i <- data_x[start:end,]
           mean(corvec(t(data_i)))
         })
}
```

Sample the data and compute signal and null distributions of mrc

```{r}
f_signal_null <-
  function(n_means,
           n_samples_per_mean,
           d,
           n_null_reps) {
    data <- lapply(1:n_means,
                   function(i)
                     mvrnorm(n = n_samples_per_mean,
                             means[i, ],
                             sds[i] * diag(rep(1, d))))
    
    data <- do.call(rbind, data)
    
    signal <- f_mrc(data,
                    n_means,
                    n_samples_per_mean)
    
    null <-
      as.vector(sapply(1:n_null_reps,
                       function(i)
                         f_mrc(data[sample(n_samples_per_mean * n_means),],
                               n_means,
                               n_samples_per_mean)))
    
    df <- tibble(
      label =
        c(rep("signal", length(signal)),
          rep("null", length(null))),
      class =
        c(sds,
          rep(NA, length(null))),
      sim = c(signal, null)
    )
    
    df
  }
```

# Generate data

Generate mrc of signal and null, `n_repeats` times.

```{r}
df <- furrr::future_map_dfr(1:n_repeats,
                            function(repeat_id)
                              map_dfr(n_samples_per_mean_l,
                                      function(n_samples_per_mean_i)
                                        f_signal_null(
                                          n_means = n_means,
                                          n_samples_per_mean = n_samples_per_mean_i,
                                          d = d,
                                          n_null_reps = n_null_reps
                                        ) %>%
                                        mutate(n_samples_per_mean =
                                                 n_samples_per_mean_i,
                                               repeat_id = repeat_id)),
                            .options = furrr::furrr_options(seed = TRUE))
```

Compute percent replicating, for each `n_samples_per_mean` (i.e. replicate cardinality) and each `repeat_id` (i.e. each shuffling)

```{r}
dfn <- df %>%
  group_by(n_samples_per_mean, repeat_id) %>%
  nest() %>%
  mutate(percent_rep = map_dbl(data, function(signal_null_df) {
    null_threshold <-
      signal_null_df %>%
      filter(label == "null") %>%
      pull("sim") %>%
      quantile(null_threshold_pc, names = FALSE)
    
    replicating <-
      signal_null_df %>%
      filter(label == "signal") %>%
      mutate(is_replicating = sim > null_threshold)
    
    percent_rep_i <-
      (replicating %>% filter(is_replicating) %>% nrow()) /
      (replicating %>% nrow())
    
    percent_rep_i
    
  })) %>%
  dplyr::select(-data) %>%
  ungroup() %>%
  group_by(n_samples_per_mean)
```


# Plot

## Dependence of MRC on replicate cardinality 

Combine data across repeats

```{r}
df %>%
 ggplot(aes(sim, fill = label)) + 
  stat_density(position = "identity", alpha = 0.6) +
  facet_wrap(~ n_samples_per_mean)
```

# 

```{r}
dfn %>%
  ggplot(aes(n_samples_per_mean, percent_rep, group = n_samples_per_mean)) + 
  geom_boxplot(notch=FALSE, outlier.shape=NA, fill="red", alpha=0.2) + 
  stat_summary(fun=median, geom="line", aes(group=1)) +
  ylab("Percent replicating") +
  xlab("Number of replicates") +
  cowplot::theme_cowplot()
```


```{r}
#df %>% count(n_samples_per_mean, repeat_id, label) %>% count(n_samples_per_mean, label, n)
```

## Visualize data using PCA

```{r}
n_samples_per_mean <- n_samples_per_mean_l[length(n_samples_per_mean_l)]

n_samples_per_mean <- n_samples_per_mean * 10

data <- lapply(1:n_means,
               function(i)
                 mvrnorm(n = n_samples_per_mean,
                         means[i,],
                         sds[i] * diag(rep(1, d))))

data <- do.call(rbind, data)

m <- prcomp(data)

data_pc <- as_tibble(m$x[,1:2])

data_pc$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)
```


```{r}
data_pc %>%
  ggplot(aes(PC1, PC2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()
```

## Plot spread in PC1 wrt s.d.

```{r}
data_pc %>% 
  group_by(sdv) %>%
  summarise(across(everything(), list(sd = sd))) %>%
  ggplot(aes(sdv, PC1_sd)) + 
  geom_point() + 
  geom_smooth(method = lm, formula = y ~ x)
```

## MRC vs cardinality across different dispersion levels

```{r}
df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_mean, color = class, group = class)) + 
  geom_line(alpha = 0.5) #+ 

df %>% 
  filter(label == "signal") %>% 
  group_by(n_samples_per_mean, class) %>% 
  summarize(across(c("sim"), list(mean = mean, sd = sd)), .groups = "keep") %>%
  ggplot(aes(n_samples_per_mean, sim_sd, color = class, group = class)) + 
  geom_line(alpha = 0.5)
```


```{r eval=FALSE}
custom.config = umap::umap.defaults

custom.config$random_state = 123

umap_embedding <-
  umap::umap(data,
             config = custom.config,
             n_neighbors = 20)

umap_embedding <-
  as.data.frame(umap_embedding$layout)

umap_embedding$sdv <- rep(sds, each = n_samples_per_mean)

round_it <- function(x) round(as.numeric(x), 1)

umap_embedding %>%
  ggplot(aes(V1,V2)) + 
  geom_hex() +
  theme_void() +
  theme(legend.position = "none") +
  facet_wrap(~sdv, labeller = labeller(sdv = round_it)) +
  coord_equal()

umap_embedding %>%
  ggplot(aes(V1, V2, color = as.factor(sdv))) + 
  geom_point() +
  theme_void() +
  theme(legend.position = "none") +
  coord_equal()
```

