PLS_Power_Analysis

Author

John Anderson

PLS Power analysis

In order to run a power analysis for PLS, we need to simulate a data frame. Ideally we do this from existing data & generate increasingly large sets of data until we get convergence/significance.

We will be using the faux package to simulate from existing data. We will use these datasets & fit models of interest & retain key variables eigenvalues, explained variance and p-values. I will plot these using a line plot & this will be the primary outcome of the analysis.

Load the Required Packages

First load the required packages - do so quietly (turn off echo, warnings, and messages). Be aware that if there’s an issue, you will have to disable those to debug this.

library(readr)
library(TExPosition)
library(data4PCCAR)
library(PTCA4CATA)
library(ggplot2)
library(tidyplots)
library(faux)
library(tidyverse)
library(fastDummies)
library(Matrix)

load the data

Dummy code group and sex (might have to remove this from the final version)

temp <- dummy_cols(temp,select_columns = "group",remove_selected_columns = TRUE)

Simulate the dataframes at different N’s & fit PLS

We will need to save the eigenvalues for the first and second LV’s, as well as the cross-block covariance. The rule will be >10% covariance, eigenvalue > 1 to retain the LV.

# Split the data into two matrices
DATA1 <- as.matrix(temp[, c("group_BC", "group_HC", "age")])
DATA2 <- as.matrix(temp[, c("r_vol_Sub", "r_vol_CA1", "r_vol_CA2", "r_vol_CA3", "r_vol_CA4" , "r_vol_DG", "r_vol_SRLM", "r_vol_Cyst")])

# Run the tepPLS model
tepPLS_results <- tepPLS(DATA1, DATA2, graphs = FALSE)

# permuation tests byColumns
nIter <- 1000 #set the number of iterations to run the permutation test 

perm.bycol <- perm4PLSC(DATA1, 
                        DATA2, 
                        permType = 'byColumns', 
                        nIter=nIter) # the default type is byMat which permute by labels of observations
                                                                        # 

scree      <- PlotScree(ev = tepPLS_results$TExPosition.Data$eigs,
                        p.ev = perm.bycol$pEigenvalues,
                        title = "Explained Variance per Dimension + Permutation Tests",
                        plotKaiser = FALSE)

tepPLS_results$TExPosition.Data$t

[1] 90.42723  9.57277

# Load the parallel package
library(parallel)

# Define the sample sizes to loop over
sample_sizes <- seq(10, 1000, by = 10)

# Function to run the PLS analysis for a given sample size and iteration
run_pls_analysis <- function(n, i) {
  # Simulate the data for the current sample size
  temp <- sim_df(residual_df, n = n, between = "group")
  
  # Dummy code the group variable
  temp <- dummy_cols(temp, select_columns = "group", remove_selected_columns = TRUE)
  
  # Split the data into two matrices
  DATA1 <- as.matrix(temp[, c("group_BC", "group_HC", "age")])
  DATA2 <- as.matrix(temp[, c("r_vol_Sub", "r_vol_CA1", "r_vol_CA2", "r_vol_CA3", "r_vol_CA4", "r_vol_DG", "r_vol_SRLM", "r_vol_Cyst")])
  
  # Run the tepPLS model
  tepPLS_results <- tepPLS(DATA1, DATA2, graphs = FALSE)
  
  # Return the covariance explained per LV
  return(tepPLS_results$TExPosition.Data$t)
}

# Initialize a list to store raw results
results_list <- list()

# Initialize a list to store summary results
summary_results_list <- list()

# Loop over sample sizes
for (n in sample_sizes) {
  # Use mclapply to run the PLS analysis in parallel for 100 iterations
  covariance_explained <- mclapply(1:100, function(i) run_pls_analysis(n, i), mc.cores = detectCores() - 1)
  
  # Combine the raw results for the current sample size into a data frame
  results_df <- do.call(rbind, lapply(1:100, function(i) {
    data.frame(
      SampleSize = n,
      Iteration = i,
      LV = 1:length(covariance_explained[[i]]),
      CovarianceExplained = covariance_explained[[i]]
    )
  }))
  
  # Store the raw results for the current sample size
  results_list[[as.character(n)]] <- results_df
  
  # Summarize the results for the current sample size
  covariance_matrix <- do.call(rbind, covariance_explained)
  mean_covariance <- colMeans(covariance_matrix)
  sd_covariance <- apply(covariance_matrix, 2, sd)
  
  summary_results_list[[as.character(n)]] <- data.frame(
    SampleSize = n,
    LV = 1:length(mean_covariance),
    MeanCovariance = mean_covariance,
    SDCovariance = sd_covariance
  )
}

# Combine all raw results into a single data frame
results_df <- do.call(rbind, results_list)

# Combine all summary results into a single data frame
summary_results_df <- do.call(rbind, summary_results_list)

# View the raw results data frame
print(head(results_df))

     SampleSize Iteration LV CovarianceExplained
10.1         10         1  1           78.705063
10.2         10         1  2           21.294937
10.3         10         2  1           78.848293
10.4         10         2  2           21.151707
10.5         10         3  1           95.702362
10.6         10         3  2            4.297638

# View the summary results data frame
print(head(summary_results_df))

     SampleSize LV MeanCovariance SDCovariance
10.1         10  1       81.73507     9.224542
10.2         10  2       18.26493     9.224542
20.1         20  1       80.25529    10.418521
20.2         20  2       19.74471    10.418521
30.1         30  1       80.93860     7.733662
30.2         30  2       19.06140     7.733662

# Filter the data for LV1 and LV2
lv_data <- summary_results_df %>%
  filter(LV %in% c(1, 2))

lv_data$LV <- as.factor(lv_data$LV)
summary_results_df$LV <- as.factor(summary_results_df$LV)
# Calculate the median covariance explained for LV1 and LV2
mean_covariance_lv1 <- mean(results_df %>% filter(LV == 1) %>% pull(CovarianceExplained))
mean_covariance_lv2 <- mean(results_df %>% filter(LV == 2) %>% pull(CovarianceExplained))



LV_1_Plot <-dplyr::filter(results_df, LV == 1) |>
  tidyplot(x = SampleSize, y = CovarianceExplained) |>
  add_mean_line() |>
  add_sem_ribbon()+
  geom_hline(yintercept = mean_covariance_lv1, linetype = 2, colour = "gray")+
  geom_vline(xintercept = 100, linetype = 2, colour = "gray") 
 
LV_1_Plot <- LV_1_Plot |> adjust_title("First Latent Variable") |>
  adjust_x_axis_title("Simulated sample \n per group") |>
  adjust_y_axis_title("Covariance Explained") 

LV_2_Plot <- dplyr::filter(results_df, LV == 2) |>
  tidyplot(x = SampleSize, y = CovarianceExplained) |>
  add_mean_line() |>
  add_sem_ribbon()+
  geom_hline(yintercept = mean_covariance_lv2, linetype = 2, colour = "gray")+
  geom_vline(xintercept = 100, linetype = 2, colour = "gray")

LV_2_Plot <- LV_2_Plot |> adjust_title("Second Latent Variable") |>
  adjust_x_axis_title("Simulated sample Size \n per group") |>
  adjust_y_axis_title("Covariance Explained")

LV_1_Plot + LV_2_Plot

Brain Plot