Setup

library(pacman); p_load(MASS, coda, boot, ggplot2, reshape2, dplyr)

my_theme <- function(){
  theme(
    legend.background = element_rect(colour = "black", linewidth = 0.5),
    legend.key.width = unit(0.5, "cm"),
    legend.key.height = unit(0.4, "cm"),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(colour ="gray80",linewidth = 1,linetype="dashed"),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_text(colour ="black",size=rel(1.5),hjust=0.5),
    axis.text.y = element_text(colour ="black",size=rel(1.5),vjust=0.5),
    axis.title.x = element_text(colour ="black",size=rel(1.2)),
    axis.title.y = element_text(colour ="black",size=rel(1.2)),
    plot.margin=unit(c(1,0.5,0.5,0.5), "cm"),
    plot.title=element_text(hjust=0.5,vjust=5,size=rel(1.5),face="bold"),
    plot.caption=element_text(hjust=0,size=rel(1.2)),
    axis.line.x=element_line(color="gray80",linewidth = 1,linetype="dashed"),
    axis.line.y=element_line(color="gray80",linewidth = 1,linetype="dashed"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank())}

# Function to generate correlated binary variables
generate_correlated_data <- function(n, num_vars, fixed_corr) {
  Sigma <- matrix(fixed_corr, nrow = num_vars, ncol = num_vars)
  diag(Sigma) <- 1  # Diagonal to 1
  
  # Adjust Sigma to be positive definite if necessary
  corr_adjustment_factor <- 1
  while(TRUE) {
    eigen_values <- eigen(Sigma)$values
    if (all(eigen_values > 0)) {
      break
    } else {
      corr_adjustment_factor <- corr_adjustment_factor * 0.95
      off_diag_values <- fixed_corr * corr_adjustment_factor
      Sigma[lower.tri(Sigma)] <- off_diag_values
      Sigma[upper.tri(Sigma)] <- off_diag_values}}

  continuous_data <- mvrnorm(n, mu = rep(0, num_vars), Sigma = Sigma)
  binary_data <- ifelse(continuous_data > 0, 1, 0)
  return(binary_data)
}

# Function to compute correlation between two distinct composites
composite_correlation <- function(data) {
    num_vars <- ncol(data)
    
    # Split variables into two groups for composites
    half_vars <- num_vars %/% 2
    composite1_vars <- sample(num_vars, half_vars)
    composite2_vars <- setdiff(1:num_vars, composite1_vars)

    composite1 <- rowSums(data[, composite1_vars, drop = FALSE])
    composite2 <- rowSums(data[, composite2_vars, drop = FALSE])

    # Compute and return the correlation
    return(cor(composite1, composite2))}

Rationale

Wilks’ (1938) first theorem holds that the correlation of a set of linear composites approaches one when the component variables are positively-correlated and the weights in the composites are positive too.

Analyses

Baseline: Does the Theorem Work?

set.seed(123)

# Test for different numbers of variables
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)  # Adjust as needed
correlations <- numeric(length(variable_counts))

for (i in 1:length(variable_counts)) {
  data <- generate_correlated_data(num_samples, variable_counts[i], 0.5)  # Assume a correlation of 0.5
  correlations[i] <- composite_correlation(data)}

# Print the results
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results

What if Correlations Are All Low?

set.seed(123)

num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))

for (i in 1:length(variable_counts)) {
  data <- generate_correlated_data(num_samples, variable_counts[i], 0.1)  # Assume a correlation of 0.1
  correlations[i] <- composite_correlation(data)}

results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results

What if Correlations Are All High?

set.seed(123)

num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))

for (i in 1:length(variable_counts)) {
  data <- generate_correlated_data(num_samples, variable_counts[i], 0.9)  # Assume a correlation of 0.9
  correlations[i] <- composite_correlation(data)}

results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results

What if There’s No Correlation?

set.seed(123)

num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))

for (i in 1:length(variable_counts)) {
  data <- generate_correlated_data(num_samples, variable_counts[i], 0)  # Assume no correlation
  correlations[i] <- composite_correlation(data)}

results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results

What if Correlations Are All Negative?

As implied by Wilks, the theorem does not work in the reverse direction.

set.seed(123)

num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))

for (i in 1:length(variable_counts)) {
  data <- generate_correlated_data(num_samples, variable_counts[i], -0.5)  # Assume a correlation of -0.5
  correlations[i] <- composite_correlation(data)}

results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results

Small, Modest, and High Inter-Item Correlations

set.seed(123)

# Parameters
num_samples <- 10000
variable_counts <- seq(2, 500, by = 5)
correlation_levels <- c(0.10, 0.30, 0.50, 0.70, 0.90)

# Initialize a dataframe to store results
results <- data.frame(Number_of_Variables = integer(),
                      Correlation = numeric(),
                      Correlation_Level = factor())

# Run simulation for each correlation level
for (corr_level in correlation_levels) {
  correlations <- numeric(length(variable_counts))
  
  for (i in 1:length(variable_counts)) {
    data <- generate_correlated_data(num_samples, variable_counts[i], corr_level)
    correlations[i] <- composite_correlation(data)}

  # Combine results for the current correlation level
  results <- rbind(results, data.frame(
    Number_of_Variables = variable_counts,
    Correlation = correlations,
    Correlation_Level = as.factor(corr_level)))}

ggplot(results, aes(x = Number_of_Variables, y = Correlation, color = Correlation_Level)) +
  geom_line(linewidth = 1) +
  labs(title = "Composite Score Correlations by Number of Items and Item Intercorrelation Strength",
       x = "Number of Items in Composite",
       y = "Correlation Between Composites",
       color = "Item Intercorrelation") +
  my_theme() + 
  scale_color_manual(values = c("orangered", "#1874cd", "#67238a", "#B33F62", "#F3C677")) + 
  theme(
    legend.position = c(.915, .115))

Discussion

Wilks’ first theorem from his 1938 paper holds up. This is a very interesting results, as it suggests that positive manifold implies different cognitive ability and achievement tests will tend to be highly intercorrelated given sufficient length, reliability, breadth, and the absence of bias. This is also the explanation for why Fried, Greene & Eaton (2021) found that the “p factor is the sum of its parts”, and it is why latent representations that do not suffer from this sort of tautological explanation are worthwhile to explore, as noted by Johnson et al. (2004).

References

Wilks, S. S. (1938). Weighting systems for linear functions of correlated variables when there is no dependent variable. Psychometrika, 3(1), 23–40. https://doi.org/10.1007/BF02287917

Fried, E. I., Greene, A. L., & Eaton, N. R. (2021). The p factor is the sum of its parts, for now. World Psychiatry, 20(1), 69–70. https://doi.org/10.1002/wps.20814

Johnson, W., Bouchard, T. J., Krueger, R. F., McGue, M., & Gottesman, I. I. (2004). Just one g: Consistent results from three test batteries. Intelligence, 32(1), 95–107. https://doi.org/10.1016/S0160-2896(03)00062-X

Postscript: Bootstrapping

Bootstrapping

# Function to generate positively correlated variables
#generate_correlated_data <- function(n, num_vars) {
#  # Create a positive definite correlation matrix
#  Sigma <- generate_correlation_matrix(num_vars)
#  
#  # Generate data
#  data <- mvrnorm(n, mu = rep(0, num_vars), Sigma = Sigma)
#  return(data)
#}
#
# Function to compute correlation between two distinct composites
#composite_correlation <- function(data, indices) {
#  # If indices is NULL, use all data. This is for the initial computation.
#  if (is.null(indices)) {
#    indices <- 1:nrow(data)
#  }
#  data <- data[indices, ]
#  
#  num_vars <- ncol(data)
#  half_vars <- num_vars %/% 2
#  composite1_vars <- sample(num_vars, half_vars)
#  composite2_vars <- setdiff(1:num_vars, composite1_vars)
#
#  composite1 <- rowSums(data[, composite1_vars, drop = FALSE])
#  composite2 <- rowSums(data[, composite2_vars, drop = FALSE])
#
#  # Compute and return correlation
#  return(cor(composite1, composite2))}
#
#set.seed(123)
#
# Bootstrap procedure
#bootstrap_correlation <- function(data, num_bootstrap) {
#  boot(data, statistic = composite_correlation, R = num_bootstrap)}
#
# Parameters
#num_bootstrap <- 10000  # Number of bootstrap samples
#num_samples <- 10000    # Number of samples for each variable set
#variable_counts <- seq(2, 500, by = 5) # Variables to test
#bootstrap_results <- list()
#percentiles <- matrix(NA, length(variable_counts), 3, 
#                      dimnames = list(variable_counts, c("2.5%", "50%", "97.5%")))
#
# Perform bootstrap for each variable count and store percentiles
#for (i in 1:length(variable_counts)) {
#  num_vars <- variable_counts[i]
#  data <- generate_correlated_data(num_samples, num_vars)
#  boot_res <- bootstrap_correlation(data, num_bootstrap)
#  bootstrap_results[[as.character(num_vars)]] <- boot_res
#  
#  # Extract and store percentiles
#  boot_percentiles <- apply(boot_res$t, 2, function(x) quantile(x, probs = c(0.025, 0.5, 0.975)))
#  percentiles[i, "2.5%"] <- boot_percentiles[1]
#  percentiles[i, "50%"] <- boot_percentiles[2]
#  percentiles[i, "97.5%"] <- boot_percentiles[3]}
#
# Convert the percentiles matrix to a dataframe
#percentiles_df <- as.data.frame(percentiles)
#percentiles_df$Number_of_Variables <- as.numeric(row.names(percentiles_df))
#
# Melt the dataframe into a long format
#long_df <- melt(percentiles_df, id.vars = 'Number_of_Variables', variable.name = 'Percentile', value.name = #'Value')
#
# Spread the data into a wide format for median and CIs
#wide_df <- long_df %>%
#  pivot_wider(id_cols = Number_of_Variables, names_from = Percentile, values_from = Value) %>%
#  rename(Median = `50%`, Val25 = `2.5%`, Val975 = `97.5%`)
#
# Plotting
#ggplot(wide_df, aes(x = Number_of_Variables, y = Median)) +
#  geom_line(color = 'blue') +
#  geom_ribbon(aes(ymin = Val25, ymax = Val975), alpha = 0.2, fill = "blue") +
#  labs(title = "Bootstrapped Composite Correlations With Empirical Confidence Intervals",
#       x = "Number of Variables",
#       y = "Composite Correlation") +
#  my_theme() + 
#  scale_y_continuous(limits = c(0, 1))

Bootstrapped Composite Correlations and Confidence Intervals

sessionInfo()

## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.1.2    reshape2_1.4.4 ggplot2_3.4.4  boot_1.3-28.1  coda_0.19-4   
## [6] MASS_7.3-60    pacman_0.5.1  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4      jsonlite_1.8.7    highr_0.10        compiler_4.3.1   
##  [5] tidyselect_1.2.0  Rcpp_1.0.11       stringr_1.5.1     jquerylib_0.1.4  
##  [9] scales_1.2.1      yaml_2.3.7        fastmap_1.1.1     lattice_0.22-5   
## [13] R6_2.5.1          plyr_1.8.8        labeling_0.4.3    generics_0.1.3   
## [17] knitr_1.45        tibble_3.2.1      munsell_0.5.0     bslib_0.6.0      
## [21] pillar_1.9.0      rlang_1.1.1       utf8_1.2.3        stringi_1.7.12   
## [25] cachem_1.0.8      xfun_0.39         sass_0.4.7        cli_3.6.1        
## [29] withr_2.5.2       magrittr_2.0.3    digest_0.6.33     grid_4.3.1       
## [33] rstudioapi_0.15.0 lifecycle_1.0.4   vctrs_0.6.3       evaluate_0.23    
## [37] glue_1.6.2        farver_2.1.1      fansi_1.0.4       colorspace_2.1-0 
## [41] rmarkdown_2.25    tools_4.3.1       pkgconfig_2.0.3   htmltools_0.5.7

Wilks’ Other Theorem