library(pacman); p_load(MASS, coda, boot, ggplot2, reshape2, dplyr)
my_theme <- function(){
theme(
legend.background = element_rect(colour = "black", linewidth = 0.5),
legend.key.width = unit(0.5, "cm"),
legend.key.height = unit(0.4, "cm"),
panel.background = element_rect(fill = "white"),
plot.background = element_rect(fill = "white"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(colour ="gray80",linewidth = 1,linetype="dashed"),
panel.grid.minor.y = element_blank(),
axis.text.x = element_text(colour ="black",size=rel(1.5),hjust=0.5),
axis.text.y = element_text(colour ="black",size=rel(1.5),vjust=0.5),
axis.title.x = element_text(colour ="black",size=rel(1.2)),
axis.title.y = element_text(colour ="black",size=rel(1.2)),
plot.margin=unit(c(1,0.5,0.5,0.5), "cm"),
plot.title=element_text(hjust=0.5,vjust=5,size=rel(1.5),face="bold"),
plot.caption=element_text(hjust=0,size=rel(1.2)),
axis.line.x=element_line(color="gray80",linewidth = 1,linetype="dashed"),
axis.line.y=element_line(color="gray80",linewidth = 1,linetype="dashed"),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank())}
# Function to generate correlated binary variables
generate_correlated_data <- function(n, num_vars, fixed_corr) {
Sigma <- matrix(fixed_corr, nrow = num_vars, ncol = num_vars)
diag(Sigma) <- 1 # Diagonal to 1
# Adjust Sigma to be positive definite if necessary
corr_adjustment_factor <- 1
while(TRUE) {
eigen_values <- eigen(Sigma)$values
if (all(eigen_values > 0)) {
break
} else {
corr_adjustment_factor <- corr_adjustment_factor * 0.95
off_diag_values <- fixed_corr * corr_adjustment_factor
Sigma[lower.tri(Sigma)] <- off_diag_values
Sigma[upper.tri(Sigma)] <- off_diag_values}}
continuous_data <- mvrnorm(n, mu = rep(0, num_vars), Sigma = Sigma)
binary_data <- ifelse(continuous_data > 0, 1, 0)
return(binary_data)
}
# Function to compute correlation between two distinct composites
composite_correlation <- function(data) {
num_vars <- ncol(data)
# Split variables into two groups for composites
half_vars <- num_vars %/% 2
composite1_vars <- sample(num_vars, half_vars)
composite2_vars <- setdiff(1:num_vars, composite1_vars)
composite1 <- rowSums(data[, composite1_vars, drop = FALSE])
composite2 <- rowSums(data[, composite2_vars, drop = FALSE])
# Compute and return the correlation
return(cor(composite1, composite2))}
Wilks’ (1938) first theorem holds that the correlation of a set of linear composites approaches one when the component variables are positively-correlated and the weights in the composites are positive too.
set.seed(123)
# Test for different numbers of variables
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1) # Adjust as needed
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], 0.5) # Assume a correlation of 0.5
correlations[i] <- composite_correlation(data)}
# Print the results
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results
set.seed(123)
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], 0.1) # Assume a correlation of 0.1
correlations[i] <- composite_correlation(data)}
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results
set.seed(123)
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], 0.9) # Assume a correlation of 0.9
correlations[i] <- composite_correlation(data)}
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results
set.seed(123)
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], 0) # Assume no correlation
correlations[i] <- composite_correlation(data)}
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results
As implied by Wilks, the theorem does not work in the reverse direction.
set.seed(123)
num_samples <- 10000
variable_counts <- seq(2, 100, by = 1)
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], -0.5) # Assume a correlation of -0.5
correlations[i] <- composite_correlation(data)}
results <- data.frame(Number_of_Variables = variable_counts, Correlation = correlations)
results
set.seed(123)
# Parameters
num_samples <- 10000
variable_counts <- seq(2, 500, by = 5)
correlation_levels <- c(0.10, 0.30, 0.50, 0.70, 0.90)
# Initialize a dataframe to store results
results <- data.frame(Number_of_Variables = integer(),
Correlation = numeric(),
Correlation_Level = factor())
# Run simulation for each correlation level
for (corr_level in correlation_levels) {
correlations <- numeric(length(variable_counts))
for (i in 1:length(variable_counts)) {
data <- generate_correlated_data(num_samples, variable_counts[i], corr_level)
correlations[i] <- composite_correlation(data)}
# Combine results for the current correlation level
results <- rbind(results, data.frame(
Number_of_Variables = variable_counts,
Correlation = correlations,
Correlation_Level = as.factor(corr_level)))}
ggplot(results, aes(x = Number_of_Variables, y = Correlation, color = Correlation_Level)) +
geom_line(linewidth = 1) +
labs(title = "Composite Score Correlations by Number of Items and Item Intercorrelation Strength",
x = "Number of Items in Composite",
y = "Correlation Between Composites",
color = "Item Intercorrelation") +
my_theme() +
scale_color_manual(values = c("orangered", "#1874cd", "#67238a", "#B33F62", "#F3C677")) +
theme(
legend.position = c(.915, .115))
Wilks’ first theorem from his 1938 paper holds up. This is a very interesting results, as it suggests that positive manifold implies different cognitive ability and achievement tests will tend to be highly intercorrelated given sufficient length, reliability, breadth, and the absence of bias. This is also the explanation for why Fried, Greene & Eaton (2021) found that the “p factor is the sum of its parts”, and it is why latent representations that do not suffer from this sort of tautological explanation are worthwhile to explore, as noted by Johnson et al. (2004).
Wilks, S. S. (1938). Weighting systems for linear functions of correlated variables when there is no dependent variable. Psychometrika, 3(1), 23–40. https://doi.org/10.1007/BF02287917
Fried, E. I., Greene, A. L., & Eaton, N. R. (2021). The p factor is the sum of its parts, for now. World Psychiatry, 20(1), 69–70. https://doi.org/10.1002/wps.20814
Johnson, W., Bouchard, T. J., Krueger, R. F., McGue, M., & Gottesman, I. I. (2004). Just one g: Consistent results from three test batteries. Intelligence, 32(1), 95–107. https://doi.org/10.1016/S0160-2896(03)00062-X
# Function to generate positively correlated variables
#generate_correlated_data <- function(n, num_vars) {
# # Create a positive definite correlation matrix
# Sigma <- generate_correlation_matrix(num_vars)
#
# # Generate data
# data <- mvrnorm(n, mu = rep(0, num_vars), Sigma = Sigma)
# return(data)
#}
#
# Function to compute correlation between two distinct composites
#composite_correlation <- function(data, indices) {
# # If indices is NULL, use all data. This is for the initial computation.
# if (is.null(indices)) {
# indices <- 1:nrow(data)
# }
# data <- data[indices, ]
#
# num_vars <- ncol(data)
# half_vars <- num_vars %/% 2
# composite1_vars <- sample(num_vars, half_vars)
# composite2_vars <- setdiff(1:num_vars, composite1_vars)
#
# composite1 <- rowSums(data[, composite1_vars, drop = FALSE])
# composite2 <- rowSums(data[, composite2_vars, drop = FALSE])
#
# # Compute and return correlation
# return(cor(composite1, composite2))}
#
#set.seed(123)
#
# Bootstrap procedure
#bootstrap_correlation <- function(data, num_bootstrap) {
# boot(data, statistic = composite_correlation, R = num_bootstrap)}
#
# Parameters
#num_bootstrap <- 10000 # Number of bootstrap samples
#num_samples <- 10000 # Number of samples for each variable set
#variable_counts <- seq(2, 500, by = 5) # Variables to test
#bootstrap_results <- list()
#percentiles <- matrix(NA, length(variable_counts), 3,
# dimnames = list(variable_counts, c("2.5%", "50%", "97.5%")))
#
# Perform bootstrap for each variable count and store percentiles
#for (i in 1:length(variable_counts)) {
# num_vars <- variable_counts[i]
# data <- generate_correlated_data(num_samples, num_vars)
# boot_res <- bootstrap_correlation(data, num_bootstrap)
# bootstrap_results[[as.character(num_vars)]] <- boot_res
#
# # Extract and store percentiles
# boot_percentiles <- apply(boot_res$t, 2, function(x) quantile(x, probs = c(0.025, 0.5, 0.975)))
# percentiles[i, "2.5%"] <- boot_percentiles[1]
# percentiles[i, "50%"] <- boot_percentiles[2]
# percentiles[i, "97.5%"] <- boot_percentiles[3]}
#
# Convert the percentiles matrix to a dataframe
#percentiles_df <- as.data.frame(percentiles)
#percentiles_df$Number_of_Variables <- as.numeric(row.names(percentiles_df))
#
# Melt the dataframe into a long format
#long_df <- melt(percentiles_df, id.vars = 'Number_of_Variables', variable.name = 'Percentile', value.name = #'Value')
#
# Spread the data into a wide format for median and CIs
#wide_df <- long_df %>%
# pivot_wider(id_cols = Number_of_Variables, names_from = Percentile, values_from = Value) %>%
# rename(Median = `50%`, Val25 = `2.5%`, Val975 = `97.5%`)
#
# Plotting
#ggplot(wide_df, aes(x = Number_of_Variables, y = Median)) +
# geom_line(color = 'blue') +
# geom_ribbon(aes(ymin = Val25, ymax = Val975), alpha = 0.2, fill = "blue") +
# labs(title = "Bootstrapped Composite Correlations With Empirical Confidence Intervals",
# x = "Number of Variables",
# y = "Composite Correlation") +
# my_theme() +
# scale_y_continuous(limits = c(0, 1))
sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.1.2 reshape2_1.4.4 ggplot2_3.4.4 boot_1.3-28.1 coda_0.19-4
## [6] MASS_7.3-60 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.4 jsonlite_1.8.7 highr_0.10 compiler_4.3.1
## [5] tidyselect_1.2.0 Rcpp_1.0.11 stringr_1.5.1 jquerylib_0.1.4
## [9] scales_1.2.1 yaml_2.3.7 fastmap_1.1.1 lattice_0.22-5
## [13] R6_2.5.1 plyr_1.8.8 labeling_0.4.3 generics_0.1.3
## [17] knitr_1.45 tibble_3.2.1 munsell_0.5.0 bslib_0.6.0
## [21] pillar_1.9.0 rlang_1.1.1 utf8_1.2.3 stringi_1.7.12
## [25] cachem_1.0.8 xfun_0.39 sass_0.4.7 cli_3.6.1
## [29] withr_2.5.2 magrittr_2.0.3 digest_0.6.33 grid_4.3.1
## [33] rstudioapi_0.15.0 lifecycle_1.0.4 vctrs_0.6.3 evaluate_0.23
## [37] glue_1.6.2 farver_2.1.1 fansi_1.0.4 colorspace_2.1-0
## [41] rmarkdown_2.25 tools_4.3.1 pkgconfig_2.0.3 htmltools_0.5.7