About

It’s surprisingly difficult to generate random correlation matrices. This notebook explores some methods. The problem with simply filling a symmetric matrix with random uniform values is that the entries can be mutually inconsistent, producing a matrix that is not positive definite and hence not a valid correlation matrix. Furthermore, we would usually like at least some control over the average correlation and the distribution of correlations, since these can affect the results of studies based on simulated data.
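
As a quick illustration of the problem, here is a minimal sketch (not based on any of the packages used below): fill the off-diagonal of a symmetric matrix with uniform values and check whether the result is positive definite via its smallest eigenvalue. For 10 variables, essentially none of these matrices are valid correlation matrices.

#naive approach: symmetric matrix with uniform off-diagonal entries (illustration only)
naive_corr = function(p) {
  m = diag(p)
  m[lower.tri(m)] = runif(p * (p - 1) / 2, min = -1, max = 1)
  m[upper.tri(m)] = t(m)[upper.tri(m)]
  m
}

#proportion of 100 such matrices that are positive definite
set.seed(1)
mean(replicate(100, min(eigen(naive_corr(10))$values) > 0))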

Init

#global options
options(
  digits = 2,
  contrasts = c("contr.treatment", "contr.treatment")
)

#packages
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  randcorr,
  clusterGeneration,
  TruncatedNormal
)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
#ggplot2
theme_set(theme_bw())

Functions

#get summary stats for the unique correlations in a matrix
get_stats = function(m) {
  m %>% MAT_half() %>% describe2(all_vars = T)
}
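
For readers without the kirkegaard package, a rough base-R stand-in might look like the sketch below (assuming MAT_half() extracts the unique off-diagonal correlations and describe2() returns basic descriptive statistics; both are assumptions about those helpers).

#rough base-R stand-in for get_stats (illustration only)
get_stats_base = function(m) {
  r = m[lower.tri(m)]
  data.frame(mean = mean(r), sd = sd(r), min = min(r), max = max(r))
}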

Packages

randcorr

This implements the method described in the reference below; a rough sketch of the underlying construction follows it.

  • Mohsen Pourahmadi and Xiao Wang, Distribution of random correlation matrices: Hyperspherical parameterization of the Cholesky factor, Statistics & Probability Letters, Volume 106, Pages 5-12, 2015.
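
The core idea is to parameterize the Cholesky factor L of the correlation matrix with angles so that every row of L has unit norm; L %*% t(L) then automatically has ones on the diagonal and is positive semi-definite. Below is a minimal sketch of that construction (an illustration only, not the randcorr implementation: the package additionally draws the angles from specific densities so that the resulting matrices are uniformly distributed over the space of correlation matrices).

#sketch of the hyperspherical Cholesky construction (illustration only)
#angles are uniform on (0, pi) here, which is not the distribution randcorr uses
hypersphere_corr = function(p) {
  L = diag(p)
  for (i in 2:p) {
    theta = runif(i - 1, min = 0, max = pi)
    s = c(1, cumprod(sin(theta)))       #running products of sines
    L[i, 1:(i - 1)] = cos(theta) * s[1:(i - 1)]
    L[i, i] = s[i]                      #ensures row i has unit norm
  }
  L %*% t(L)
}
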
#generate random correlation matrix
#the function has no parameters aside from the number of variables
set.seed(1)
randcorr::randcorr(10)
##         [,1]     [,2]     [,3]   [,4]  [,5]   [,6]   [,7]   [,8]   [,9]  [,10]
##  [1,]  1.000  2.5e-01 -7.3e-02  0.119 -0.61 -0.131  0.324 -0.193 -0.292 -0.228
##  [2,]  0.248  1.0e+00  9.2e-05  0.036 -0.45 -0.360 -0.146 -0.410 -0.373 -0.086
##  [3,] -0.073  9.2e-05  1.0e+00  0.606  0.19 -0.169 -0.514  0.048 -0.139  0.039
##  [4,]  0.119  3.6e-02  6.1e-01  1.000 -0.18 -0.068 -0.191 -0.230 -0.314  0.231
##  [5,] -0.614 -4.5e-01  1.9e-01 -0.177  1.00  0.175 -0.449  0.200  0.211  0.410
##  [6,] -0.131 -3.6e-01 -1.7e-01 -0.068  0.17  1.000  0.171  0.610 -0.009  0.089
##  [7,]  0.324 -1.5e-01 -5.1e-01 -0.191 -0.45  0.171  1.000  0.286 -0.015  0.185
##  [8,] -0.193 -4.1e-01  4.8e-02 -0.230  0.20  0.610  0.286  1.000  0.079 -0.033
##  [9,] -0.292 -3.7e-01 -1.4e-01 -0.314  0.21 -0.009 -0.015  0.079  1.000 -0.329
## [10,] -0.228 -8.6e-02  3.9e-02  0.231  0.41  0.089  0.185 -0.033 -0.329  1.000
#it also has a bias towards negative correlations, whereas one might expect the mean correlation to be 0
map_df(1:10, ~randcorr::randcorr(10) %>% get_stats())

clusterGeneration / simstudy

Both packages offer the same underlying function. It is based on the partial correlation approach described in the reference below; a rough sketch of the idea follows it.

  • Joe, H. (2006) Generating Random Correlation Matrices Based on Partial Correlations. Journal of Multivariate Analysis, 97, 2177–2189.
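
The core idea is to draw partial correlations freely on (−1, 1) and convert them recursively into ordinary correlations; any such set of partial correlations yields a valid, positive definite correlation matrix. Below is a minimal C-vine style sketch of that idea (an illustration only, not the rcorrmatrix implementation; the beta draws here are an arbitrary choice, whereas in the package the alphad parameter governs the distribution).

#sketch of the partial correlation (vine) idea (illustration only)
vine_corr = function(p, shape = 2) {
  partials = matrix(0, p, p)
  R = diag(p)
  for (k in 1:(p - 1)) {
    for (i in (k + 1):p) {
      partials[k, i] = 2 * rbeta(1, shape, shape) - 1  #partial correlation in (-1, 1)
      r = partials[k, i]
      #convert the partial correlation to an ordinary correlation
      if (k > 1) {
        for (l in (k - 1):1) {
          r = r * sqrt((1 - partials[l, i]^2) * (1 - partials[l, k]^2)) +
            partials[l, i] * partials[l, k]
        }
      }
      R[k, i] = r
      R[i, k] = r
    }
  }
  R
}
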
#generate random correlation matrix
set.seed(1)
clusterGeneration::rcorrmatrix(10)
##          [,1]    [,2]   [,3]   [,4]    [,5]    [,6]   [,7]   [,8]   [,9]  [,10]
##  [1,]  1.0000 -0.2237 -0.126  0.173  0.0016 -0.0884  0.049 -0.302 -0.319 -0.440
##  [2,] -0.2237  1.0000  0.066  0.510  0.4743  0.0084  0.262  0.395  0.250  0.138
##  [3,] -0.1264  0.0655  1.000 -0.298 -0.0516  0.4006 -0.654 -0.118  0.012  0.127
##  [4,]  0.1730  0.5102 -0.298  1.000  0.5612 -0.1258  0.486 -0.056 -0.071 -0.036
##  [5,]  0.0016  0.4743 -0.052  0.561  1.0000  0.1176  0.364  0.086 -0.040  0.125
##  [6,] -0.0884  0.0084  0.401 -0.126  0.1176  1.0000 -0.293 -0.067 -0.691  0.155
##  [7,]  0.0493  0.2621 -0.654  0.486  0.3641 -0.2929  1.000  0.174  0.040  0.289
##  [8,] -0.3016  0.3954 -0.118 -0.056  0.0859 -0.0669  0.174  1.000  0.264  0.388
##  [9,] -0.3193  0.2498  0.012 -0.071 -0.0402 -0.6912  0.040  0.264  1.000  0.206
## [10,] -0.4401  0.1384  0.127 -0.036  0.1249  0.1554  0.289  0.388  0.206  1.000
#it also has a bias towards negative correlations
map_df(1:10, ~clusterGeneration::rcorrmatrix(10) %>% get_stats())
#it has one parameter, alphad, that controls the typical size of the correlations
map_df(1:10, ~clusterGeneration::rcorrmatrix(10, alphad = 1000) %>% get_stats())
map_df(1:10, ~clusterGeneration::rcorrmatrix(10, alphad = 1/1000) %>% get_stats())
#but even at extreme values of alphad, the range (max minus min) doesn't seem to reach 2, i.e. the correlations never span the full -1 to 1 interval

Julius’ method

Method from https://stackoverflow.com/questions/41942275/simulation-of-correlated-data-with-specifications-on-covariance-matrix with my modifications.

#define it
julius_method = function(p, rfun = rnorm, seed_values = NULL) {
  #use supplied seed values, or draw p*p values from rfun
  if (is.null(seed_values)) {
    num_input = rfun(p * p)
  } else {
    num_input = seed_values
  }

  #random square matrix -> cross product as a covariance matrix -> rescale to correlations
  rmat = matrix(num_input, nrow = p, ncol = p)
  cov_mat = rmat %*% t(rmat)
  corr_mat = cov_mat / sqrt(diag(cov_mat) %*% t(diag(cov_mat)))
  corr_mat
}

#test it
set.seed(1)
julius_method(10)
##         [,1]    [,2]    [,3]   [,4]   [,5]   [,6]   [,7]  [,8]   [,9]   [,10]
##  [1,]  1.000 -0.0253  0.1044 -0.469 -0.232  0.099 -0.619  0.31  0.413  0.7771
##  [2,] -0.025  1.0000  0.0046  0.019  0.529 -0.182 -0.247 -0.18 -0.520  0.0018
##  [3,]  0.104  0.0046  1.0000 -0.174 -0.024  0.370 -0.356 -0.18 -0.268  0.2667
##  [4,] -0.469  0.0195 -0.1742  1.000 -0.276 -0.415 -0.036  0.33 -0.255 -0.1948
##  [5,] -0.232  0.5290 -0.0244 -0.276  1.000  0.511  0.036 -0.40 -0.246 -0.2749
##  [6,]  0.099 -0.1816  0.3702 -0.415  0.511  1.000 -0.322 -0.49 -0.047 -0.1443
##  [7,] -0.619 -0.2470 -0.3559 -0.036  0.036 -0.322  1.000 -0.13  0.236 -0.4105
##  [8,]  0.306 -0.1799 -0.1790  0.326 -0.398 -0.488 -0.133  1.00  0.334  0.5086
##  [9,]  0.413 -0.5196 -0.2684 -0.255 -0.246 -0.047  0.236  0.33  1.000  0.3047
## [10,]  0.777  0.0018  0.2667 -0.195 -0.275 -0.144 -0.411  0.51  0.305  1.0000
#10 reps
map_df(1:10, ~julius_method(10) %>% get_stats())
#try another input function to force positive correlations only
map_df(1:10, ~julius_method(10, rfun = runif) %>% get_stats())
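
As an aside, the rescaling step in julius_method() is what base R's cov2cor() does, so the same idea can be written more compactly (a sketch equivalent in spirit to the function above):

#same idea, with stats::cov2cor handling the covariance-to-correlation step
julius_method_cov2cor = function(p, rfun = rnorm) {
  rmat = matrix(rfun(p * p), nrow = p, ncol = p)
  cov2cor(rmat %*% t(rmat))
}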

PCA method

Assume the data are generated from a PCA-like latent variable model: draw random loadings on a specified number of latent dimensions, generate observed scores from the latent scores (optionally adding noise), and take the correlation matrix of the result.

pca_method = function(p, latent_dims = 2, loadings = "runif", rnorm_sd = 0.2, lower_bound = -1, upper_bound = 1, noise = 0) {
  #generate random scores on the latent dimensions
  latent_x = MASS::mvrnorm(1000, mu = rep(0, latent_dims), Sigma = diag(latent_dims))
  
  #loadings method: user-supplied values, uniform, or truncated normal
  if (is.numeric(loadings)) {
    loadings = matrix(loadings, nrow = p, ncol = latent_dims)
  } else if (loadings == "runif") {
    loadings = matrix(runif(p*latent_dims, min = lower_bound, max = upper_bound), nrow = p, ncol = latent_dims)
  } else if (loadings == "rtnorm") {
    loadings = matrix(TruncatedNormal::rtnorm(p*latent_dims, lb = lower_bound, ub = upper_bound, sd = rnorm_sd), nrow = p, ncol = latent_dims)
  }

  #uncorrelated mvrnorm data, scaled by the noise parameter
  x = MASS::mvrnorm(1000, mu = rep(0, p), Sigma = diag(p)) * noise
  
  #observed scores = latent scores times loadings, plus noise
  x2 = (latent_x %*% t(loadings)) + x

  #get correlations
  cor(x2)
}

#test it
set.seed(1)
pca_method(10)
##         [,1]  [,2]   [,3]   [,4]   [,5]  [,6]  [,7]  [,8]  [,9]  [,10]
##  [1,]  1.000 -0.65  0.803  0.662 -0.999  0.98 -0.88 -0.95 -0.41 -0.029
##  [2,] -0.651  1.00 -0.070 -1.000  0.686 -0.50  0.93  0.86  0.96  0.778
##  [3,]  0.803 -0.07  1.000  0.084 -0.774  0.90 -0.43 -0.58  0.21  0.572
##  [4,]  0.662 -1.00  0.084  1.000 -0.696  0.52 -0.94 -0.86 -0.96 -0.769
##  [5,] -0.999  0.69 -0.774 -0.696  1.000 -0.97  0.90  0.96  0.45  0.076
##  [6,]  0.984 -0.50  0.897  0.515 -0.974  1.00 -0.78 -0.88 -0.24  0.152
##  [7,] -0.882  0.93 -0.427 -0.937  0.903 -0.78  1.00  0.99  0.79  0.497
##  [8,] -0.950  0.86 -0.575 -0.863  0.963 -0.88  0.99  1.00  0.68  0.341
##  [9,] -0.412  0.96  0.213 -0.956  0.454 -0.24  0.79  0.68  1.00  0.923
## [10,] -0.029  0.78  0.572 -0.769  0.076  0.15  0.50  0.34  0.92  1.000
#now the correlations span essentially the full -1 to 1 range
map_df(1:10, ~pca_method(10) %>% get_stats())
#if we want only positive correlations, we can use another distribution of loadings, e.g., non-negative
map_df(1:10, ~pca_method(10, lower_bound = 0) %>% get_stats())
#we can add noise to get smaller correlations overall
map_df(1:10, ~pca_method(10, noise = 3) %>% get_stats())

Meta

write_sessioninfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_DK.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_DK.UTF-8        LC_COLLATE=en_DK.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_DK.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Copenhagen
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] TruncatedNormal_2.3     clusterGeneration_1.3.8 MASS_7.3-61            
##  [4] randcorr_1.0            kirkegaard_2024-09-27   psych_2.4.6.26         
##  [7] assertthat_0.2.1        weights_1.0.4           Hmisc_5.1-3            
## [10] magrittr_2.0.3          lubridate_1.9.3         forcats_1.0.0          
## [13] stringr_1.5.1           dplyr_1.1.4             purrr_1.0.2            
## [16] readr_2.1.5             tidyr_1.3.1             tibble_3.2.1           
## [19] ggplot2_3.5.1           tidyverse_2.0.0        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1    fastmap_1.2.0       digest_0.6.37      
##  [4] rpart_4.1.23        timechange_0.3.0    lifecycle_1.0.4    
##  [7] cluster_2.1.6       survival_3.7-0      gdata_3.0.1        
## [10] compiler_4.4.1      rlang_1.1.4         sass_0.4.9         
## [13] tools_4.4.1         utf8_1.2.4          yaml_2.3.10        
## [16] data.table_1.16.2   spacefillr_0.3.3    knitr_1.48         
## [19] htmlwidgets_1.6.4   mnormt_2.1.1        alabama_2023.1.0   
## [22] nleqslv_3.3.5       numDeriv_2016.8-1.1 withr_3.0.1        
## [25] foreign_0.8-86      nnet_7.3-19         grid_4.4.1         
## [28] fansi_1.0.6         jomo_2.7-6          colorspace_2.1-1   
## [31] mice_3.16.0         scales_1.3.0        gtools_3.9.5       
## [34] iterators_1.0.14    cli_3.6.3           rmarkdown_2.28     
## [37] qrng_0.0-10         generics_0.1.3      rstudioapi_0.17.1  
## [40] tzdb_0.4.0          minqa_1.2.8         cachem_1.1.0       
## [43] splines_4.4.1       parallel_4.4.1      base64enc_0.1-3    
## [46] vctrs_0.6.5         boot_1.3-30         glmnet_4.1-8       
## [49] Matrix_1.6-5        jsonlite_1.8.9      hms_1.1.3          
## [52] mitml_0.4-5         Formula_1.2-5       htmlTable_2.4.3    
## [55] foreach_1.5.2       jquerylib_0.1.4     glue_1.8.0         
## [58] nloptr_2.1.1        pan_1.9             codetools_0.2-19   
## [61] stringi_1.8.4       gtable_0.3.6        shape_1.4.6.1      
## [64] lme4_1.1-35.5       munsell_0.5.1       pillar_1.9.0       
## [67] htmltools_0.5.8.1   R6_2.5.1            evaluate_1.0.1     
## [70] lattice_0.22-5      backports_1.5.0     broom_1.0.7        
## [73] bslib_0.8.0         Rcpp_1.0.13         gridExtra_2.3      
## [76] nlme_3.1-165        checkmate_2.3.2     xfun_0.48          
## [79] pkgconfig_2.0.3
#upload to OSF
#avoid uploading the data in case they freak out again
if (F) {
  library(osfr)
  
  #auth
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload files
  #overwrite existing (versioning)
  osf_upload(osf_proj, conflicts = "overwrite", 
             path = c(
               "figs",
               "data",
               "notebook.html",
               "notebook.Rmd",
             ))
}