Date last code update: 2020-11-19
Date last ran: 2020-11-30



#to avoid confusion, keep only demo variables we need
df_demo <- df_demo[,c('record_id', 'group', 'demo_sex', 'site', 'scanner')] 

#split up dfs by scanner
df_GE <- df[substr(rownames(df), 7, 9) == 'CMH',]
df_PRISMA <- df[substr(rownames(df), 7, 9) != 'CMH',]

#function to variable define sets for CCA
set_fn <- function(df, string){
  set <- df[, grep(string, names(df))]
  return(set)
}

#run function to define sets
Xset_GE <- set_fn(df_GE, 'FA_')
Yset_GE <- set_fn(df_GE, 'cog_')
Xset_PRISMA <- set_fn(df_PRISMA, 'FA_')
Yset_PRISMA <- set_fn(df_PRISMA, 'cog_')

#function to split scanner dfs by diagnostic group
diag_fn <- function(df, group){
  diag_df <- df[rownames(df) %in% df_demo$record_id[df_demo$group == group],]
  return(diag_df)
}

#split up scanner dfs by diagnosis
df_GE_HC <- diag_fn(df_GE, 'HC')
df_GE_SSD <- diag_fn(df_GE, 'SSD')
df_PRISMA_HC <- diag_fn(df_PRISMA, 'HC')
df_PRISMA_SSD <- diag_fn(df_PRISMA, 'SSD')

#determine the number of variates (smaller of the two sets)
variateCount <- min(length(grep('FA_', colnames(df))), length(grep('cog', colnames(df))))

Overview

Here, we perform canonical correlation analysis (CCA) on the final sample of n=308 eligible participants in the SPINS study, who maintained eligibility throughout, and did not have excessive missing data. The sample is comprised of n=180 SSD and 128 HC; n=190 men and n=118 women. We conduct separate CCAs based on scanner: GE Discovery (CMH) and Siemens Prisma (CMP, MRP, ZHP): a total of n=135 are from the GE scanner (CMH) (), and 173 from the combined PRISMA sites.

We have elected to include 35 variables in the CCA analysis. Our \(X\) set is comprised of FA estimates in 19 white matter tracts, representing association, projection, and commissural fibers. Our \(Y\) set is comprised of 16 neurocognition and social cognition variables.


Data structure

[1] Review correlation matrices. Below, we summarise correlations within the X and Y sets, and the cross correlation matrix. We see that variables within sets are more highly correlated than between sets. We also see that the X (brain) variables are more highly correlated than the Y (behaviour) variables.

#run correlations (note: identical to pairwise complete, `cor(X, use='p')`)
corr_GE <- matcor(Xset_GE, Yset_GE)
corr_PRISMA <- matcor(Xset_PRISMA, Yset_PRISMA)

#function to describe correlations -- excluding the 1s from the diagonal
tab_corr_fn <- function(corr){
  data.frame(as.list(summary(c(corr[upper.tri(corr)], corr[lower.tri(corr)])))) 
}

#function to plot correlation matrices
plotCorr_fn <- function(df){
  ggcorrplot(df) + 
  theme_classic(base_size = 6) +
  theme(legend.position = 'top',
        axis.text.x = element_text(angle = 90, vjust=.5, hjust=1),
        axis.title= element_blank())
}

GE X correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.254 0.349 0.514 0.479 0.604 0.854

GE Y correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.069 0.238 0.314 0.328 0.423 0.641

GE XY correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.35 0.053 0.204 0.233 0.414 0.854

PRISMA X correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.194 0.42 0.541 0.506 0.606 0.818

PRISMA Y correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.04 0.253 0.37 0.344 0.436 0.699

PRISMA XY correlation matrix
Min. X1st.Qu. Median Mean X3rd.Qu. Max.
-0.194 0.108 0.253 0.28 0.451 0.818



[2] Test of multicollinearity. Collinearity refers to two or more variables being correlated with each other, and multicollinearity indicates collinearity between three or more variables even if no pair has a particularly high correlation. This latter case of multicollinearity indicates a redundancy between variables, rendering any solution unstable and difficult to interpret. If the pair-wise correlation coefficient between two variables is high (say >0.8) then it is possible that multicollinearity exists; however, this it is not sufficient nor necessary condition for the detection of multicollinearity, as a linear relation in CCA involves many variables. One common approach to identify collinearity among explanatory variables is the use of variance inflation factors (VIF).. The VIF is an indicator of how much the variance of the coefficient is inflated compared to the non-significant correlations with any other variables in the model. The VIF is calculated as 1 divided by tolerance, where tolerence is defined as 1 minus R^2. The higher the VIF value, the higher the collinearity. Typically, VIF = 1 indicates no correlation; > 1 < 5 indicates moderate correlation, >5 high correlation. Here, I use the usdm package, which allows for the calculation of VIF within a variable set, instead of a model. We see evidence of multicollinearity in some social cognition variables, and corpus collosum fibers.

GE X set

usdm::vif(Xset_GE)
##                    Variables      VIF
## 1          FA_CB_left_impCor 3.156656
## 2         FA_CB_right_impCor 4.088937
## 3          FA_UF_left_impCor 3.579958
## 4         FA_UF_right_impCor 5.247063
## 5          FA_TF_left_impCor 4.602655
## 6         FA_TF_right_impCor 4.485036
## 7  FA_CC1_commissural_impCor 1.436866
## 8  FA_CC2_commissural_impCor 4.741639
## 9  FA_CC4_commissural_impCor 4.874303
## 10 FA_CC6_commissural_impCor 3.587645
## 11         FA_AF_left_impCor 2.057726
## 12        FA_AF_right_impCor 2.889302
## 13        FA_ILF_left_impCor 2.960479
## 14       FA_ILF_right_impCor 3.965585
## 15       FA_IOFF_left_impCor 3.183453
## 16      FA_IOFF_right_impCor 3.240159
## 17 FA_CC3_commissural_impCor 7.130455
## 18 FA_CC5_commissural_impCor 2.527392
## 19 FA_CC7_commissural_impCor 3.058379

GE Y set

usdm::vif(Yset_GE)
##                          Variables      VIF
## 1                 ncog_work_memory 2.617277
## 2               ncog_process_speed 2.283526
## 3                    ncog_attn_vig 2.246677
## 4             ncog_verbal_learning 2.184679
## 5             ncog_visual_learning 1.577140
## 6             ncog_problem_solving 1.439629
## 7                     scog_TASIT_1 1.589389
## 8             scog_TASIT_2_sincere 1.269682
## 9       scog_TASIT_2_simpleSarcasm 2.182791
## 10 scog_TASIT_2_paradoxicalSarcasm 2.680300
## 11               scog_TASIT_3_lies 1.690536
## 12            scog_TASIT_3_sarcasm 2.200003
## 13                      scog_ER_40 1.613149
## 14                       scog_RMET 1.800373
## 15                        scog_RAD 2.509319
## 16                         scog_EA 1.460597

PRISMA X set

usdm::vif(Xset_PRISMA)
##                    Variables      VIF
## 1          FA_CB_left_impCor 3.956252
## 2         FA_CB_right_impCor 4.798314
## 3          FA_UF_left_impCor 3.089530
## 4         FA_UF_right_impCor 4.338133
## 5          FA_TF_left_impCor 3.525233
## 6         FA_TF_right_impCor 4.556384
## 7  FA_CC1_commissural_impCor 2.535319
## 8  FA_CC2_commissural_impCor 3.883679
## 9  FA_CC4_commissural_impCor 6.023356
## 10 FA_CC6_commissural_impCor 3.200351
## 11         FA_AF_left_impCor 4.337154
## 12        FA_AF_right_impCor 4.854775
## 13        FA_ILF_left_impCor 3.814658
## 14       FA_ILF_right_impCor 3.818511
## 15       FA_IOFF_left_impCor 3.739780
## 16      FA_IOFF_right_impCor 2.849058
## 17 FA_CC3_commissural_impCor 4.943682
## 18 FA_CC5_commissural_impCor 3.325411
## 19 FA_CC7_commissural_impCor 2.442621

PRISMA Y set

usdm::vif(Yset_PRISMA)
##                          Variables      VIF
## 1                 ncog_work_memory 1.640520
## 2               ncog_process_speed 2.984245
## 3                    ncog_attn_vig 1.640637
## 4             ncog_verbal_learning 2.468598
## 5             ncog_visual_learning 2.696522
## 6             ncog_problem_solving 1.705827
## 7                     scog_TASIT_1 2.007819
## 8             scog_TASIT_2_sincere 1.185962
## 9       scog_TASIT_2_simpleSarcasm 1.783048
## 10 scog_TASIT_2_paradoxicalSarcasm 2.107379
## 11               scog_TASIT_3_lies 1.414244
## 12            scog_TASIT_3_sarcasm 1.955146
## 13                      scog_ER_40 1.440844
## 14                       scog_RMET 1.681038
## 15                        scog_RAD 2.189323
## 16                         scog_EA 1.727740


[3] Visualization of homogeneity between groups (SSD and HC). We can review the covariance matrices, which tell us how much two random variables vary together (as opposed to correlation, which tells how a change in one variable is correspondance to a change in another). In our case, the covariance matrices are useful to compare between HC and SSD groups,

GE HC

heatmap_fn(df_GE_HC, 'GE_HC')[2]

## $colInd
##  [1] 18 12 14 13  1 16 24 23 21 26 25 19 17 20 22 15  5  4 32 31 27  3  8 28 34
## [26] 10 33 35  9 11  2  6  7 30 29

GE SSD

heatmap_fn(df_GE_SSD, 'GE_SSD')[2]

## $colInd
##  [1] 18 26 21 23 15  1 16 13 12 19 22 14 25 17 24 20  5  8 35 29 30 11  4 31 32
## [26] 28 27  9 34  2  3  7  6 10 33

PRISMA HC

heatmap_fn(df_PRISMA_HC, 'PRISMA_HC')[2]

## $colInd
##  [1] 22 19 21 18 25 17  1 26 24 20 23 13 14 15 12 16  4  5  8  7  6 10 33  9 11
## [26] 34 35 28 30  2  3 29 31 32 27

GE SSD

heatmap_fn(df_PRISMA_SSD, 'PRISMA_SSD')[2]

## $colInd
##  [1] 16  1 12 23 18 21 13 14 15 26 17 24 19 22 25 20  5  4 34 10  6  7  8  9 33
## [26] 29 30 35 11 27 32 31 28  2  3


[4] Test of homogeneity between groups. We can use Box’s M-test to test the null hypothesis that the covariance matrices are equal; i.e., that the observed covariance matrices of the variable sets are equal across groups. Box’s M-test is the multivariate version of the univariate assumption of homogeneity of variance (Levene’s test) and the bivariate assumption of homoscedasticity. We find that that the M test is significant at the PRISMA sites for both HC and SSD, but not at the GE sites. This may be ok – inhomogeneity of variance here is not unexpected. Second, Box’s M-test is prone to errors when the sample sizes are small or when the data do not meet model assumptions, especially the assumption of multivariate normality. For large samples, Box’s M test may be too strict, indicating heterogeneity when the covariance matrices are not very different.

GE X set

rstatix::box_m(Xset_GE, df_demo$group[df_demo$record_id %in% rownames(Xset_GE)]) %>%
  kable() %>% kable_styling()
statistic p.value parameter method
212.1391 0.1296046 190 Box’s M-test for Homogeneity of Covariance Matrices

GE Y set

rstatix::box_m(Yset_GE, df_demo$group[df_demo$record_id %in% rownames(Yset_GE)])%>%
  kable() %>% kable_styling()
statistic p.value parameter method
139.8406 0.393173 136 Box’s M-test for Homogeneity of Covariance Matrices

PRISMA X set

rstatix::box_m(Xset_PRISMA, df_demo$group[df_demo$record_id %in% rownames(Xset_PRISMA)]) %>%
  kable() %>% kable_styling()
statistic p.value parameter method
269.6434 0.0001278 190 Box’s M-test for Homogeneity of Covariance Matrices

PRISMA Y set

rstatix::box_m(Yset_PRISMA, df_demo$group[df_demo$record_id %in% rownames(Yset_PRISMA)]) %>%
  kable() %>% kable_styling()
statistic p.value parameter method
154.3998 0.1337217 136 Box’s M-test for Homogeneity of Covariance Matrices


CCA

#function to compute canonical correlation
cca_fn <- function(X, Y){
  cca_cca <- cc(X, Y) #CCA package
  cca_candisc <- cancor(X, Y) #cancor package (better tables/visualizations)
  rho <- cca_cca$cor #define rho
  correlations <- as.data.frame(unlist(cca_cca$cor)) #get canonical correation
return(list(cca=cca_cca, candisc=cca_candisc, rho=rho, correlations=correlations))
}

#run CCA function
CCA_GE <- cca_fn(Xset_GE, Yset_GE)
CCA_PRISMA <- cca_fn(Xset_PRISMA, Yset_PRISMA)

[1] Canonical correlations. We have quite high canonical correlations! As seen in the plot below, the first variate for GE has a correlation values of 0.7083486, and for PRISMA, 0.7186113 . The correlation values at the GE and PRISMA sites appear very similar.

#function to manipulate canonical correlation values
CCA_corr_fn <- function(CCA_df){
  CCA_df$correlations$variate <- 1:variateCount #
  names(CCA_df$correlations)[names(CCA_df$correlations) == 'unlist(cca_cca$cor)'] <- 'canonical_correlation'
  CCA_df$correlations$canonical_correlation <- round(CCA_df$correlations$canonical_correlation, 3)
  return(CCA_df)
}

#run function
CCA_GE <- CCA_corr_fn(CCA_GE)
CCA_PRISMA <- CCA_corr_fn(CCA_PRISMA)

#function for bar plot/scree plot
CCA_corrPlot_fn <- function(CCA_df, title){ 
ggplot(CCA_df[['correlations']], aes(x=variate, y=canonical_correlation)) + 
  geom_bar(stat='identity') +
  ggtitle(title) +
  theme_minimal() +
  theme(axis.text.x=element_blank()) + 
  geom_text(aes(label=canonical_correlation), hjust = 1.2, color= 'white', size=3.5) +
  coord_flip()
}

GE

CCA_corrPlot_fn(CCA_GE, 'Canonical correlations: GE')

PRISMA

CCA_corrPlot_fn(CCA_PRISMA, 'Canonical correlations: PRISMA')


[2] Statistical testing, F distribution. Next, we formally test for significance. There are 4 primary tests of significance in multivariate models, which all draw from the F distribution. We see that the first 3 tests, that similarly evaluate the entire CCA model, have similar p values, none of which reach significance. However, the final test, Roy’s Largest Root, is highly significant, p<0.001. Unlike the other 3 tests, Roy’s Largest Root considers only the largest eigenvector, i.e., it estimates significance on the basis of only the first canonical variate (hence the “largest root”). Note, however, that such a small value for Roy’s largest root can be a reflection of dimensionality, but also the consequence of non-normality or heterscedasticity, to which this estimate is especially sensitive.

#save variables for ease
n_GE <- nrow(df_GE)
n_PRISMA <- nrow(df_PRISMA)
p <- ncol(Xset_GE)
q <- ncol(Yset_GE)

#function to calculate p-values using the F-approximations of different statistical tests
p_fn <- function(df,n,p,q,tstat){
  df_p <- as.data.frame(CCP::p.asym(df$cca$cor, n, p, q, tstat = tstat)) #test
  df_p <- setDT(df_p, keep.rownames = 'variate')[] #add row names
  df_p$variate <- paste(df_p$variate, '-', q, sep='')
  return(df_p)
}

GE - Wilks’ Lambda

p_wilks <- p_fn(CCA_GE,n_GE,p,q,'Wilks')
## Wilks' Lambda, using F-approximation (Rao's F):
##                  stat     approx df1       df2    p.value
## 1 to 16:   0.05169834 1.14284404 304 1274.4319 0.06460694
## 2 to 16:   0.10376146 0.96788967 270 1208.7971 0.62603264
## 3 to 16:   0.17229675 0.84325391 238 1142.0825 0.94924539
## 4 to 16:   0.26274029 0.72701221 208 1074.2714 0.99777363
## 5 to 16:   0.33814500 0.68027480 180 1005.3418 0.99928612
## 6 to 16:   0.43036545 0.61668976 154  935.2646 0.99988180
## 7 to 16:   0.53531372 0.53996414 130  864.0000 0.99998966
## 8 to 16:   0.63354171 0.47418101 108  791.4927 0.99999827
## 9 to 16:   0.70997899 0.43730171  88  717.6629 0.99999799
## 10 to 16:  0.78338239 0.39238628  70  642.3904 0.99999786
## 11 to 16:  0.85072256 0.33734326  54  565.4863 0.99999798
## 12 to 16:  0.90604710 0.27851411  40  486.6323 0.99999741
## 13 to 16:  0.94536882 0.22727806  28  405.2439 0.99999149
## 14 to 16:  0.97787002 0.14125877  18  320.0975 0.99999099
## 15 to 16:  0.99562379 0.05005315  10  228.0000 0.99999282
## 16 to 16:  0.99856612 0.04128318   4  115.0000 0.99672336

GE - Hotelling-Lawley Trace

p_hotelling <- p_fn(CCA_GE,n_GE,p,q,'Hotelling')
##  Hotelling-Lawley Trace, using F-approximation:
##                   stat     approx df1  df2    p.value
## 1 to 16:   3.654923015 1.17973461 304 1570 0.02747707
## 2 to 16:   2.647866989 0.98191734 270 1602 0.56889694
## 3 to 16:   1.987358783 0.85276897 238 1634 0.94135908
## 4 to 16:   1.462429913 0.73209382 208 1666 0.99782591
## 5 to 16:   1.175436608 0.69301783 180 1698 0.99909019
## 6 to 16:   0.902712035 0.63380350 154 1730 0.99981908
## 7 to 16:   0.658853541 0.55812497 130 1762 0.99998273
## 8 to 16:   0.475357415 0.49351343 108 1794 0.99999680
## 9 to 16:   0.354706668 0.46001021  88 1826 0.99999562
## 10 to 16:  0.251318544 0.41691951  70 1858 0.99999491
## 11 to 16:  0.165357759 0.36172010  54 1890 0.99999508
## 12 to 16:  0.100325341 0.30128954  40 1922 0.99999394
## 13 to 16:  0.056926136 0.24828944  28 1954 0.99998180
## 14 to 16:  0.022546753 0.15547865  18 1986 0.99998335
## 15 to 16:  0.004391203 0.05538404  10 2018 0.99998912
## 16 to 16:  0.001435937 0.04599485   4 2050 0.99601627

GE - Pillai-Bartlett Trace

p_pillai <- p_fn(CCA_GE,n_GE,p,q,'Pillai')
##  Pillai-Bartlett Trace, using F-approximation:
##                   stat     approx df1  df2   p.value
## 1 to 16:   2.470762145 1.10535517 304 1840 0.1193418
## 2 to 16:   1.969004340 0.97297182 270 1872 0.6077810
## 3 to 16:   1.571229609 0.87116480 238 1904 0.9142885
## 4 to 16:   1.226997900 0.77306690 208 1936 0.9913708
## 5 to 16:   1.004002720 0.73200176 180 1968 0.9963421
## 6 to 16:   0.789718672 0.67428645 154 2000 0.9990868
## 7 to 16:   0.593668642 0.60231715 130 2032 0.9998665
## 8 to 16:   0.438622824 0.53867787 108 2064 0.9999692
## 9 to 16:   0.330961496 0.50308773  88 2096 0.9999663
## 10 to 16:  0.237260901 0.45758109  70 2128 0.9999677
## 11 to 16:  0.158104465 0.39920593  54 2160 0.9999738
## 12 to 16:  0.097043021 0.33440055  40 2192 0.9999742
## 13 to 16:  0.055448965 0.27622177  28 2224 0.9999440
## 14 to 16:  0.022212240 0.17423777  18 2256 0.9999602
## 15 to 16:  0.004380436 0.06265739  10 2288 0.9999805
## 16 to 16:  0.001433878 0.05198273   4 2320 0.9949522

GE - Roy’s Largest Root

p_roy <- p_fn(CCA_GE,n_GE,p,q,'Roy')
##  Roy's Largest Root, using F-approximation:
##               stat   approx df1 df2              p.value
## 1 to 1:  0.5017578 7.427038  16 118 0.000000000009913514
## 
##  F statistic for Roy's Greatest Root is an upper bound.

PRISMA - Wilks’ Lambda

p_wilks <- p_fn(CCA_PRISMA,n_PRISMA,p,q,'Wilks')
## Wilks' Lambda, using F-approximation (Rao's F):
##                  stat     approx df1       df2       p.value
## 1 to 16:   0.06682737 1.41088520 304 1741.3837 0.00002123177
## 2 to 16:   0.13818793 1.13854827 270 1648.6789 0.07479064325
## 3 to 16:   0.21135105 1.00492076 238 1554.8682 0.47051186310
## 4 to 16:   0.29947271 0.88542965 208 1459.9293 0.86792751337
## 5 to 16:   0.39574973 0.78229811 180 1363.8331 0.98177304408
## 6 to 16:   0.51287500 0.65467574 154 1266.5409 0.99949646938
## 7 to 16:   0.60038056 0.59164977 130 1168.0000 0.99989553529
## 8 to 16:   0.68971414 0.51775675 108 1068.1369 0.99998537739
## 9 to 16:   0.75477470 0.48163176  88  966.8455 0.99998370750
## 10 to 16:  0.81425235 0.44270486  70  863.9666 0.99997859622
## 11 to 16:  0.87450345 0.37467330  54  759.2490 0.99998884298
## 12 to 16:  0.91438496 0.33829683  40  652.2704 0.99996412723
## 13 to 16:  0.95298388 0.26039904  28  542.2549 0.99996553741
## 14 to 16:  0.97473628 0.21587659  18  427.5778 0.99978709107
## 15 to 16:  0.98906855 0.16753277  10  304.0000 0.99819937710
## 16 to 16:  0.99804635 0.07487322   4  153.0000 0.98973987953

PRISMA - Hotelling-Lawley Trace

p_hotelling <- p_fn(CCA_PRISMA,n_PRISMA,p,q,'Hotelling') 
##  Hotelling-Lawley Trace, using F-approximation:
##                  stat     approx df1  df2         p.value
## 1 to 16:   3.33459307 1.49316277 304 2178 0.0000005092686
## 2 to 16:   2.26675880 1.15961504 270 2210 0.0465623474911
## 3 to 16:   1.73731230 1.02286087 238 2242 0.3970274018408
## 4 to 16:   1.32036777 0.90219841 208 2274 0.8318775305389
## 5 to 16:   0.99887932 0.79979712 180 2306 0.9741488830047
## 6 to 16:   0.70292141 0.66697657 154 2338 0.9993409628497
## 7 to 16:   0.53230369 0.60651910 130 2370 0.9998466128211
## 8 to 16:   0.38350876 0.53309494 108 2402 0.9999772859541
## 9 to 16:   0.28917901 0.49990178  88 2434 0.9999716087369
## 10 to 16:  0.21037716 0.46320542  70 2466 0.9999599666659
## 11 to 16:  0.13638155 0.39430684  54 2498 0.9999790564737
## 12 to 16:  0.09077680 0.35885205  40 2530 0.9999340148675
## 13 to 16:  0.04856381 0.27772431  28 2562 0.9999410214437
## 14 to 16:  0.02573824 0.23182294  18 2594 0.9996727297249
## 15 to 16:  0.01103450 0.18110365  10 2626 0.9975818610606
## 16 to 16:  0.00195747 0.08129617   4 2658 0.9881238705686

PRISMA - Pillai-Bartlett Trace

p_pillai <- p_fn(CCA_PRISMA,n_PRISMA,p,q,'Pillai')
##  Pillai-Bartlett Trace, using F-approximation:
##                   stat    approx df1  df2      p.value
## 1 to 16:   2.263763978 1.3270926 304 2448 0.0002963938
## 2 to 16:   1.747361730 1.1260961 270 2480 0.0876735284
## 3 to 16:   1.401193035 1.0130323 238 2512 0.4357971302
## 4 to 16:   1.106936959 0.9090602 208 2544 0.8141097460
## 5 to 16:   0.863659427 0.8165729 180 2576 0.9618719964
## 6 to 16:   0.635289428 0.7002194 154 2608 0.9978064412
## 7 to 16:   0.489539264 0.6409489 130 2640 0.9993994494
## 8 to 16:   0.360016650 0.5695069 108 2672 0.9998858300
## 9 to 16:   0.273817995 0.5350110  88 2704 0.9998772277
## 10 to 16:  0.200772278 0.4966906  70 2736 0.9998532062
## 11 to 16:  0.131874784 0.4259989  54 2768 0.9999261787
## 12 to 16:  0.088259117 0.3882754  40 2800 0.9998174685
## 13 to 16:  0.047755891 0.3027892  28 2832 0.9998573283
## 14 to 16:  0.025439702 0.2533866  18 2864 0.9993870455
## 15 to 16:  0.010949020 0.1983130  10 2896 0.9964524183
## 16 to 16:  0.001953646 0.0893902   4 2928 0.9857942095

PRISMA - Roy’s Largest Root

p_roy <- p_fn(CCA_PRISMA,n_PRISMA,p,q,'Roy')
##  Roy's Largest Root, using F-approximation:
##               stat   approx df1 df2 p.value
## 1 to 1:  0.5164022 10.41138  16 156       0
## 
##  F statistic for Roy's Greatest Root is an upper bound.


[3] Permutation test. In addition to/instead of the objective statistical test above, we should use permutation/randomizaiton, as we don’t know the population distribution in question. Like bootstrapping, a permutation test builds - rather than assumes - the sampling distribution by resampling the observed data. We create these null distributions by shuffling rows of Y set so they no longer correspond to X features. We do this 500 times. Our permutation test is significant. Below, we visualize where our our first variates’ canonical correlation value of 0.708 for GE, and 0.719 for PRISMA, fall on their respective null distributions.

Visualization

#set number of permutations
n <- 500

#write permutation function
permute_fn <- function(X, Y){
    Y_shuffle <- Y[sample(nrow(Y)),] #shuffle rows in Y set
    correlation <- cc(X,Y_shuffle)$cor[1] #compute CCA, take first correlation value
    list(correlation) #add values to a dataframe
}

#run permutation function (takes 30 sec)
GE_null <- lapply(seq_len(n), function(x) permute_fn(Xset_GE, Yset_GE))
PRISMA_null <- lapply(seq_len(n), function(x) permute_fn(Xset_PRISMA, Yset_PRISMA))

#function to manipulate null dataframes
null_fn <- function(df_null){
  df_null <-  unlist(df_null, use.names=FALSE) #extract all values from list
  df_null <- as.data.frame(round(df_null, 3)) #round values to 3 floating points
  names(df_null) <- 'correlation_value' #set names for ggplot
  df_null$permuation <- seq(1:500)
  return(df_null)
}

#overwrite null dfs with manipulation function
GE_null <- null_fn(GE_null)
PRISMA_null <- null_fn(PRISMA_null)

#plot frequency histogram
nullPlot_fn <- function(df_null, title, df_CCA){
  ggplot(data=df_null, aes(x=correlation_value)) + 
    geom_histogram(binwidth = .001) +
    theme_bw() +
    xlab(expression('permuted R'['c'])) +
    ylab('frequency') + 
    xlim(.48, .75) +
    ggtitle(title) +
    theme(axis.ticks=element_blank(), 
          panel.grid.minor = element_blank(), 
          panel.grid.major.x = element_blank()) + 
    geom_vline(xintercept = df_CCA$correlations[1,1], linetype="dashed", color = "darkgreen", size=1.5) +
    geom_text(x = .7, y = 6, label=df_CCA$cca$cor[1], color='darkgreen', parse=T)
}

GE

nullPlot_fn(GE_null, 'GE null correlation distribution (our observation in green)', CCA_GE)

PRISMA

nullPlot_fn(PRISMA_null, 'PRISMA null correlation distribution (our observation in red)', CCA_PRISMA)


Statistical test. Below, we verify that the permutation test for Roy’s largest root shows a significant result, for both GE and PRISMA.

#remove libraries until figure out where conflict lies
invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))

#function for permutation testing -- all variates
permutedStats_fn <- function(Xset, Yset, nboot=500, rhostart=1){
  wilks <- CCP::p.perm(Xset, Yset, nboot=nboot, rhostart=rhostart, type='Wilks')
  hotelling <- CCP::p.perm(Xset, Yset, nboot=nboot, rhostart=rhostart, type='Hotelling')
  pillai <- CCP::p.perm(Xset, Yset, nboot=nboot, rhostart=rhostart, type='Pillai')
  roy <- CCP::p.perm(Xset, Yset, nboot=nboot, rhostart=rhostart, type='Roy') 
  return(list(wilksPermuted=wilks, hotellingPermuted=hotelling, pillaiPermuted=pillai, royPermuted=roy))
}

#run permutation
GE_permuted <- permutedStats_fn(Xset_GE, Yset_GE)
PRISMA_permuted <- permutedStats_fn(Xset_PRISMA, Yset_PRISMA)

#load libraries again
lapply(libraries, require, character.only = T)

#pull out important results
tbl_permuted <- data.frame(matrix(ncol = 4, nrow = 2))
colnames(tbl_permuted) <- c('Wilks', 'Hotelling', 'Pillai', 'Roy')
tbl_permuted[1,1] <- GE_permuted$wilksPermuted$p.value
tbl_permuted[1,2] <- GE_permuted$hotellingPermuted$p.value
tbl_permuted[1,3] <- GE_permuted$pillaiPermuted$p.value
tbl_permuted[1,4] <- GE_permuted$royPermuted$p.value
tbl_permuted[2,1] <- PRISMA_permuted$wilksPermuted$p.value
tbl_permuted[2,2] <- PRISMA_permuted$hotellingPermuted$p.value
tbl_permuted[2,3] <- PRISMA_permuted$pillaiPermuted$p.value
tbl_permuted[2,4] <- PRISMA_permuted$royPermuted$p.value

row.names(tbl_permuted) <- c('GE', 'PRISMA')
#feed into pretty table
tbl_permuted %>% kable() %>% kable_styling() %>%
  column_spec(1, bold = T, border_right = T) %>%
  column_spec(5, bold=T, color='white',background = "green")
Wilks Hotelling Pillai Roy
GE 0.08 0.044 0.1 0.01
PRISMA 0.00 0.000 0.0 0.00

Model exploration

[1] Participant scores (CV1). Here, we show the association between scores in the X set and Y set, for each participant, for the first canonical variate, for both GE (Rc=0.708), and PRISMA (Rc=0.708). It appears that HCs have lower correspondence between the X and Y sets than do SSD.

#bind together data
GE_scores.plot <- as.data.frame(cbind(CCA_GE$cca$scores$xscores[,1], CCA_GE$cca$scores$yscores[,1]))
GE_scores.plot <- merge(GE_scores.plot, df_demo[, c('record_id', 'group')], by.x='row.names', by.y='record_id')

PRISMA_scores.plot <- as.data.frame(cbind(CCA_PRISMA$cca$scores$xscores[,1], CCA_PRISMA$cca$scores$yscores[,1]))
PRISMA_scores.plot <- merge(PRISMA_scores.plot, df_demo[, c('record_id', 'group')], by.x='row.names', by.y='record_id')

#make sure numeric
GE_scores.plot[2:3] <- sapply(GE_scores.plot[2:3],as.numeric)
PRISMA_scores.plot[2:3] <- sapply(PRISMA_scores.plot[2:3],as.numeric)

#change names for clarity
names(GE_scores.plot) <- c('record_id', 'Xset', 'Yset', 'group')
names(PRISMA_scores.plot) <- c('record_id','Xset', 'Yset', 'group')

#function for scatter plot
scoresPlot_fn <- function(df_scores){
ggplot(df_scores, aes(x=Xset, y=Yset)) +
  geom_point(size = 4, aes(shape=group), alpha=.8) +
  geom_smooth(method=lm, color="black", alpha=.4) + 
  xlim(-3, 3) +
  ylim(-2, 3) +
  theme_bw() +
  theme(axis.ticks=element_blank(), 
        panel.grid.minor = element_blank(), 
        panel.grid.major.x = element_blank(),
        legend.position='top') 
}

GE

scoresPlot_fn(GE_scores.plot)

PRISMA

scoresPlot_fn(PRISMA_scores.plot)


[2] Descriptive table (CV1). Here, we show standardized canonical function coefficients, structure coefficients, and squared structure coefficients, for the first canonical variate. The structure coefficients, \(rs\), are of most interest here. We can interpret the structure coefficients like regression coefficients, i.e., a one unit increase in the given variable would result in an increase/decrease of the \(rs\) value of the first canonical variate for the given variable’s set, when the other variables are held constant. There are interesting patterns in variable contributions here!

structure_fn <- function(X, Y){

  #calculate correlations
  rxx <- cor(X, use = "p") #pairwise complete observations
  ryy <- cor(Y, use = "p")
  rxy <- cor(X,Y, use = "p")
  
  #calculate omega
  omega = t(solve(chol(rxx))) %*% rxy %*% solve(chol(ryy)) 
  
  #decompose matrix
  usv <- svd(omega)
  
  #standardized weights (beta)
  x.weights.std <- solve(chol(rxx)) %*% usv$u
  y.weights.std <- solve(chol(ryy)) %*% usv$v
  
  #structure coefficients
  x.structures <- rxx %*% x.weights.std
  y.structures <- ryy %*% y.weights.std
  
  #canonical cross-loadings/redundancy
  x.scores <- as.matrix(X) %*% x.weights.std
  y.scores <- as.matrix(Y) %*% y.weights.std
  
  return(list(rxx=rxx, ryy=ryy, rxy=rxy, omega=omega, usv=usv, x.weights.std=x.weights.std, y.weights.std=y.weights.std, 
    x.structures=x.structures, y.structures=y.structures, x.scores=x.scores, y.scores=y.scores))

}

#use function
GE_structure <- structure_fn(Xset_GE, Yset_GE)
PRISMA_structure <- structure_fn(Xset_PRISMA, Yset_PRISMA)

#for table, take just first column
GE.x.weights.std <- t(data.frame(lapply(GE_structure$x.weights.std[,1], type.convert), stringsAsFactors = F))
PRISMA.x.weights.std <- t(data.frame(lapply(PRISMA_structure$x.weights.std[,1], type.convert), stringsAsFactors = F)) 

GE.y.weights.std <- t(data.frame(lapply(GE_structure$y.weights.std[,1], type.convert), stringsAsFactors = F))
PRISMA.y.weights.std <- t(data.frame(lapply(PRISMA_structure$y.weights.std[,1], type.convert), stringsAsFactors = F)) 

#bind together standardized weights
GE.weights <- rbind(GE.x.weights.std, GE.y.weights.std)
PRISMA.weights <- rbind(PRISMA.x.weights.std, PRISMA.y.weights.std)

#bind together structure coefficients
GE.structures <- rbind(GE_structure$x.structures, GE_structure$y.structures)
PRISMA.structures <- rbind(PRISMA_structure$x.structures, PRISMA_structure$y.structures)

#make table
GE.tbl <- as.data.frame(cbind(GE.weights, GE.structures))
PRISMA.tbl <- as.data.frame(cbind(PRISMA.weights, PRISMA.structures))

#add rownames as a variable
GE.tbl <- tibble::rownames_to_column(GE.tbl, "variable")
PRISMA.tbl <- tibble::rownames_to_column(PRISMA.tbl, "variable")

#redefine row names with full names
GE.tbl$variable <- c(names(Xset_GE), names(Yset_GE))
PRISMA.tbl$variable <- c(names(Xset_PRISMA), names(Yset_PRISMA))

#set up dynamic names for table
GE.label <- paste("$R_c$=", round(CCA_GE$cca$cor[1], 5))
GE.myHeader <- c(" " = 1, GE.label = 2) # create vector with colspan
names(GE.myHeader) <- c(" ", GE.label) #set names

PRISMA.label <- paste("$R_c$=", round(CCA_PRISMA$cca$cor[1], 5))
PRISMA.myHeader <- c(" " = 1, PRISMA.label = 2) # create vector with colspan
names(PRISMA.myHeader) <- c(" ", PRISMA.label) #set names

#make into pretty table
GE.tbl_plot <- GE.tbl
PRISMA.tbl_plot <- PRISMA.tbl

#take only first 3 columns -- adjust if want more variates
GE.tbl_plot <- GE.tbl_plot[, 1:3]
PRISMA.tbl_plot <- PRISMA.tbl_plot[, 1:3]

#add a column for the communality coefficient
GE.tbl_plot$communalities <- GE.tbl_plot$V2 ^ 2
PRISMA.tbl_plot$communalities <- PRISMA.tbl_plot$V2 ^ 2

#round
GE.tbl_plot[,2:4] <- round(GE.tbl_plot[,2:4], 3) #shorten here, because formatting makes characters (?!)
PRISMA.tbl_plot[,2:4] <- round(PRISMA.tbl_plot[,2:4], 3) 

#sort the variables in the table alphabetically
GE.tbl_plot <- GE.tbl_plot[order(GE.tbl_plot$variable),]
PRISMA.tbl_plot <- PRISMA.tbl_plot[order(PRISMA.tbl_plot$variable),]

#for a nice table, remove redundant characters from table
GE.tbl_plot$variable <- gsub("FA_|_impCor|ncog_|scog_", "", GE.tbl_plot$variable)
PRISMA.tbl_plot$variable <- gsub("FA_|_impCor|ncog_|scog_", "", PRISMA.tbl_plot$variable)

#function for table
structureTbl_fn <- function(tbl){
  tbl %>%
    mutate(V2 = ifelse((V2 > .45 | V2 < -.45),
                       cell_spec(V2, color='black', underline = T, bold = T), #no green
                       cell_spec(V2, color='black'))) %>%
    kable(escape=F,
          align = c('l', 'c', 'c', 'c'),
          col.names = c('_Variable_', '$\\beta$', '$r_s$', '$r_s^2$ ($h^2$)')) %>%
    kable_styling(bootstrap_options=c('striped', 'hover', 'condensed')) %>%
    add_header_above(c(" " = 1, "_Function 1_" = 3)) %>%
    pack_rows('X', 1, p) %>%
    pack_rows('Y', p+1, p+q) %>%
    row_spec(which(tbl$variable == 'ILF_right'),  background = 'coral') %>%
    row_spec(which(tbl$variable == 'AF_right'), background = 'coral') %>%
    row_spec(which(tbl$variable == 'UF_left'), background = 'coral') %>%
    row_spec(which(tbl$variable == 'ER_40'),  background = 'burlywood') %>%
    row_spec(which(tbl$variable == 'RMET'), background = 'burlywood') %>%
    row_spec(which(tbl$variable == 'EA'), background = 'burlywood') %>%
    row_spec(which(tbl$variable == 'TASIT_3_lies'), background = 'burlywood') %>%
    row_spec(which(tbl$variable == 'TASIT_2_simpleSarcasm'), background = 'lightseagreen') %>%
    row_spec(which(tbl$variable == 'TASIT_2_paradoxicalSarcasm'), background = 'lightseagreen') %>%
    row_spec(which(tbl$variable == 'TASIT_3_sarcasm'), background = 'lightseagreen') %>%
    footnote(general = "$r_s$ values > .450 are emphasized, following convention;
         $\\beta$ = standardized canonical function coefficient;
         $r_s$ = structure coefficient;
         $r_s^2$ = squared structure coefficient, here also communality $h^2$")
}

GE

structureTbl_fn(GE.tbl_plot)
Function 1
Variable \(\beta\) \(r_s\) \(r_s^2\) (\(h^2\))
X
AF_left -0.284 0.152 0.023
AF_right 0.283 -0.216 0.046
CB_left 0.060 -0.026 0.001
CB_right -0.285 -0.318 0.101
CC1_commissural -0.247 -0.207 0.043
CC2_commissural 0.511 -0.05 0.002
CC3_commissural -0.405 -0.501 0.251
CC4_commissural -0.261 -0.428 0.183
CC5_commissural 0.108 -0.066 0.004
CC6_commissural -0.055 -0.045 0.002
CC7_commissural -0.028 -0.074 0.006
ILF_left 0.313 0.588 0.345
ILF_right 0.270 -0.102 0.010
IOFF_left -0.403 -0.082 0.007
IOFF_right 0.334 0.094 0.009
TF_left 0.619 0.032 0.001
TF_right -0.752 -0.445 0.198
UF_left -0.016 -0.222 0.049
UF_right -0.165 -0.592 0.350
Y
attn_vig -0.147 -0.631 0.398
problem_solving 0.195 -0.212 0.045
process_speed -0.101 -0.561 0.315
verbal_learning 0.129 -0.47 0.221
visual_learning -0.501 -0.665 0.442
work_memory -0.378 -0.657 0.432
EA -0.269 -0.414 0.171
ER_40 -0.400 -0.493 0.243
RAD 0.242 -0.425 0.181
RMET 0.099 -0.259 0.067
TASIT_1 0.379 -0.036 0.001
TASIT_2_paradoxicalSarcasm 0.220 -0.299 0.089
TASIT_2_simpleSarcasm -0.260 -0.387 0.150
TASIT_2_sincere 0.144 0.11 0.012
TASIT_3_lies -0.015 -0.227 0.051
TASIT_3_sarcasm -0.326 -0.46 0.211
Note:
\(r_s\) values > .450 are emphasized, following convention;
\(\beta\) = standardized canonical function coefficient;
\(r_s\) = structure coefficient;
\(r_s^2\) = squared structure coefficient, here also communality \(h^2\)

PRISMA

structureTbl_fn(PRISMA.tbl_plot)
Function 1
Variable \(\beta\) \(r_s\) \(r_s^2\) (\(h^2\))
X
AF_left 0.270 0.085 0.007
AF_right -0.048 -0.529 0.280
CB_left -0.227 -0.279 0.078
CB_right 0.051 -0.447 0.200
CC1_commissural 0.187 -0.12 0.014
CC2_commissural 0.215 -0.39 0.152
CC3_commissural -0.664 -0.635 0.404
CC4_commissural 0.228 -0.449 0.201
CC5_commissural -0.079 -0.237 0.056
CC6_commissural 0.316 -0.084 0.007
CC7_commissural 0.151 -0.295 0.087
ILF_left -0.071 0.268 0.072
ILF_right -0.222 -0.41 0.168
IOFF_left 0.096 -0.288 0.083
IOFF_right -0.262 -0.327 0.107
TF_left 0.049 -0.195 0.038
TF_right -0.076 -0.559 0.312
UF_left 0.211 -0.508 0.258
UF_right -0.833 -0.834 0.696
Y
attn_vig -0.144 -0.548 0.301
problem_solving 0.235 -0.337 0.114
process_speed -0.758 -0.806 0.650
verbal_learning -0.085 -0.624 0.389
visual_learning 0.241 -0.489 0.239
work_memory 0.269 -0.301 0.090
EA -0.205 -0.545 0.297
ER_40 -0.070 -0.386 0.149
RAD -0.266 -0.698 0.487
RMET 0.111 -0.372 0.139
TASIT_1 -0.067 -0.557 0.310
TASIT_2_paradoxicalSarcasm -0.149 -0.608 0.370
TASIT_2_simpleSarcasm -0.041 -0.538 0.290
TASIT_2_sincere 0.030 -0.062 0.004
TASIT_3_lies 0.095 -0.195 0.038
TASIT_3_sarcasm -0.211 -0.582 0.339
Note:
\(r_s\) values > .450 are emphasized, following convention;
\(\beta\) = standardized canonical function coefficient;
\(r_s\) = structure coefficient;
\(r_s^2\) = squared structure coefficient, here also communality \(h^2\)

Tracts


[3] Structure coefficients (CV1). This is the same data as in the table above, but perhaps more clearly presented in this visualization.

#make a new dataframe
GE.tbl_sc <- GE.tbl_plot
PRISMA.tbl_sc <- PRISMA.tbl_plot

#keep only the structure coefficients variable
GE.tbl_sc <- subset(GE.tbl_sc, select = c(variable, V2))
PRISMA.tbl_sc <- subset(PRISMA.tbl_sc, select = c(variable, V2))

#make a logical variable for colour of bars
GE.tbl_sc$colour <- as.factor(ifelse(abs(GE.tbl_sc$V2) >= .45, 1, 0))
PRISMA.tbl_sc$colour <- as.factor(ifelse(abs(PRISMA.tbl_sc$V2) >= .45, 1, 0))

#Lock in factor-level order
GE.tbl_sc$variable <- factor(GE.tbl_sc$variable, levels=GE.tbl_sc$variable)
PRISMA.tbl_sc$variable <- factor(PRISMA.tbl_sc$variable, levels=PRISMA.tbl_sc$variable)

#make plot
structurePlot_fn <- function(tbl_sc){
ggplot(tbl_sc, aes(variable, V2)) + 
  geom_bar(stat='identity', aes(fill = colour)) + 
  theme_bw() +
  theme(axis.ticks=element_blank(), 
        panel.grid.minor = element_blank(), 
        panel.grid.major.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, size=10),
        legend.position = "none") + 
  xlab('') + 
  ylab('structure coefficient') + 
  #ylim(min(tbl_sc$V2)-.1, max(tbl_sc$V2)+.1) +
  ylim(-.9, .5) +
  geom_hline(yintercept = .45, linetype="dashed", color = "darkgreen", size=1.5) +
  geom_hline(yintercept = -.45, linetype="dashed", color = "darkgreen", size=1.5) +
  scale_fill_manual(values=c('grey', 'darkgreen'))
}

GE

structurePlot_fn(GE.tbl_sc)

PRISMA

structurePlot_fn(PRISMA.tbl_sc)


[4] Model stability / validation. Lastly, we validated our model via iterative feature removal, i.e., we removed one of features from the combined p=35 of the \(X\) and \(Y\) sets, ran the CCA, and compared the derived canonical correlation coefficients. This procedure leveraged the CCA property that each variable relates to all other variables in both sets: if the model is stable, canonical correlation estimates will remain similar; if unstable, estimate will vary widely. Values were similar across all iterations, suggesting that our model is stable.

#run the original correlation 
GE.original <- cc(Xset_GE, Yset_GE)$cor
PRISMA.original <- cc(Xset_PRISMA, Yset_PRISMA)$cor

#write a nested function to leave out a column, and calculate correlation
cc_dfOneOut_fn <- function(X, Y){ #takes X and Y dataframe or matrix
  f <- function(x) combn(ncol(x), ncol(x) - 1) #base combn function leave out a column ...
  
  X_inx <- f(X) #from X
  Y_inx <- f(Y) #from Y
  
  ccX <- t(apply(X_inx, 2, function(i) cc(X[, i], Y)$cor)) #run the correlation between dropped X and Y
  ccY <- t(apply(Y_inx, 2, function(i) cc(X, Y[, i])$cor)) #run the correlation between dropped Y and X
  
  list(XVALS = as.data.frame(ccX), YVALS = as.data.frame(ccY)) #add the values to a dataframe
}

#run the function
GE.values <- cc_dfOneOut_fn(Xset_GE, Yset_GE)
PRISMA.values <- cc_dfOneOut_fn(Xset_PRISMA, Yset_PRISMA)

#extract from list and combine the dataframes 
GE.values <- data.frame(Reduce(rbind.fill, GE.values))
PRISMA.values <- data.frame(Reduce(rbind.fill, PRISMA.values))

#add the original observations as the last row
GE.values <- rbind(GE.values, GE.original)
PRISMA.values <- rbind(PRISMA.values, PRISMA.original)

#add a column indicating that the last row is original -- used to colour the plot
GE.values$original <- c(rep(FALSE, ncol(Xset_GE)), rep(FALSE, ncol(Yset_GE)), TRUE)
PRISMA.values$original <- c(rep(FALSE, ncol(Xset_PRISMA)), rep(FALSE, ncol(Yset_PRISMA)), TRUE)

#melt for ggplot
GE.values <- reshape2::melt(GE.values)
PRISMA.values <- reshape2::melt(PRISMA.values)

#boxplot
stability_fn <- function(values){
  ggplot(values, aes(x=variable, y=value)) +
    geom_boxplot(outlier.shape = NA) +
    geom_violin(fill='grey', alpha=.5) +
    geom_jitter(aes(colour = original, size= original)) +
    theme_bw() +
    xlab('') + 
    ylim(0, .75) +
    theme(axis.ticks = element_blank()) +
    scale_colour_manual(values = c("FALSE" = "black", "TRUE" = "red")) +
    scale_size_manual(values = c("FALSE" = .5, "TRUE" = 5)) +
    guides(color = FALSE, size = FALSE)
}

GE

stability_fn(GE.values)

PRISMA

stability_fn(PRISMA.values)