steps 1-5 were done in MATLAB; the remaining steps are done in R
import packages (code and messages suppressed)
library(tidyr)
library(plyr)
library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)
library(corrplot)
library(psych)
library(GPArotation)
library(lavaan)
library(caret)
library(ks)
library(CCA)
library(vegan)
import data
# pheno has all behavioral and phenotypic data
pheno <- read.csv("pheno.hcp1200.csv")
# labels has community labels for all 200 nodes
labels <- read.csv("labels.csv", header = FALSE)
labels <- rename(labels, "comm" = "V1")
# sublist has subject numbers for all participants
sublist <- read.csv("colnames.csv", header = FALSE, sep = ",")
sublist <- str_replace_all(unlist(sublist), fixed(".mat"), "")
sublist <- as.array(sublist)
# modcontr has contribution to modularity for each community
modularitycontribution <- read.csv("modcontr.csv", header = FALSE, col.names = sublist)
# partcoef has participation coefficient for each node
participationcoefficient <- read.csv("partcoeff.csv", header = FALSE, col.names = sublist)
tidy the data so that each variable is of the correct type and only the variables needed for the analysis are included
# pivot to long format to compute average across scans
modcontr <- modularitycontribution %>%
pivot_longer(cols = everything(),
names_to = c("Subject", "scan", "gamma"),
names_sep = "_",
values_to = "contrib")
modcontr$gamma <- as.double(modcontr$gamma)
modcontr$Subject <- str_replace_all(modcontr$Subject, "X", "")
modcontr$Subject <- as.integer(modcontr$Subject)
modcontr <- modcontr[with(modcontr, order(Subject, scan, gamma)),]
modcontr$comm <- rep(1:16, length(modcontr$Subject)/16)
modcontr$Subject <- as.character(modcontr$Subject)
# recode the gamma index (1-5) to the actual resolution parameter values
modcontr$gamma <- c("0.6", "0.8", "1", "1.2", "1.4")[modcontr$gamma]
# rearrange structure of data so dataframes can be combined
mcw <- modcontr %>%
group_by(Subject, gamma, comm) %>%
summarise(mc = mean(contrib)) %>%
pivot_wider(names_from = c(gamma, comm),
names_glue = "mc_g{gamma}_c{comm}",
values_from = mc)
partcoef <- participationcoefficient %>% pivot_longer(cols = everything(),
names_to = c("Subject", "scan", "gamma"),
names_sep = "_",
values_to = "pc")
partcoef$Subject <- str_replace_all(partcoef$Subject, "X", "")
partcoef$gamma <- as.double(partcoef$gamma)
partcoef <- partcoef[with(partcoef, order(Subject, scan, gamma)),]
partcoef$node <- rep(1:200, length(partcoef$Subject)/200)
partcoef$Subject <- as.character(partcoef$Subject)
# recode the gamma index (1-5) to the actual resolution parameter values
partcoef$gamma <- c("0.6", "0.8", "1", "1.2", "1.4")[partcoef$gamma]
pcw <- partcoef %>%
group_by(Subject, gamma, node) %>%
summarise(pc = mean(pc)) %>%
pivot_wider(names_from = c(gamma, node),
names_glue = "pc_g{gamma}_n{node}",
values_from = pc)
We have to reduce the dimensionality of the participation coefficient data because otherwise we would have more variables than observations. Because contribution to modularity is computed at the community level and participation coefficient is computed at the node level, reducing the dimensionality of only the participation coefficient data means we don't lose any community-level information, and we still get a sense of which brain regions are relevant for social relationships.
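As a quick illustration of the problem (added here, not part of the original pipeline), we can compare the number of network features to the number of subjects:
# node-level participation coefficient features: 200 nodes x 5 gammas = 1000 columns
ncol(pcw) - 1
# community-level modularity contribution features: 16 communities x 5 gammas = 80 columns
ncol(mcw) - 1
# compare to the number of subjects (rows) available for analysis
nrow(pcw)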
pcw_mad <- apply(pcw[-1], 2, function(x) round(mad(x), digits = 4))
pcw_mad_order <- data.frame(noderank = as.factor(1:dim(pcw[-1])[2]),
mad = pcw_mad[order(-pcw_mad)])
# take the top 10% (100 of 1000) node-by-gamma columns by MAD across gammas
nodes <- which(pcw_mad >= pcw_mad_order$mad[100])
# nodes indexes columns of pcw[-1], so drop the Subject column before subsetting
pc_final <- pcw[-1][, nodes]
pc_final <- cbind(pcw$Subject, pc_final)
names(pc_final)[names(pc_final) == "pcw$Subject"] <- "Subject"
netstats <- full_join(mcw, pc_final, by = "Subject")
netstats <- ungroup(netstats)
pheno <- dplyr::select(pheno, c("Subject", "Friendship_Unadj",
"PercHostil_Unadj", "EmotSupp_Unadj",
"PercStress_Unadj", "Loneliness_Unadj",
"PercReject_Unadj",
"InstruSupp_Unadj", "Gender", "Age",
"QC_Issue", "X3T_RS.fMRI_PctCompl"))
pheno$Subject <- as.character(pheno$Subject)
go1tosplit <- pheno[,c("Subject", "Friendship_Unadj", "PercHostil_Unadj",
"EmotSupp_Unadj", "PercStress_Unadj",
"Loneliness_Unadj", "PercReject_Unadj",
"InstruSupp_Unadj", "Gender", "Age", "QC_Issue",
"X3T_RS.fMRI_PctCompl")]
# remove NAs
go1nona <- subset(go1tosplit, !is.na(EmotSupp_Unadj))
# split sample
set.seed(3456)
trainIndex <- createDataPartition(go1nona$EmotSupp_Unadj,
p = 0.667, list = F, times = 1)
go1train <- go1nona[trainIndex,]
go1test <- go1nona[-trainIndex,]
# save sample split
#write.csv(go1train, 'go1train_subject.csv',
# row.names = FALSE, quote = FALSE)
#write.csv(go1test, "go1test_subject.csv",
# row.names = FALSE, quote = FALSE)
The final training set for go1 has 805 subjects. The final testing set for go1 has 399 subjects.
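For reference, the split sizes can be checked directly (a quick sanity check added here for illustration):
nrow(go1train) # 805 subjects in the training set
nrow(go1test)  # 399 subjects in the testing set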
goodbrains <- filter(go1train, QC_Issue == "" & X3T_RS.fMRI_PctCompl == 100.0)
complete <- intersect(goodbrains$Subject, netstats$Subject)
pheno.train <- subset(pheno, is.element(pheno$Subject, complete))
brain.train <- subset(netstats, is.element(netstats$Subject, complete))
after removing subjects with incomplete or low-quality data, there are 540 subjects in the training sample
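Again as a quick check (added for illustration), the number of usable training subjects can be confirmed directly:
length(complete)  # subjects passing QC with complete resting-state data
nrow(brain.train) # 540 rows in the network-statistics training data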
x <- dplyr::select(pheno.train, c(Friendship_Unadj, PercReject_Unadj, PercStress_Unadj,
PercHostil_Unadj, EmotSupp_Unadj, Loneliness_Unadj, InstruSupp_Unadj))
x <- apply(x, 2, as.numeric)
y <- brain.train[-1]
y <- apply(y, 2, as.numeric)
check within- and between-set correlations
correl <- matcor(x, y)
img.matcor(correl, type = 2)
using the base R cancor function
cca <- cancor(x,y)
cca$cor
## [1] 0.5724001 0.5305774 0.5172358 0.4991709 0.4724951 0.4314115 0.3951710
These values represent the canonical correlation coefficients, which are moderately high.
To better interpret these values and the canonical variates (both statistically and conceptually), we need more information. The base R function can’t give us this, so we’ll use another package’s function.
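Concretely, cancor only returns the canonical correlations, the raw canonical coefficients, and the centering vectors; there are no test statistics or variable loadings:
names(cca) # "cor", "xcoef", "ycoef", "xcenter", "ycenter"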
using the CCorA function from the vegan package
cca2 <- CCorA(x,y)
cca2$CanCorr # canonical correlations
## [1] 0.5724001 0.5305774 0.5172358 0.4991709 0.4724951 0.4314115 0.3951710
cca2$Eigenvalues # canonical eigenvalues (squares of the canonical correlations)
## [1] 0.3276418 0.2815124 0.2675329 0.2491716 0.2232516 0.1861159 0.1561601
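As a quick check of that relationship (added here for illustration), the eigenvalues should match the squared canonical correlations:
all.equal(unname(cca2$Eigenvalues), unname(cca2$CanCorr)^2)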
#cca2$Cy # object scores in Y biplot
#cca2$Cx # object scores in X biplot
#cca2$corr.Y.Cy # scores of Y variables in Y biplot, computed as cor(y,Cy)
#cca2$corr.X.Cx # scores of X variables in X biplot
#cca2$corr.Y.Cx # cor(Y, Cx), for plotting variables Y in the space of X manually
#cca2$corr.X.Cy # cor(X, Cy), for plotting variables X in the space of Y manually
With this kind of data, Pillai's trace (= 1.6913863) is the most robust test statistic. The p-value for Pillai's trace in this analysis is 0.626373 > 0.05, so we cannot conclude that the overall relationship between the behavioral and network variables is statistically significant. We're also given the results of a redundancy analysis, with R^2 = 0.2196024, 0.0111735 and adjusted (for n and the number of explanatory variables) R^2 = -0.0334995, -0.0018374.
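These statistics come from the CCorA output; printing the fitted object reports them, and (assuming vegan's documented component names) Pillai's trace can also be pulled out directly:
cca2        # printed summary: Pillai's trace and its p-value, canonical correlations, RDA R^2s
cca2$Pillai # Pillai's trace statistic (component name assumed from the vegan documentation)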
visualize canonical variates
biplot(cca2, "v", cex=c(0.7,0.6))
Because there are so many variables in the right plot (132 network features in the analysis), let's break them down in terms of how they load onto the canonical variates.