Research Question

How does variation in brain network features relate to variation in perceived social support? More specifically, what is the correlation structure between brain network features and measures of perceived social support?

Analysis pipeline

Steps 1-5 were done in MATLAB; the remaining steps were done in R. (A rough R sketch of step 2 follows the list.)

  1. construct networks from rsfMRI data from HCP
  2. calculate modular structure using Louvain algorithm
  3. identify consensus communities
  4. identify high participation nodes
  5. identify contribution to modularity by brain systems
  6. identify nodes with the highest median absolute deviation in participation coefficient
  7. perform canonical correlation analysis
  8. analyze canonical variates
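
The MATLAB steps are not reproduced here, but as a rough illustration of step 2, a single Louvain run on one subject's functional connectivity matrix might look like the sketch below, using igraph in R. The file name, the zeroing of negative weights, and the handling of the gamma values are assumptions for illustration (the resolution argument to cluster_louvain requires igraph >= 1.3).

library(igraph)

# hypothetical file: one subject's 200 x 200 functional connectivity matrix
fc <- as.matrix(read.csv("subject_fc.csv", header = FALSE))
fc[fc < 0] <- 0   # igraph's Louvain requires non-negative edge weights
diag(fc) <- 0     # drop self-connections
g <- graph_from_adjacency_matrix(fc, mode = "undirected", weighted = TRUE)

# one Louvain run per resolution parameter (gamma)
for (gamma in c(0.6, 0.8, 1, 1.2, 1.4)) {
  comm <- cluster_louvain(g, resolution = gamma)
  cat("gamma =", gamma, ":", length(comm), "communities, Q =", modularity(comm), "\n")
}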

import packages (startup messages suppressed)

library(tidyr)
library(plyr)
library(dplyr)
library(ggplot2)
library(magrittr)
library(stringr)
library(corrplot)
library(psych)
library(GPArotation)
library(lavaan)
library(caret)
library(ks)
library(CCA)
library(vegan)

import data

# pheno has all behavioral and phenotypic data
pheno <- read.csv("pheno.hcp1200.csv")

# labels has community labels for all 200 nodes
labels <- read.csv("labels.csv", header = FALSE)
labels <- rename(labels, "comm" = "V1")

# sublist has subject numbers for all participants
sublist <- read.csv("colnames.csv", header = FALSE, sep = ",")
# strip the ".mat" extension from each file name (fixed() so the "." is literal)
sublist <- str_replace_all(unlist(sublist), fixed(".mat"), "")
sublist <- as.character(sublist)

# modcontr has contribution to modularity for each community
modularitycontribution <- read.csv("modcontr.csv", header = FALSE, col.names = sublist)

# partcoef has participation coefficient for each node
participationcoefficient <- read.csv("partcoeff.csv", header = FALSE, col.names = sublist)

Tidy the data so that each variable is of the correct type and only the variables needed for the analysis are included.

# pivot to long format to compute the average across scans
modcontr <- modularitycontribution %>% 
  pivot_longer(cols = everything(),
               names_to = c("Subject", "scan", "gamma"),
               names_sep = "_",
               values_to = "contrib")
modcontr$gamma <- as.double(modcontr$gamma)
modcontr$Subject <- str_replace_all(modcontr$Subject, "X", "")
modcontr$Subject <- as.integer(modcontr$Subject)
modcontr <- modcontr[with(modcontr, order(Subject, scan, gamma)),]
# rows are now ordered, so the 16 community labels repeat within each subject/scan/gamma
modcontr$comm <- rep(1:16, length(modcontr$Subject)/16)
modcontr$Subject <- as.character(modcontr$Subject)
# recode gamma indices 1-5 as the resolution-parameter values used in the Louvain runs
gamma_values <- c("1" = "0.6", "2" = "0.8", "3" = "1", "4" = "1.2", "5" = "1.4")
modcontr$gamma <- unname(gamma_values[as.character(modcontr$gamma)])
# rearrange structure of data so dataframes can be combined
mcw <- modcontr %>%
  group_by(Subject, gamma, comm) %>% 
  summarise(mc = mean(contrib)) %>% 
  pivot_wider(names_from = c(gamma, comm),
              names_glue = "mc_g{gamma}_c{comm}",
              values_from = mc)
partcoef <- participationcoefficient %>% 
  pivot_longer(cols = everything(),
               names_to = c("Subject", "scan", "gamma"),
               names_sep = "_",
               values_to = "pc")
partcoef$Subject <- str_replace_all(partcoef$Subject, "X", "")
partcoef$gamma <- as.double(partcoef$gamma)
partcoef <- partcoef[with(partcoef, order(Subject, scan, gamma)),]
# rows are now ordered, so the 200 node labels repeat within each subject/scan/gamma
partcoef$node <- rep(1:200, length(partcoef$Subject)/200)
partcoef$Subject <- as.character(partcoef$Subject)
# reuse the same gamma recoding as above
partcoef$gamma <- unname(gamma_values[as.character(partcoef$gamma)])

pcw <- partcoef %>%
  group_by(Subject, gamma, node) %>%
  summarise(pc = mean(pc)) %>%
  pivot_wider(names_from = c(gamma, node),
              names_glue = "pc_g{gamma}_n{node}",
              values_from = pc)

identify nodes with participation coefficients that vary the most

We have to reduce the dimensionality of the participation coefficient data because otherwise we would have more variables than observations. Because contribution to modularity is computed at the community level and participation coefficient is computed at the node level, reducing the dimensionality of only the participation coefficients loses no community-level data, and it gives us a sense of which brain regions are relevant for social relationships.

pcw <- ungroup(pcw)
# MAD of each participation-coefficient column (column 1 is Subject)
pcw_mad <- apply(pcw[-1], 2, function(x) round(mad(x), digits = 4))
pcw_mad_order <- data.frame(noderank = as.factor(1:ncol(pcw[-1])),
                            mad = pcw_mad[order(-pcw_mad)])
# take the top 10% of columns (100 of the 1000 node-by-gamma combinations)
nodes <- which(pcw_mad >= pcw_mad_order$mad[100])
pc_final <- pcw[-1][, nodes]   # index into pcw[-1], where the MADs were computed
pc_final <- cbind(Subject = pcw$Subject, pc_final)

netstats <- full_join(mcw, pc_final, by = "Subject")
netstats <- ungroup(netstats)
pheno <- dplyr::select(pheno, c("Subject", "Friendship_Unadj",
                                "PercHostil_Unadj", "EmotSupp_Unadj",
                                "PercStress_Unadj", "Loneliness_Unadj",
                                "PercReject_Unadj",
                                "InstruSupp_Unadj", "Gender", "Age",
                                "QC_Issue", "X3T_RS.fMRI_PctCompl"))
pheno$Subject <- as.character(pheno$Subject)

Split sample

go1tosplit <- pheno[,c("Subject", "Friendship_Unadj", "PercHostil_Unadj",
                       "EmotSupp_Unadj", "PercStress_Unadj",
                       "Loneliness_Unadj", "PercReject_Unadj",
                       "InstruSupp_Unadj", "Gender", "Age", "QC_Issue",
                       "X3T_RS.fMRI_PctCompl")]
# remove subjects with missing emotional support scores
go1nona <- subset(go1tosplit, !is.na(EmotSupp_Unadj))

# split sample
set.seed(3456)
trainIndex <- createDataPartition(go1nona$EmotSupp_Unadj,
                                  p = 0.667, list = F, times = 1)
go1train <- go1nona[trainIndex,]
go1test <- go1nona[-trainIndex,]

# save sample split
#write.csv(go1train, 'go1train_subject.csv', 
#          row.names = FALSE, quote = FALSE)
#write.csv(go1test, "go1test_subject.csv", 
#          row.names = FALSE, quote = FALSE)

The final training set for go1 has 805 subjects. The final testing set for go1 has 399 subjects.

Apply exclusions

goodbrains <- filter(go1train, QC_Issue == "" & X3T_RS.fMRI_PctCompl == 100.0)
complete <- intersect(goodbrains$Subject, netstats$Subject)
pheno.train <- subset(pheno, is.element(pheno$Subject, complete))
brain.train <- subset(netstats, is.element(netstats$Subject, complete))

After excluding subjects with incomplete or low-quality data, there are 540 subjects in the training sample.

prepare for CCA

x <- dplyr::select(pheno.train, c(Friendship_Unadj, PercReject_Unadj, PercStress_Unadj,
                     PercHostil_Unadj, EmotSupp_Unadj, Loneliness_Unadj, InstruSupp_Unadj))
x <- apply(x, 2, as.numeric)
y <- brain.train[-1]
y <- apply(y, 2, as.numeric)

check within- and between-set correlations

correl <- matcor(x, y)
img.matcor(correl, type = 2)

perform standard CCA

using the base R function cancor

cca <- cancor(x,y)
cca$cor
## [1] 0.5724001 0.5305774 0.5172358 0.4991709 0.4724951 0.4314115 0.3951710

These values are the canonical correlation coefficients, which are moderately high.
To interpret them and the canonical variates (both statistically and conceptually), we need more information than the base R function provides, so we'll use another package's function.
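
That said, cancor does return the raw canonical coefficients, so the canonical variates themselves can be recovered by hand if needed. A minimal sketch (cancor centers each set internally, so the stored centers are applied first):

U <- scale(x, center = cca$xcenter, scale = FALSE) %*% cca$xcoef
V <- scale(y, center = cca$ycenter, scale = FALSE) %*% cca$ycoef
round(diag(cor(U, V)), 7)   # reproduces cca$cor, up to sign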

using the CCorA function in the vegan package

cca2 <- CCorA(x,y)
cca2$CanCorr          # canonical correlations
## [1] 0.5724001 0.5305774 0.5172358 0.4991709 0.4724951 0.4314115 0.3951710
cca2$Eigenvalues      # canonical eigenvalues (squares of the canonical correlations)
## [1] 0.3276418 0.2815124 0.2675329 0.2491716 0.2232516 0.1861159 0.1561601
#cca2$Cy               # object scores in Y biplot
#cca2$Cx               # object scores in X biplot
#cca2$corr.Y.Cy        # scores of Y variables in Y biplot, computed as cor(y,Cy)
#cca2$corr.X.Cx        # scores of X variables in X biplot
#cca2$corr.Y.Cx        # cor(Y, Cx), for plotting Y variables in the space of X manually
#cca2$corr.X.Cy        # cor(X, Cy), for plotting X variables in the space of Y manually

With this kind of data, Pillai's trace (= 1.6913863) is the most robust test statistic. The p-value for Pillai's trace in this analysis is 0.626373 > 0.05, so the canonical correlations are not jointly statistically significant. We're also given the results of a redundancy analysis, with R^2 values of 0.2196024 and 0.0111735, and adjusted R^2 values (adjusted for n and the number of explanatory variables) of -0.0334995 and -0.0018374.
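
As a quick check, Pillai's trace is just the sum of the canonical eigenvalues reported above (and, assuming CCorA stores it in the fit as $Pillai, it can be pulled out directly):

sum(cca2$Eigenvalues)   # 1.6913863
cca2$Pillai             # same value, straight from the fit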

visualize canonical variates

biplot(cca2, "v", cex=c(0.7,0.6))

Because there are so many variables in the right-hand plot (132 network features in the analysis), let's break them down in terms of how they load onto the canonical variates.
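
One way to do this is to rank the network features by the absolute value of their correlations with each canonical axis, using the corr.Y.Cy field noted in the commented output above. A minimal sketch for the first canonical variate:

loadings_y <- cca2$corr.Y.Cy                       # cor(Y, Cy): Y-variable loadings
top_cv1 <- sort(abs(loadings_y[, 1]), decreasing = TRUE)
head(round(top_cv1, 3), 10)                        # ten strongest loadings on CV1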