This document contains my data wrangling efforts as well as the relative importance analysis. The relative importance analysis failed: there is not enough data for this type of machine learning.
First I read in Cathy’s files and change the dates from SAS’s format (where they originate on 1/1/1960) to a format that plays nice in R.
library(sas7bdat)
library(eeptools)
df.clin<-read.sas7bdat("Y:/julie/Cathy/Data/d1605_clindata_110216.sas7bdat")
df.demog<-read.sas7bdat("Y:/julie/Cathy/Data/d1605_demog_110216.sas7bdat")
df<-read.sas7bdat("Y:/julie/Cathy/Data/allz_090518.sas7bdat")
#pull ID, test date, and CDR from the clinical file
df.clin.sub<-data.frame(ID = df.clin$ID, date1 = df.clin$testdate, cdr = df.clin$cdr)
#pull ID, scan date, and the functional connectivity columns from the imaging file
df.sub<-cbind(data.frame(ID = df$ID, date2 = df$Session_Date), df[,104:194])
#dealing with the fact that SAS originates dates on 1/1/1960:
df.clin$csf_lp_date<-as.Date(df.clin$csf_lp_date, origin="1960-01-01")
df.clin$ca_close_to_lp<-as.Date(df.clin$ca_close_to_lp, origin="1960-01-01")
df.clin$ca_close_to_pib<-as.Date(df.clin$ca_close_to_pib, origin="1960-01-01")
df.clin$testdate<-as.Date(df.clin$testdate, origin="1960-01-01")
df.clin$base_ca_date<-as.Date(df.clin$base_ca_date, origin="1960-01-01")
df.clin.sub$date1<-as.Date(df.clin.sub$date1, origin="1960-01-01")
df.sub$date2<-as.Date(df.sub$date2, origin = "1960-01-01")
df$Session_Date<-as.Date(df$Session_Date, origin = "1960-01-01")
df$BIRTH<-as.Date(df$BIRTH, origin = "1960-01-01")
df.demog$BIRTH<-as.Date(df.demog$BIRTH, origin = "1960-01-01")
df.demog$DEATH<-as.Date(df.demog$DEATH, origin = "1960-01-01")
Then I calculate either the current age or the age at death of each participant.
for(i in 1:length(df.demog$BIRTH)){
  #living participants have NA for DEATH; temporarily substitute today's date
  if(is.na(df.demog$DEATH[i])){
    df.demog$DEATH[i]<-Sys.Date()
  }
  df.demog$age[i]<-floor(age_calc(df.demog$BIRTH[i], df.demog$DEATH[i], units = "years"))
  #restore the NA so living participants keep a missing death date
  if(df.demog$DEATH[i] == Sys.Date()){
    df.demog$DEATH[i]<-NA
  }
}
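As an aside, the temporary overwrite of DEATH can be avoided entirely; here is a minimal vectorized sketch, assuming age_calc accepts date vectors (the as.Date wrapper is needed because ifelse drops the Date class):
#vectorized alternative: use today's date for living participants
end.date<-as.Date(ifelse(is.na(df.demog$DEATH), Sys.Date(), df.demog$DEATH),
                  origin = "1970-01-01")
df.demog$age<-floor(age_calc(df.demog$BIRTH, end.date, units = "years"))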
Then I create a dataframe called df.result that contains the BOLD data and the clinical data. df.result is organized so that each scan is paired with the clinical information from the testing date nearest to that scan.
df.combined<-merge(df.clin.sub, df.sub, by = "ID")
unique.id<-unique(df.combined$ID)
k<-1
df.result<-df.combined[FALSE,] #empty dataframe with the same columns as df.combined
for(i in 1:length(unique.id)){
  df.working<-subset(df.combined, df.combined$ID == unique.id[i])
  scandates<-unique(df.working$date2)
  for(j in 1:length(scandates)){
    df.working2<-subset(df.working, df.working$date2 == scandates[j])
    #keep the clinical row whose testing date is nearest to this scan
    ind.val<-which.min(abs(df.working2$date2-df.working2$date1))
    df.result[k,]<-df.working2[ind.val,]
    k<-k+1
  }
}
Next I had to figure out which participants in our data converted from CDR 0 to either 0.5 or 1. I found 20 individuals who had scans both before and after conversion.
cdr0<-subset(df.result, df.result$cdr == 0)
cdr.imp<-subset(df.result, df.result$cdr>0)
df.convert<-merge(cdr0, cdr.imp, by = "ID", all = FALSE) #df.convert contains the FC matrices we have for cognitive converters
df.convert.id<-unique(df.convert$ID) #IDs of individuals with scans both before and after conversion
df.demog.convert<-subset(df.demog, df.demog$ID %in% df.convert.id) #demographics for the 20 individuals with pre- and post-conversion scans
repeatedscans<-as.data.frame(cdr0$ID[duplicated(cdr0$ID)])
colnames(repeatedscans)<-"ID"
repeatedscans<-merge(repeatedscans, cdr0, by = "ID", all = FALSE)
repeatedscans<-merge(repeatedscans, df.demog, by = "ID", all = FALSE)
Then I tried to cohort-match. I searched through all of the non-converters for those with multiple scans who were within 3 years of age of a converter, of the same gender, and of the same APOE status. I found 91 non-converters who matched at least one converter; 17 of the 20 converters had at least one match.
savedID<-data.frame()
k<-1
for (i in 1:length(df.demog.convert$ID)){
  for (j in 1:length(repeatedscans$ID)){
    if(abs(df.demog.convert$age[i] - repeatedscans$age[j]) < 4 &
       df.demog.convert$GENDER[i] == repeatedscans$GENDER[j] &
       df.demog.convert$APOE[i] == repeatedscans$APOE[j]){
      savedID[k,1]<-repeatedscans$ID[j]
      savedID[k,2]<-df.demog.convert$ID[i]
      k<-k+1
    }
  }
}
colnames(savedID)<-c("ID", "Partner")
length(unique(savedID$Partner)) #17 of 20 are matched with all the criteria
## [1] 17
repeatedscans.matches<-merge(savedID, repeatedscans, by = "ID", all = FALSE)
forview<-repeatedscans.matches[,1:2] #just the matched ID/Partner pairs
df.demog.match<-subset(df.demog, df.demog$ID %in% unique(savedID$ID)) #demographics for the matched non-converters
df.demog.match<-merge(df.demog.match, forview, by = "ID")
df.demog.match<-df.demog.match[!duplicated(df.demog.match$ID),] #91 matching individuals who did not convert
Since I didn’t successfully match every single converter, I wanted to make sure that the overall cohorts did not differ in terms of education, age, APOE status, or gender proportions. On all counts, the cohorts are not significantly different, so I decided to proceed with analyzing the 111 selected participants: 20 converters and 91 non-converters.
#cohorts are well matched
t.test(df.demog.convert$GENDER, df.demog.match$GENDER)
##
## Welch Two Sample t-test
##
## data: df.demog.convert$GENDER and df.demog.match$GENDER
## t = -0.65443, df = 27.362, p-value = 0.5183
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3406597 0.1758245
## sample estimates:
## mean of x mean of y
## 1.500000 1.582418
t.test(df.demog.convert$EDUC, df.demog.match$EDUC)
##
## Welch Two Sample t-test
##
## data: df.demog.convert$EDUC and df.demog.match$EDUC
## t = -0.53644, df = 25.806, p-value = 0.5963
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.866903 1.094376
## sample estimates:
## mean of x mean of y
## 15.35000 15.73626
t.test(df.demog.convert$APOE, df.demog.match$APOE)
##
## Welch Two Sample t-test
##
## data: df.demog.convert$APOE and df.demog.match$APOE
## t = -0.92947, df = 22.597, p-value = 0.3625
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.435586 1.687235
## sample estimates:
## mean of x mean of y
## 31.45000 32.82418
t.test(df.demog.convert$age, df.demog.match$age)
##
## Welch Two Sample t-test
##
## data: df.demog.convert$age and df.demog.match$age
## t = 1.4789, df = 24.426, p-value = 0.1519
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.093714 6.642065
## sample estimates:
## mean of x mean of y
## 81.95000 79.17582
I did some data wrangling to get everything ready for the PLS-DA. I wound up with two dataframes: df.earlyscan, which has the first scan for each individual in my 111-person cohort, and df.latescan, which has the final scan for each individual in the cohort. For individuals who converted to CDR > 0 and then reverted back to CDR = 0, I kept the scan from when they were classified as CDR > 0. Not sure if that’s the right approach, but it’s what I did…
df.converters<-merge(df.demog.convert, df.result, "ID", all = FALSE)
df.converters$WillConvert<-1
df.nonconverters<-merge(df.demog.match, df.result, "ID", all = FALSE)
df.nonconverters<-df.nonconverters[!(df.nonconverters$ID %in% unique(df.converters$ID)),]
df.nonconverters<-subset(df.nonconverters, select = -c(Partner)) #dropping the extra column so now df.converters and df.nonconverters match
df.nonconverters$WillConvert<-0
df.all<-rbind(df.converters, df.nonconverters)
df.earlyscan<-df.all[tapply(1:nrow(df.all),df.all$ID,function(ii) ii[which.min(df.all$date2[ii])]),] #contains earliest scan for every individual
df.latescan1<-subset(df.converters, df.converters$cdr > 0)
df.latescan1<-df.latescan1[tapply(1:nrow(df.latescan1),df.latescan1$ID,function(ii) ii[which.max(df.latescan1$date2[ii])]),] #most recent CDR > 0 scan for each converter
df.latescan2<-df.nonconverters[tapply(1:nrow(df.nonconverters),df.nonconverters$ID,function(ii) ii[which.max(df.nonconverters$date2[ii])]),] #most recent scan for each non-converter
df.latescan<-rbind(df.latescan1, df.latescan2)
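Before modeling, a quick sanity check (a sketch, not part of the original run) that each scan dataframe contains exactly one row per individual, with the expected 20/91 split:
#each individual should appear exactly once in each scan dataframe
stopifnot(anyDuplicated(df.earlyscan$ID) == 0,
          anyDuplicated(df.latescan$ID) == 0)
table(df.earlyscan$WillConvert) #expect 91 non-converters (0) and 20 converters (1)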
I use a package called mixOmics to do the PLS-DA. mixOmics is designed for genomics and chemometrics applications, but it works well for neuroimaging too. PLS-DA is a machine learning technique for classification and feature selection. I input the functional connectivity matrix for each individual and tried to predict whether or not they would convert. I did this with both the earliest scans and the most recent scans in two separate tests. My hope was that key networks would emerge for classification, and that these networks would align with the interesting graphs that Cathy had produced.
library(mixOmics)
#input a matrix that contains ONLY the functional connectivity values, plus a LAST column indicating whether or not an individual will convert
#the last column of the matrix will be used as the categories for prediction
createConfusionDistribution<-function(df_func){
  train.id <- sample(seq_len(nrow(df_func)), size = floor(0.66 * nrow(df_func))) #splitting data into 2/3 train, 1/3 test
  data.test<-df_func[-train.id,]
  data.train<-df_func[train.id,]
  y.test<-df_func[,ncol(df_func)][-train.id]
  y.train<-df_func[,ncol(df_func)][train.id]
  #http://mixomics.org/methods/pls-da/
  plsda.res <- plsda(as.matrix(data.train[,1:(ncol(data.train)-1)]), as.factor(y.train), ncomp = 7)
  # this code takes ~1 min to run
  #set.seed(2543) # for reproducibility here, only when the `cpus' argument is not used
  perf.plsda <- perf(plsda.res, validation = "Mfold", folds = 10,
                     progressBar = FALSE, auc = TRUE, nrepeat = 10)
  # perf.plsda$error.rate # error rates
  #plot(perf.plsda, col = color.mixo(1:3), sd = TRUE, legend.position = "horizontal")
  #final feature selection output and weight coefficients (take abs value to compare importance)
  primaryfeatures<-selectVar(plsda.res, comp = 1)$value
  primaryfeatures$percent<-abs(primaryfeatures$value.var)/sum(abs(primaryfeatures$value.var))
  test.predict <- predict(plsda.res, data.test[,1:(ncol(data.train)-1)], dist = "max.dist")
  prediction <- test.predict$class$max.dist[,1]
  # calculate the balanced error rate of the model
  confusion.mat <- get.confusion_matrix(truth = as.factor(y.test), predicted = prediction)
  ConfusionRate<-get.BER(confusion.mat) #appropriate for unbalanced samples: averages the proportion of wrongly classified samples per class
  #confusion.mat #shows the raw confusion table
  toReturn<-list(ErrorRate = ConfusionRate, ImportantComponents = primaryfeatures, Matrix = confusion.mat)
  return(toReturn)
}
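One interpretive note on get.BER, which matters for the results below: the balanced error rate averages the per-class error rates, so a degenerate classifier that always predicts the majority class still scores 0.5. A quick hypothetical check using this cohort's class sizes:
#hypothetical confusion matrix for a classifier that always predicts class 0;
#rows are the truth (91 non-converters, 20 converters), columns the predictions
cm<-matrix(c(91, 0,
             20, 0),
           nrow = 2, byrow = TRUE,
           dimnames = list(c("0", "1"), c("predicted.as.0", "predicted.as.1")))
get.BER(cm) #class 0 error = 0/91, class 1 error = 20/20, so BER = 0.5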
for.plsda<-df.earlyscan[,13:104]
for.plsda$WillConvert<-as.factor(for.plsda$WillConvert)
#500-iteration permutation test; I ran it over the weekend, so it stays commented out here
# earlyscanresult<-replicate(500, createConfusionDistribution(for.plsda), simplify = FALSE)
#
# ErrorRate<-unlist(lapply(earlyscanresult, '[[', 1))
# ImportantComponents<-sapply(earlyscanresult, function(m) m[2])
# Matrix<-sapply(earlyscanresult, function(m) m[3])
#
# write.csv(ErrorRate, 'Y:/julie/Cathy/Data/PLSDA_earlyscan_errorrate.csv' )
# lapply(ImportantComponents, function(x) write.table( data.frame(x), 'Y:/julie/Cathy/Data/PLSDA_earlyscan_ImptCmpt.csv' , append= T, sep=',' ))
# lapply(Matrix, function(x) write.table( data.frame(x), 'Y:/julie/Cathy/Data/PLSDA_earlyscan_Matrix.csv' , append= T, sep=',' ))
#
# for.plsda<-df.latescan[,13:104]
# for.plsda$WillConvert<-as.factor(for.plsda$WillConvert)
#
# latescanresult<-replicate(500, createConfusionDistribution(for.plsda), simplify = FALSE)
#
# ErrorRate<-unlist(lapply(latescanresult, '[[', 1))
# ImportantComponents<-sapply(latescanresult, function(m) m[2])
# Matrix<-sapply(latescanresult, function(m) m[3])
#
# write.csv(ErrorRate, 'Y:/julie/Cathy/Data/PLSDA_latescan_errorrate.csv' )
# lapply(ImportantComponents, function(x) write.table( data.frame(x), 'Y:/julie/Cathy/Data/PLSDA_latescan_ImptCmpt.csv' , append= T, sep=',' ))
# lapply(Matrix, function(x) write.table( data.frame(x), 'Y:/julie/Cathy/Data/PLSDA_latescan_Matrix.csv' , append= T, sep=',' ))
To render the Rmd file properly, I can’t run the 500-iteration permutation test inside the document, so I ran it over the weekend and instead just read the results of the permutation tests in here.
pls_early<-read.csv('Y:/julie/Cathy/Data/PLSDA_earlyscan_errorrate.csv', header = TRUE)
pls_late<-read.csv('Y:/julie/Cathy/Data/PLSDA_latescan_errorrate.csv', header = TRUE)
colnames(pls_early)<-c("iter.", "ErrorRate")
colnames(pls_late)<-c("iter.", "ErrorRate")
The predictive capability of the FC matrices is… nonexistent. It’s terrible. The balanced error rate hovers around 0.50 for both the early and the late scans, and the difference between the two is not significant, which is also unfortunate.
t.test(pls_early$ErrorRate, pls_late$ErrorRate)
##
## Welch Two Sample t-test
##
## data: pls_early$ErrorRate and pls_late$ErrorRate
## t = 0.48619, df = 776.2, p-value = 0.627
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.003435975 0.005698277
## sample estimates:
## mean of x mean of y
## 0.5052282 0.5040971
I simulate randomly guessing whether someone will convert or not, 500 times, to see if this classification approach works better than random guessing. It doesn’t, really.
ConfusionRate<-numeric(0)
for(i in 1:500){
  sample.space <- c(0,1)
  theta1 <- 0.1904762 #proportion of individuals who convert in our sample cohort (20/105)
  theta2 <- 0.5 #unbiased coin for the guesses
  N <- 500 #number of guesses per iteration
  flips <- sample(sample.space,
                  size = N,
                  replace = TRUE,
                  prob = c(theta1, 1 - theta1))
  flips2 <- sample(sample.space,
                   size = N,
                   replace = TRUE,
                   prob = c(theta2, 1 - theta2))
  confusion.mat <- get.confusion_matrix(truth = as.factor(flips), predicted = flips2)
  ConfusionRate[i]<-get.BER(confusion.mat)
}
Here is a plot of how terrible this technique is for this problem.
library(ggpubr)
library(tidyr)
#http://www.sthda.com/english/rpkgs/ggpubr/reference/stat_compare_means.html
plottingDF<-cbind(ConfusionRate, pls_early$ErrorRate, pls_late$ErrorRate)
colnames(plottingDF)<-c("Random", "Early Prediction", "Late Prediction")
plottingDF<-as.data.frame(plottingDF)
plottingDF_long<-gather(plottingDF, groups, errorrate, Random:'Late Prediction', factor_key = TRUE)
my_comparisons<-list(c("Early Prediction", "Late Prediction"), c("Early Prediction", "Random"), c("Late Prediction", "Random"))
p<-ggboxplot(data = plottingDF_long, x = "groups", y = "errorrate", color = "groups", add = c("jitter", "mean"), palette = "Set1", outlier.shape = NA)
p<- p +stat_compare_means(comparisons = my_comparisons)#+ stat_compare_means(label.x = 1.75, label.y = 0.85)
p + xlab("Group") + ylab("Error Rate") + ggtitle("Comparison of CDR Predictive Abilities by Time")+theme(plot.title = element_text(hjust = 0.5, family = "Trebuchet MS", color="black", face="bold", size=14)) +
theme(axis.title = element_text(family = "Trebuchet MS", color="black", face="bold", size=14))
Sigh. So then I tried basically normalizing the difference between baseline and most recent scan for each of the 105 individuals we’re considering: (Scan2 - Scan1) / weeks between scans. These results were also a real bummer.
## [1] "Accuracy = 0.391464484068986"
## [1] "Sensitivity = 0.260115606936416"
## [1] "Specificity = 0.393487875100175"
Here are spaghetti plots on a region-by-region basis. I’m not sure how useful they are either.
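For reference, a minimal ggplot2 sketch of how one of these region-wise spaghetti plots can be built, assuming df.all holds one row per scan; "fc_region1" is a hypothetical column name standing in for whichever region is plotted:
library(ggplot2)
#one line per participant for a single region's FC values over time
#"fc_region1" is a placeholder; substitute any FC column in df.all
ggplot(df.all, aes(x = date2, y = fc_region1, group = ID,
                   color = factor(WillConvert))) +
  geom_line(alpha = 0.5) +
  geom_point(size = 0.8) +
  labs(x = "Scan date", y = "Functional connectivity", color = "Will convert") +
  theme_minimal()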