Library
library(randomForest);library(leaps);library(pROC);library(ggplot2);library(reshape2);library(plotly)
predict.regsubsets <- function (object ,newdata ,id ,...) {
form=as.formula (object$call [[2]])
mat=model.matrix (form ,newdata )
coefi =coef(object ,id=id)
xvars =names (coefi )
mat[,xvars ]%*% coefi
}
Data
Subject 2 is an outlier, so we deleted it.
19 subjects remain
Create a factor variable
dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")[-12,c(2,10:39)]
dta$Word_C <-as.factor(ifelse(dta$Word > median(dta$Word), "H","L"))
head(dta[,c(1:4,32)])
Word Sex Age Channel_2_Channel_1 Word_C
1 125 M 10.41 0.2485940 H
2 102 M 9.95 0.7715194 L
3 57 M 10.22 0.8711941 L
4 88 M 9.68 0.2212408 L
5 114 F 10.35 0.3854612 H
6 74 F 10.37 0.7481600 L
dta2 <- dta[,-1]
rownames(dta2) <- 1:nrow(dta2)
Word_C : Linear model versus RF
1 v.s.1 varaible
fit_lm <- list()
auc_lm <- data.frame(AUC = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
fit_rf <- list()
auc_rf <- data.frame(AUC = NA, variable = NA)
for (i in 1:30){
fit_lm[[i]] <- summary(fit <- glm(Word_C~.,
data = dta2[,c(i,31)],family = "binomial"))
names(fit_lm)[i] <- colnames(dta2)[i]
yhat <- predict(fit,data = dta2[,c(i,31)],type = "response")
auc_lm <- rbind(auc_lm,
data.frame(AUC = pROC::auc(dta2$Word_C,yhat),
variable = colnames(dta2)[i]))
p_lm <- rbind(p_lm, data.frame(pvalue = fit_lm[[i]]$coefficient[2,4], variable = colnames(dta2)[i]))
fit_rf[[i]] <- randomForest(Word_C~ .,
data = dta2[,c(i,31)],
mtry = 1,
ntree = 10001,
nodesize = 5,
importance = F)
names(fit_rf)[i] <- colnames(dta2)[i]
yhat <- predict(fit_rf[[i]],data = dta2[,c(i,31)],type = "prob")[,2]
auc_rf <- rbind(auc_rf,
data.frame(AUC = pROC::auc(dta2$Word_C,yhat),
variable = colnames(dta2)[i]))
}
Comparison
na.omit(cbind(p_lm[order(p_lm$pvalue),c(2,1)],auc_lm[order(-auc_lm$AUC),c(2,1)],auc_rf[order(-auc_rf$AUC),c(2,1)]))
variable pvalue variable AUC
29 Channel_8_Channel_5 0.04705111 Channel_4_Channel_1 0.7777778
7 Channel_4_Channel_1 0.05937883 Channel_8_Channel_5 0.7777778
8 Channel_4_Channel_2 0.08392544 Channel_5_Channel_4 0.7555556
13 Channel_5_Channel_4 0.09553314 Channel_3_Channel_1 0.7444444
12 Channel_5_Channel_3 0.09629423 Channel_7_Channel_1 0.7444444
19 Channel_7_Channel_1 0.10024331 Channel_5_Channel_3 0.7333333
5 Channel_3_Channel_1 0.11573316 Channel_4_Channel_2 0.7222222
23 Channel_7_Channel_5 0.14191010 Channel_2_Channel_1 0.7111111
6 Channel_3_Channel_2 0.16463977 Channel_7_Channel_5 0.7111111
4 Channel_2_Channel_1 0.18213073 Channel_7_Channel_2 0.6888889
20 Channel_7_Channel_2 0.21802741 Channel_5_Channel_1 0.6777778
14 Channel_6_Channel_1 0.26819310 Channel_3_Channel_2 0.6555556
28 Channel_8_Channel_4 0.28798293 Channel_6_Channel_1 0.6555556
26 Channel_8_Channel_2 0.28829268 Channel_6_Channel_3 0.6444444
3 Age 0.29008348 Channel_8_Channel_2 0.6444444
21 Channel_7_Channel_3 0.30960563 Channel_5_Channel_2 0.6222222
25 Channel_8_Channel_1 0.34049363 Channel_7_Channel_3 0.6222222
16 Channel_6_Channel_3 0.34369258 Channel_8_Channel_7 0.6222222
31 Channel_8_Channel_7 0.37323461 Channel_7_Channel_6 0.6111111
17 Channel_6_Channel_4 0.37615053 Channel_8_Channel_4 0.6111111
22 Channel_7_Channel_4 0.43048479 Age 0.6055556
11 Channel_5_Channel_2 0.43153040 Channel_8_Channel_1 0.5888889
24 Channel_7_Channel_6 0.57684330 Channel_4_Channel_3 0.5777778
15 Channel_6_Channel_2 0.62888762 Channel_6_Channel_4 0.5666667
9 Channel_4_Channel_3 0.64173084 Channel_7_Channel_4 0.5444444
10 Channel_5_Channel_1 0.74926410 Channel_6_Channel_5 0.5333333
18 Channel_6_Channel_5 0.77826675 Sex 0.5222222
30 Channel_8_Channel_6 0.78451749 Channel_8_Channel_6 0.5111111
2 Sex 0.84473144 Channel_6_Channel_2 0.4666667
27 Channel_8_Channel_3 0.89285636 Channel_8_Channel_3 0.4666667
variable AUC
29 Sex 1.0000000
7 Channel_6_Channel_5 0.8888889
8 Channel_8_Channel_3 0.8444444
13 Channel_8_Channel_2 0.7666667
12 Channel_5_Channel_4 0.7000000
19 Channel_6_Channel_1 0.6888889
5 Channel_7_Channel_6 0.6888889
23 Channel_7_Channel_3 0.6777778
6 Channel_5_Channel_3 0.6666667
4 Channel_6_Channel_4 0.6555556
20 Channel_7_Channel_5 0.6333333
14 Channel_6_Channel_3 0.6222222
28 Channel_4_Channel_3 0.6222222
26 Channel_8_Channel_5 0.6111111
3 Channel_4_Channel_1 0.6111111
21 Channel_6_Channel_2 0.6111111
25 Channel_8_Channel_1 0.6111111
16 Channel_3_Channel_2 0.6000000
31 Channel_7_Channel_2 0.6000000
17 Channel_7_Channel_4 0.6000000
22 Channel_3_Channel_1 0.5888889
11 Channel_8_Channel_6 0.5777778
24 Channel_5_Channel_2 0.5555556
15 Channel_8_Channel_7 0.5444444
9 Channel_2_Channel_1 0.5333333
10 Age 0.5111111
18 Channel_4_Channel_2 0.4555556
30 Channel_8_Channel_4 0.4333333
2 Channel_7_Channel_1 0.4333333
27 Channel_5_Channel_1 0.4222222
Word_C : Linear model versus RF with cv
1 v.s.1 varaible
idc_train <- c(sample(which(dta2\(Word_C == "H"), 5),sample(which(dta2\)Word_C == “L”), 5))
idc_test <- -idc_train
auc_lm_v <- data.frame(AUC = NA, variable = NA)
auc_rf_v <- data.frame(AUC = NA, variable = NA)
for (i in 1:30){
auc_mat_lm <- matrix(NA,1000,1)
auc_mat_rf <- matrix(NA,1000,1)
for (k in 1:1000){
set.seed(k)
idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
idc_test <- -idc_train
if (!mean(as.numeric(dta2[idc_train,c(i,31)]$Word_C)) %in% c(2,1) &
!mean(as.numeric(dta2[idc_test,c(i,31)]$Word_C)) %in% c(2,1)){
fit_lm <- glm(Word_C~.,data = dta2[idc_train,c(i,31)],
family = "binomial")
yhat <- predict(fit_lm,newdata = dta2[idc_test,c(i,31)],
type = "response")
auc_mat_lm[k,] <- pROC::auc(dta2[idc_test,c(i,31)]$Word_C,yhat)
fit_rf <- randomForest(Word_C~ .,
data = dta2[idc_train,c(i,31)],
mtry = 1,
ntree = 10001,
nodesize = 5,
importance = F)
yhat <- predict(fit_rf,newdata = dta2[idc_test,c(i,31)],
type = "prob")[,2]
auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,c(i,31)]$Word_C,yhat)
}
}
auc_lm_v <- rbind(auc_lm_v,data.frame(AUC = mean(auc_mat_lm,na.rm = T),
variable = colnames(dta2)[i]))
auc_rf_v <- rbind(auc_rf_v,data.frame(AUC = mean(auc_mat_rf,na.rm = T),
variable = colnames(dta2)[i]))
}
Comparison
na.omit(cbind(auc_lm_v[order(-auc_lm_v$AUC),],auc_rf_v[order(-auc_rf_v$AUC),]))
AUC variable AUC variable
29 0.775975 Channel_8_Channel_5 0.711525 Channel_6_Channel_5
7 0.769650 Channel_4_Channel_1 0.709600 Channel_5_Channel_4
13 0.750025 Channel_5_Channel_4 0.697475 Channel_8_Channel_5
5 0.741700 Channel_3_Channel_1 0.687875 Channel_6_Channel_3
12 0.731475 Channel_5_Channel_3 0.674700 Channel_7_Channel_6
19 0.728375 Channel_7_Channel_1 0.661525 Channel_4_Channel_1
8 0.723150 Channel_4_Channel_2 0.659075 Channel_3_Channel_1
4 0.711175 Channel_2_Channel_1 0.658525 Channel_7_Channel_5
23 0.699950 Channel_7_Channel_5 0.656575 Channel_5_Channel_1
20 0.690275 Channel_7_Channel_2 0.650250 Channel_8_Channel_2
10 0.687250 Channel_5_Channel_1 0.649675 Channel_8_Channel_3
6 0.670800 Channel_3_Channel_2 0.647850 Channel_7_Channel_2
14 0.664250 Channel_6_Channel_1 0.644200 Channel_7_Channel_1
21 0.654875 Channel_7_Channel_3 0.638000 Channel_2_Channel_1
26 0.650600 Channel_8_Channel_2 0.635500 Channel_8_Channel_6
11 0.649950 Channel_5_Channel_2 0.633750 Channel_7_Channel_3
16 0.647525 Channel_6_Channel_3 0.628075 Channel_8_Channel_1
28 0.637075 Channel_8_Channel_4 0.626950 Age
31 0.635925 Channel_8_Channel_7 0.621050 Channel_4_Channel_3
24 0.634725 Channel_7_Channel_6 0.620450 Channel_5_Channel_2
3 0.634650 Age 0.617425 Channel_4_Channel_2
25 0.632300 Channel_8_Channel_1 0.613025 Channel_8_Channel_7
15 0.619200 Channel_6_Channel_2 0.610800 Channel_5_Channel_3
17 0.619200 Channel_6_Channel_4 0.609625 Channel_8_Channel_4
18 0.617350 Channel_6_Channel_5 0.609025 Channel_6_Channel_2
22 0.614450 Channel_7_Channel_4 0.608450 Channel_6_Channel_4
27 0.606100 Channel_8_Channel_3 0.607750 Channel_6_Channel_1
9 0.601900 Channel_4_Channel_3 0.603175 Channel_3_Channel_2
30 0.595600 Channel_8_Channel_6 0.602425 Channel_7_Channel_4
2 0.559650 Sex 0.564225 Sex
Best lm versus Best RF with CV
Lm : Ch8_Ch5
auc_mat0 <- matrix(NA,1000,1)
auc_mat1 <- matrix(NA,1000,1)
auc_mat2 <- matrix(NA,1000,1)
for (k in 1:1000){
set.seed(k)
idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
idc_test <- -idc_train
# CV AUC
# Channel_8_Channel_5
fit0 <- glm(Word_C~Channel_8_Channel_5,
data = dta2[idc_train,],family = "binomial")
yhat0 <- predict(fit0,newdata = dta2[idc_test,],type = "response")
auc_mat0[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat0)
# Channel_8_Channel_5+Channel_4_Channel_1
fit1 <- glm(Word_C~Channel_8_Channel_5+Channel_4_Channel_1,
data = dta2[idc_train,],family = "binomial")
yhat1 <- predict(fit1,newdata = dta2[idc_test,],type = "response")
auc_mat1[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat1)
# Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4
fit2 <- glm(Word_C~ Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4,data = dta2[idc_train,],
family = "binomial")
yhat2 <- predict(fit2,newdata = dta2[idc_test,],type = "response")
auc_mat2[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat2)
}
mean(auc_mat0,na.rm = T) # Channel_8_Channel_5
[1] 0.775975
mean(auc_mat1,na.rm = T) # Channel_8_Channel_5+Channel_4_Channel_1
[1] 0.72325
mean(auc_mat2,na.rm = T) # Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4
[1] 0.65735
Word_C : RF
Cross Validation : tuning
# Feature Selection (after CV tuning decide mtry and nodesize)
# CV tuning decide mtry and nodesize
# tune_mat_full <- matrix(NA,30,5)
# for (m in 1:30){
# for (n in 1:5){
# auc_mat_rf <- matrix(NA,1000,1)
# for (k in 1:1000){
# set.seed(k)
# if (k %% 250 == 0) {print(paste("Mtry = ",m,"Node = ", n ,"k = ", k))}
# set.seed(k)
# idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
# idc_test <- -idc_train
# fit <- randomForest(Word_C~ .,data = dta2[idc_train,],
# mtry = m,
# ntree = 10001,
# nodesize = n, importance = F)
# yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
# auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat)
# }
# print(mean(auc_mat_rf,na.rm =T))
# tune_mat_full[m,n] <- mean(auc_mat_rf,na.rm =T)
# }
#}
#max(tune_mat_full)
#tune_mat_full
fit_full <- randomForest(Word_C~ .,data = dta2,
mtry = 1,
ntree = 1000001,
nodesize = 5,
importance = T)
yhat <- predict(fit_full,newdata = dta2,type = "prob")[,2]
pROC::auc(dta2$Word_C,yhat)
Area under the curve: 1
varImpPlot(fit_full)

fit_full$importance[order(-fit_full$importance[,3]),c(3,4)] # CE
MeanDecreaseAccuracy MeanDecreaseGini
Channel_8_Channel_5 0.0073105149 0.27508190
Channel_4_Channel_1 0.0055728360 0.24422345
Channel_5_Channel_4 0.0043251927 0.23778901
Channel_2_Channel_1 0.0040151260 0.22433021
Channel_5_Channel_1 0.0033328620 0.24011573
Channel_3_Channel_1 0.0029110475 0.22947003
Channel_6_Channel_3 0.0027515834 0.20264659
Channel_7_Channel_5 0.0024714176 0.22091118
Channel_7_Channel_1 0.0014196826 0.22310558
Channel_7_Channel_2 0.0013452420 0.19768204
Channel_5_Channel_3 0.0008443240 0.21038460
Channel_4_Channel_2 0.0006896350 0.19388925
Channel_6_Channel_2 0.0005462532 0.19209518
Channel_8_Channel_7 0.0001372718 0.17969180
Channel_7_Channel_3 -0.0003314785 0.17620762
Channel_8_Channel_6 -0.0003889560 0.17276604
Channel_3_Channel_2 -0.0007186635 0.17748355
Channel_5_Channel_2 -0.0007367849 0.17548471
Channel_7_Channel_6 -0.0008509356 0.17080864
Channel_6_Channel_1 -0.0008990143 0.18681772
Channel_7_Channel_4 -0.0009184747 0.17741840
Channel_8_Channel_4 -0.0012022443 0.16805933
Sex -0.0012152033 0.04481467
Age -0.0012846252 0.16166101
Channel_4_Channel_3 -0.0018157501 0.16565388
Channel_8_Channel_2 -0.0020604720 0.16526608
Channel_6_Channel_4 -0.0022499477 0.16206524
Channel_8_Channel_1 -0.0023023254 0.15423285
Channel_8_Channel_3 -0.0024979483 0.16023286
Channel_6_Channel_5 -0.0046473410 0.14029381
fit_full$importance[order(-fit_full$importance[,4]),c(3,4)] # Gini
MeanDecreaseAccuracy MeanDecreaseGini
Channel_8_Channel_5 0.0073105149 0.27508190
Channel_4_Channel_1 0.0055728360 0.24422345
Channel_5_Channel_1 0.0033328620 0.24011573
Channel_5_Channel_4 0.0043251927 0.23778901
Channel_3_Channel_1 0.0029110475 0.22947003
Channel_2_Channel_1 0.0040151260 0.22433021
Channel_7_Channel_1 0.0014196826 0.22310558
Channel_7_Channel_5 0.0024714176 0.22091118
Channel_5_Channel_3 0.0008443240 0.21038460
Channel_6_Channel_3 0.0027515834 0.20264659
Channel_7_Channel_2 0.0013452420 0.19768204
Channel_4_Channel_2 0.0006896350 0.19388925
Channel_6_Channel_2 0.0005462532 0.19209518
Channel_6_Channel_1 -0.0008990143 0.18681772
Channel_8_Channel_7 0.0001372718 0.17969180
Channel_3_Channel_2 -0.0007186635 0.17748355
Channel_7_Channel_4 -0.0009184747 0.17741840
Channel_7_Channel_3 -0.0003314785 0.17620762
Channel_5_Channel_2 -0.0007367849 0.17548471
Channel_8_Channel_6 -0.0003889560 0.17276604
Channel_7_Channel_6 -0.0008509356 0.17080864
Channel_8_Channel_4 -0.0012022443 0.16805933
Channel_4_Channel_3 -0.0018157501 0.16565388
Channel_8_Channel_2 -0.0020604720 0.16526608
Channel_6_Channel_4 -0.0022499477 0.16206524
Age -0.0012846252 0.16166101
Channel_8_Channel_3 -0.0024979483 0.16023286
Channel_8_Channel_1 -0.0023023254 0.15423285
Channel_6_Channel_5 -0.0046473410 0.14029381
Sex -0.0012152033 0.04481467
Tuning
## 6 variables select from CE and Gini
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+
## Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 ## 0.837975
## 5 variables
## Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 # 0.8116
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 # 0.8454 ##
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 # 0.830325
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_3_Channel_1
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1 # 0.826825
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_5_Channel_1+Channel_3_Channel_1 # 0.823525
## 4 variables
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1 # 0.804125
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_3_Channel_1 # 0.812475
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_5_Channel_1+Channel_3_Channel_1 # 0.829
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 # 0.838375 ##
## Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 # 0.80765
## 3 variables
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_5_Channel_1 ## 0.787375
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_3_Channel_1 ## 0.8106
## Channel_8_Channel_5+Channel_5_Channel_1+Channel_3_Channel_1 ## 0.810325
## Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1
## 2 variables
## Channel_8_Channel_5+Channel_2_Channel_1 ## 0.778325
## Channel_8_Channel_5+Channel_3_Channel_1 ## 0.771925
## Channel_2_Channel_1+Channel_3_Channel_1 ## 0.75915
RF
Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+
Channel_5_Channel_1+Channel_3_Channel_1
auc_mat_rf <- matrix(NA,1000,1)
for (k in 1:1000){
set.seed(k)
idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
idc_test <- -idc_train
fit <- randomForest(Word_C~ Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+
Channel_5_Channel_1+Channel_3_Channel_1,
data = dta2[idc_train,],
mtry = 1,
ntree = 10001,
nodesize = 5, importance = F)
yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf)
[1] 0.8454
quantile(auc_mat_rf,probs = c(0.025,0.975))
2.5% 97.5%
0.6 1.0
LM : Ch8_Ch5
mean(auc_mat0)
[1] 0.775975
quantile(auc_mat0,probs = c(0.025,0.975))
2.5% 97.5%
0.55 1.00
Comparsion
p <- melt(auc_mat_rf)
p$var <- "RF:5"
p2 <- melt(auc_mat0)
p2$var <- "LM:1"
p3 <- rbind(p[,3:4],p2[,3:4])
Fig1 <- ggplot(p3,aes(value,fill = var, col =var))+
geom_density(alpha = 0.3)+
theme_bw()+
xlab("AUC")+
scale_fill_manual(values = c("blue","orange"),name = "Model")+
scale_colour_manual(values = c("blue","orange"),name = "Model")
Fig1+geom_vline(xintercept = c(0.55,mean(auc_mat0),1.003),col = "blue",size = 1.5)+
geom_vline(xintercept = c(0.6,mean(auc_mat_rf),1.0),col = "orange",size = 1.2)

ggplotly(Fig1)
Lm : only Ch8_Ch5, auc = 0.78
RF : 5 variables auc = 0.85
Variable selection for the best model
1. best from overall cv
2. best from 1 v.s 1 rf
3. best from lm