Library

library(randomForest);library(leaps);library(pROC);library(ggplot2);library(reshape2);library(plotly)
predict.regsubsets <- function (object ,newdata ,id ,...) {
    form=as.formula (object$call [[2]])
    mat=model.matrix (form ,newdata )
    coefi =coef(object ,id=id)
    xvars =names (coefi )
    mat[,xvars ]%*% coefi
}

Data

Subject 2 is an outlier, so we deleted it.
19 subjects remain
Create a factor variable
dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")[-12,c(2,10:39)]
dta$Word_C <-as.factor(ifelse(dta$Word > median(dta$Word), "H","L"))
head(dta[,c(1:4,32)])
  Word Sex   Age Channel_2_Channel_1 Word_C
1  125   M 10.41           0.2485940      H
2  102   M  9.95           0.7715194      L
3   57   M 10.22           0.8711941      L
4   88   M  9.68           0.2212408      L
5  114   F 10.35           0.3854612      H
6   74   F 10.37           0.7481600      L
dta2 <- dta[,-1]
rownames(dta2) <- 1:nrow(dta2)

Word_C : Linear model versus RF

1 v.s.1 varaible
fit_lm <- list()
auc_lm <- data.frame(AUC = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
fit_rf <- list()
auc_rf <- data.frame(AUC = NA, variable = NA)
for (i in 1:30){
        fit_lm[[i]] <- summary(fit <- glm(Word_C~.,
                                          data = dta2[,c(i,31)],family = "binomial"))
        names(fit_lm)[i] <- colnames(dta2)[i]
        yhat <- predict(fit,data = dta2[,c(i,31)],type = "response")
        auc_lm <- rbind(auc_lm,
                        data.frame(AUC = pROC::auc(dta2$Word_C,yhat),
                                   variable = colnames(dta2)[i]))
        p_lm <- rbind(p_lm, data.frame(pvalue = fit_lm[[i]]$coefficient[2,4], variable = colnames(dta2)[i]))
        fit_rf[[i]] <- randomForest(Word_C~ .,
                                    data = dta2[,c(i,31)],
                                    mtry = 1,
                                    ntree = 10001,
                                    nodesize = 5, 
                                    importance = F)
        names(fit_rf)[i] <- colnames(dta2)[i]
        yhat <- predict(fit_rf[[i]],data = dta2[,c(i,31)],type = "prob")[,2]
        auc_rf <- rbind(auc_rf,
                        data.frame(AUC = pROC::auc(dta2$Word_C,yhat),
                                   variable = colnames(dta2)[i]))
}

Comparison

na.omit(cbind(p_lm[order(p_lm$pvalue),c(2,1)],auc_lm[order(-auc_lm$AUC),c(2,1)],auc_rf[order(-auc_rf$AUC),c(2,1)]))
              variable     pvalue            variable       AUC
29 Channel_8_Channel_5 0.04705111 Channel_4_Channel_1 0.7777778
7  Channel_4_Channel_1 0.05937883 Channel_8_Channel_5 0.7777778
8  Channel_4_Channel_2 0.08392544 Channel_5_Channel_4 0.7555556
13 Channel_5_Channel_4 0.09553314 Channel_3_Channel_1 0.7444444
12 Channel_5_Channel_3 0.09629423 Channel_7_Channel_1 0.7444444
19 Channel_7_Channel_1 0.10024331 Channel_5_Channel_3 0.7333333
5  Channel_3_Channel_1 0.11573316 Channel_4_Channel_2 0.7222222
23 Channel_7_Channel_5 0.14191010 Channel_2_Channel_1 0.7111111
6  Channel_3_Channel_2 0.16463977 Channel_7_Channel_5 0.7111111
4  Channel_2_Channel_1 0.18213073 Channel_7_Channel_2 0.6888889
20 Channel_7_Channel_2 0.21802741 Channel_5_Channel_1 0.6777778
14 Channel_6_Channel_1 0.26819310 Channel_3_Channel_2 0.6555556
28 Channel_8_Channel_4 0.28798293 Channel_6_Channel_1 0.6555556
26 Channel_8_Channel_2 0.28829268 Channel_6_Channel_3 0.6444444
3                  Age 0.29008348 Channel_8_Channel_2 0.6444444
21 Channel_7_Channel_3 0.30960563 Channel_5_Channel_2 0.6222222
25 Channel_8_Channel_1 0.34049363 Channel_7_Channel_3 0.6222222
16 Channel_6_Channel_3 0.34369258 Channel_8_Channel_7 0.6222222
31 Channel_8_Channel_7 0.37323461 Channel_7_Channel_6 0.6111111
17 Channel_6_Channel_4 0.37615053 Channel_8_Channel_4 0.6111111
22 Channel_7_Channel_4 0.43048479                 Age 0.6055556
11 Channel_5_Channel_2 0.43153040 Channel_8_Channel_1 0.5888889
24 Channel_7_Channel_6 0.57684330 Channel_4_Channel_3 0.5777778
15 Channel_6_Channel_2 0.62888762 Channel_6_Channel_4 0.5666667
9  Channel_4_Channel_3 0.64173084 Channel_7_Channel_4 0.5444444
10 Channel_5_Channel_1 0.74926410 Channel_6_Channel_5 0.5333333
18 Channel_6_Channel_5 0.77826675                 Sex 0.5222222
30 Channel_8_Channel_6 0.78451749 Channel_8_Channel_6 0.5111111
2                  Sex 0.84473144 Channel_6_Channel_2 0.4666667
27 Channel_8_Channel_3 0.89285636 Channel_8_Channel_3 0.4666667
              variable       AUC
29                 Sex 1.0000000
7  Channel_6_Channel_5 0.8888889
8  Channel_8_Channel_3 0.8444444
13 Channel_8_Channel_2 0.7666667
12 Channel_5_Channel_4 0.7000000
19 Channel_6_Channel_1 0.6888889
5  Channel_7_Channel_6 0.6888889
23 Channel_7_Channel_3 0.6777778
6  Channel_5_Channel_3 0.6666667
4  Channel_6_Channel_4 0.6555556
20 Channel_7_Channel_5 0.6333333
14 Channel_6_Channel_3 0.6222222
28 Channel_4_Channel_3 0.6222222
26 Channel_8_Channel_5 0.6111111
3  Channel_4_Channel_1 0.6111111
21 Channel_6_Channel_2 0.6111111
25 Channel_8_Channel_1 0.6111111
16 Channel_3_Channel_2 0.6000000
31 Channel_7_Channel_2 0.6000000
17 Channel_7_Channel_4 0.6000000
22 Channel_3_Channel_1 0.5888889
11 Channel_8_Channel_6 0.5777778
24 Channel_5_Channel_2 0.5555556
15 Channel_8_Channel_7 0.5444444
9  Channel_2_Channel_1 0.5333333
10                 Age 0.5111111
18 Channel_4_Channel_2 0.4555556
30 Channel_8_Channel_4 0.4333333
2  Channel_7_Channel_1 0.4333333
27 Channel_5_Channel_1 0.4222222

Word_C : Linear model versus RF with cv

1 v.s.1 varaible
idc_train <- c(sample(which(dta2\(Word_C == "H"), 5),sample(which(dta2\)Word_C == “L”), 5))
idc_test <- -idc_train
auc_lm_v <- data.frame(AUC = NA, variable = NA)
auc_rf_v <- data.frame(AUC = NA, variable = NA)
for (i in 1:30){
        auc_mat_lm <- matrix(NA,1000,1)
        auc_mat_rf <- matrix(NA,1000,1)
        for (k in 1:1000){
                set.seed(k)
                idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
                idc_test <- -idc_train
                 if  (!mean(as.numeric(dta2[idc_train,c(i,31)]$Word_C)) %in% c(2,1) &
                      !mean(as.numeric(dta2[idc_test,c(i,31)]$Word_C)) %in% c(2,1)){
                               fit_lm <- glm(Word_C~.,data = dta2[idc_train,c(i,31)],
                                          family = "binomial")
                               yhat <- predict(fit_lm,newdata = dta2[idc_test,c(i,31)],
                                               type = "response")
                               auc_mat_lm[k,] <- pROC::auc(dta2[idc_test,c(i,31)]$Word_C,yhat)
                               fit_rf <- randomForest(Word_C~ .,
                                                   data = dta2[idc_train,c(i,31)],
                                                   mtry = 1,
                                                   ntree = 10001,
                                                   nodesize = 5, 
                                                   importance = F)
                               yhat <- predict(fit_rf,newdata = dta2[idc_test,c(i,31)],
                                               type = "prob")[,2]
                               auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,c(i,31)]$Word_C,yhat)
                      }
        }
        auc_lm_v <- rbind(auc_lm_v,data.frame(AUC = mean(auc_mat_lm,na.rm = T),
                                              variable = colnames(dta2)[i]))
        auc_rf_v <- rbind(auc_rf_v,data.frame(AUC = mean(auc_mat_rf,na.rm = T),
                                              variable = colnames(dta2)[i]))
}

Comparison

na.omit(cbind(auc_lm_v[order(-auc_lm_v$AUC),],auc_rf_v[order(-auc_rf_v$AUC),]))
        AUC            variable      AUC            variable
29 0.775975 Channel_8_Channel_5 0.711525 Channel_6_Channel_5
7  0.769650 Channel_4_Channel_1 0.709600 Channel_5_Channel_4
13 0.750025 Channel_5_Channel_4 0.697475 Channel_8_Channel_5
5  0.741700 Channel_3_Channel_1 0.687875 Channel_6_Channel_3
12 0.731475 Channel_5_Channel_3 0.674700 Channel_7_Channel_6
19 0.728375 Channel_7_Channel_1 0.661525 Channel_4_Channel_1
8  0.723150 Channel_4_Channel_2 0.659075 Channel_3_Channel_1
4  0.711175 Channel_2_Channel_1 0.658525 Channel_7_Channel_5
23 0.699950 Channel_7_Channel_5 0.656575 Channel_5_Channel_1
20 0.690275 Channel_7_Channel_2 0.650250 Channel_8_Channel_2
10 0.687250 Channel_5_Channel_1 0.649675 Channel_8_Channel_3
6  0.670800 Channel_3_Channel_2 0.647850 Channel_7_Channel_2
14 0.664250 Channel_6_Channel_1 0.644200 Channel_7_Channel_1
21 0.654875 Channel_7_Channel_3 0.638000 Channel_2_Channel_1
26 0.650600 Channel_8_Channel_2 0.635500 Channel_8_Channel_6
11 0.649950 Channel_5_Channel_2 0.633750 Channel_7_Channel_3
16 0.647525 Channel_6_Channel_3 0.628075 Channel_8_Channel_1
28 0.637075 Channel_8_Channel_4 0.626950                 Age
31 0.635925 Channel_8_Channel_7 0.621050 Channel_4_Channel_3
24 0.634725 Channel_7_Channel_6 0.620450 Channel_5_Channel_2
3  0.634650                 Age 0.617425 Channel_4_Channel_2
25 0.632300 Channel_8_Channel_1 0.613025 Channel_8_Channel_7
15 0.619200 Channel_6_Channel_2 0.610800 Channel_5_Channel_3
17 0.619200 Channel_6_Channel_4 0.609625 Channel_8_Channel_4
18 0.617350 Channel_6_Channel_5 0.609025 Channel_6_Channel_2
22 0.614450 Channel_7_Channel_4 0.608450 Channel_6_Channel_4
27 0.606100 Channel_8_Channel_3 0.607750 Channel_6_Channel_1
9  0.601900 Channel_4_Channel_3 0.603175 Channel_3_Channel_2
30 0.595600 Channel_8_Channel_6 0.602425 Channel_7_Channel_4
2  0.559650                 Sex 0.564225                 Sex

Best lm versus Best RF with CV

Lm : Ch8_Ch5

auc_mat0 <- matrix(NA,1000,1)
auc_mat1 <- matrix(NA,1000,1)
auc_mat2 <- matrix(NA,1000,1)

for (k in 1:1000){
        set.seed(k)
        idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
        idc_test <- -idc_train
        # CV AUC
        # Channel_8_Channel_5
        fit0 <- glm(Word_C~Channel_8_Channel_5,
                    data = dta2[idc_train,],family = "binomial")
        yhat0 <- predict(fit0,newdata = dta2[idc_test,],type = "response")
        auc_mat0[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat0)
        # Channel_8_Channel_5+Channel_4_Channel_1
        fit1 <- glm(Word_C~Channel_8_Channel_5+Channel_4_Channel_1,
                    data = dta2[idc_train,],family = "binomial")
        yhat1 <- predict(fit1,newdata = dta2[idc_test,],type = "response")
        auc_mat1[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat1)
        # Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4
        fit2 <- glm(Word_C~ Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4,data = dta2[idc_train,],
                    family = "binomial")
        yhat2 <- predict(fit2,newdata = dta2[idc_test,],type = "response")
        auc_mat2[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat2)
}
mean(auc_mat0,na.rm = T) # Channel_8_Channel_5
[1] 0.775975
mean(auc_mat1,na.rm = T) # Channel_8_Channel_5+Channel_4_Channel_1
[1] 0.72325
mean(auc_mat2,na.rm = T) # Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4
[1] 0.65735

Word_C : RF

Cross Validation : tuning
# Feature Selection (after CV tuning decide mtry and nodesize)
# CV tuning decide mtry and nodesize
# tune_mat_full <- matrix(NA,30,5)
# for (m in 1:30){
#  for (n in 1:5){
#    auc_mat_rf <- matrix(NA,1000,1)
#    for (k in 1:1000){
#      set.seed(k)
#      if (k %% 250 == 0) {print(paste("Mtry = ",m,"Node = ", n ,"k = ", k))}
#      set.seed(k)
#      idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
#      idc_test <- -idc_train
#      fit <- randomForest(Word_C~ .,data = dta2[idc_train,],
#                            mtry = m,
#                            ntree = 10001,
#                            nodesize = n, importance = F)
#      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
#      auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat)
#    }
#    print(mean(auc_mat_rf,na.rm =T))
#    tune_mat_full[m,n] <- mean(auc_mat_rf,na.rm =T)
#  }
#}
#max(tune_mat_full)
#tune_mat_full
fit_full <- randomForest(Word_C~ .,data = dta2,
                    mtry = 1,
                    ntree = 1000001,
                    nodesize = 5, 
                    importance = T)
yhat <- predict(fit_full,newdata = dta2,type = "prob")[,2]
pROC::auc(dta2$Word_C,yhat)
Area under the curve: 1
varImpPlot(fit_full)

fit_full$importance[order(-fit_full$importance[,3]),c(3,4)] # CE
                    MeanDecreaseAccuracy MeanDecreaseGini
Channel_8_Channel_5         0.0073105149       0.27508190
Channel_4_Channel_1         0.0055728360       0.24422345
Channel_5_Channel_4         0.0043251927       0.23778901
Channel_2_Channel_1         0.0040151260       0.22433021
Channel_5_Channel_1         0.0033328620       0.24011573
Channel_3_Channel_1         0.0029110475       0.22947003
Channel_6_Channel_3         0.0027515834       0.20264659
Channel_7_Channel_5         0.0024714176       0.22091118
Channel_7_Channel_1         0.0014196826       0.22310558
Channel_7_Channel_2         0.0013452420       0.19768204
Channel_5_Channel_3         0.0008443240       0.21038460
Channel_4_Channel_2         0.0006896350       0.19388925
Channel_6_Channel_2         0.0005462532       0.19209518
Channel_8_Channel_7         0.0001372718       0.17969180
Channel_7_Channel_3        -0.0003314785       0.17620762
Channel_8_Channel_6        -0.0003889560       0.17276604
Channel_3_Channel_2        -0.0007186635       0.17748355
Channel_5_Channel_2        -0.0007367849       0.17548471
Channel_7_Channel_6        -0.0008509356       0.17080864
Channel_6_Channel_1        -0.0008990143       0.18681772
Channel_7_Channel_4        -0.0009184747       0.17741840
Channel_8_Channel_4        -0.0012022443       0.16805933
Sex                        -0.0012152033       0.04481467
Age                        -0.0012846252       0.16166101
Channel_4_Channel_3        -0.0018157501       0.16565388
Channel_8_Channel_2        -0.0020604720       0.16526608
Channel_6_Channel_4        -0.0022499477       0.16206524
Channel_8_Channel_1        -0.0023023254       0.15423285
Channel_8_Channel_3        -0.0024979483       0.16023286
Channel_6_Channel_5        -0.0046473410       0.14029381
fit_full$importance[order(-fit_full$importance[,4]),c(3,4)] # Gini
                    MeanDecreaseAccuracy MeanDecreaseGini
Channel_8_Channel_5         0.0073105149       0.27508190
Channel_4_Channel_1         0.0055728360       0.24422345
Channel_5_Channel_1         0.0033328620       0.24011573
Channel_5_Channel_4         0.0043251927       0.23778901
Channel_3_Channel_1         0.0029110475       0.22947003
Channel_2_Channel_1         0.0040151260       0.22433021
Channel_7_Channel_1         0.0014196826       0.22310558
Channel_7_Channel_5         0.0024714176       0.22091118
Channel_5_Channel_3         0.0008443240       0.21038460
Channel_6_Channel_3         0.0027515834       0.20264659
Channel_7_Channel_2         0.0013452420       0.19768204
Channel_4_Channel_2         0.0006896350       0.19388925
Channel_6_Channel_2         0.0005462532       0.19209518
Channel_6_Channel_1        -0.0008990143       0.18681772
Channel_8_Channel_7         0.0001372718       0.17969180
Channel_3_Channel_2        -0.0007186635       0.17748355
Channel_7_Channel_4        -0.0009184747       0.17741840
Channel_7_Channel_3        -0.0003314785       0.17620762
Channel_5_Channel_2        -0.0007367849       0.17548471
Channel_8_Channel_6        -0.0003889560       0.17276604
Channel_7_Channel_6        -0.0008509356       0.17080864
Channel_8_Channel_4        -0.0012022443       0.16805933
Channel_4_Channel_3        -0.0018157501       0.16565388
Channel_8_Channel_2        -0.0020604720       0.16526608
Channel_6_Channel_4        -0.0022499477       0.16206524
Age                        -0.0012846252       0.16166101
Channel_8_Channel_3        -0.0024979483       0.16023286
Channel_8_Channel_1        -0.0023023254       0.15423285
Channel_6_Channel_5        -0.0046473410       0.14029381
Sex                        -0.0012152033       0.04481467

Tuning

## 6 variables select from CE and Gini
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+
## Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1    ## 0.837975


## 5 variables
## Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1  # 0.8116 
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1  # 0.8454 ##
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1  # 0.830325
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_3_Channel_1 
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1  # 0.826825
## Channel_8_Channel_5+Channel_4_Channel_1+Channel_5_Channel_4+Channel_5_Channel_1+Channel_3_Channel_1  # 0.823525


## 4 variables
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1 #  0.804125
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+Channel_3_Channel_1 #  0.812475
## Channel_8_Channel_5+Channel_5_Channel_4+Channel_5_Channel_1+Channel_3_Channel_1 #  0.829
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 #  0.838375 ## 
## Channel_5_Channel_4+Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1 #  0.80765


## 3 variables
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_5_Channel_1 ##  0.787375
## Channel_8_Channel_5+Channel_2_Channel_1+Channel_3_Channel_1 ##  0.8106
## Channel_8_Channel_5+Channel_5_Channel_1+Channel_3_Channel_1 ##  0.810325
## Channel_2_Channel_1+Channel_5_Channel_1+Channel_3_Channel_1

## 2 variables
## Channel_8_Channel_5+Channel_2_Channel_1 ## 0.778325
## Channel_8_Channel_5+Channel_3_Channel_1 ## 0.771925
## Channel_2_Channel_1+Channel_3_Channel_1 ## 0.75915

RF

Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+
Channel_5_Channel_1+Channel_3_Channel_1
auc_mat_rf <- matrix(NA,1000,1)
for (k in 1:1000){
      set.seed(k)
      idc_train <- c(sample(which(dta2$Word_C == "H"), 5),sample(which(dta2$Word_C == "L"), 5))
      idc_test <- -idc_train
      fit <- randomForest(Word_C~ Channel_8_Channel_5+Channel_5_Channel_4+Channel_2_Channel_1+
                            Channel_5_Channel_1+Channel_3_Channel_1,
                          data = dta2[idc_train,],
                          mtry = 1,
                          ntree = 10001,
                          nodesize = 5, importance = F)
      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
      auc_mat_rf[k,] <- pROC::auc(dta2[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf)
[1] 0.8454
quantile(auc_mat_rf,probs = c(0.025,0.975))
 2.5% 97.5% 
  0.6   1.0 

LM : Ch8_Ch5

mean(auc_mat0) 
[1] 0.775975
quantile(auc_mat0,probs = c(0.025,0.975))
 2.5% 97.5% 
 0.55  1.00 

Comparsion

p <- melt(auc_mat_rf)
p$var <- "RF:5"
p2 <- melt(auc_mat0)
p2$var <- "LM:1" 
p3 <- rbind(p[,3:4],p2[,3:4])
Fig1 <- ggplot(p3,aes(value,fill = var, col =var))+
  geom_density(alpha = 0.3)+
  theme_bw()+
  xlab("AUC")+
  scale_fill_manual(values = c("blue","orange"),name = "Model")+
  scale_colour_manual(values = c("blue","orange"),name = "Model")
Fig1+geom_vline(xintercept =  c(0.55,mean(auc_mat0),1.003),col = "blue",size  = 1.5)+
  geom_vline(xintercept =  c(0.6,mean(auc_mat_rf),1.0),col = "orange",size = 1.2)

ggplotly(Fig1)

Lm : only Ch8_Ch5, auc = 0.78

RF : 5 variables auc = 0.85


Variable selection for the best model

1. best from overall cv

2. best from 1 v.s 1 rf

3. best from lm