The packages and functions that were used for the demonstration are shown below.
library(randomForest);library(leaps);library(pROC);library(ggplot2);
library(reshape2);library(plotly);library(ltm);
library(dplyr);library(plyr)
library(bestglm)
predict.regsubsets <- function (object ,newdata ,id ,...) {
    form=as.formula (object$call [[2]])
    mat=model.matrix (form ,newdata )
    coefi =coef(object ,id=id)
    xvars =names (coefi )
    mat[,xvars ]%*% coefi
}

Data

Subject 2 is an outlier, so we deleted it.
The following analysis have two sessions.
Session 1 are regression problem, so we use the noraml Word variable for the analysis.
However, because of the lack of stablity on regression problem,
so we create a new factor variable : “Word_c”.
The subjects whose “word” scores were higher than median(word) will be classified as “H”,
and others will be “L”.
dta1 was used for regression problem. (Word)
dta2 was used for classifcation problem. (Word_c)
The analysis will use two models : Linear Model (LM) and RandomForest (RF).
dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")[-12,c(2,10:39)]
dta1 <- dta
dta$Word_C <-as.factor(ifelse(dta$Word > median(dta$Word), "H","L"))
head(dta[,c(1:4,32)])
  Word Sex   Age Channel_2_Channel_1 Word_C
1  125   M 10.41           0.2485940      H
2  102   M  9.95           0.7715194      L
3   57   M 10.22           0.8711941      L
4   88   M  9.68           0.2212408      L
5  114   F 10.35           0.3854612      H
6   74   F 10.37           0.7481600      L
dta2 <- dta[,-1]
rownames(dta2) <- 1:nrow(dta2)
dta1 <- dta1[,c(2:31,1)]

dta1 : regression

dim(dta1)
[1] 19 31
str(dta1)
'data.frame':   19 obs. of  31 variables:
 $ Sex                : Factor w/ 2 levels "F","M": 2 2 2 2 1 1 2 2 2 1 ...
 $ Age                : num  10.41 9.95 10.22 9.68 10.35 ...
 $ Channel_2_Channel_1: num  0.249 0.772 0.871 0.221 0.385 ...
 $ Channel_3_Channel_1: num  0.4629 -0.2038 0.2337 0.0604 0.2966 ...
 $ Channel_3_Channel_2: num  0.2118 -0.2993 0.2036 0.0802 0.1537 ...
 $ Channel_4_Channel_1: num  0.704 -0.236 0.132 -0.129 0.444 ...
 $ Channel_4_Channel_2: num  -0.0493 -0.4176 0.0857 0.0466 0.0346 ...
 $ Channel_4_Channel_3: num  0.46 0.763 0.316 0.2 0.363 ...
 $ Channel_5_Channel_1: num  0.701 0.772 0.844 -0.101 0.646 ...
 $ Channel_5_Channel_2: num  0.204 0.809 0.878 0.392 0.52 ...
 $ Channel_5_Channel_3: num  0.514 -0.4901 0.284 -0.0392 0.384 ...
 $ Channel_5_Channel_4: num  0.411 -0.404 0.265 -0.238 0.399 ...
 $ Channel_6_Channel_1: num  0.41166 0.61328 0.67807 0.00797 0.26021 ...
 $ Channel_6_Channel_2: num  0.4969 0.8346 0.875 -0.0943 0.5515 ...
 $ Channel_6_Channel_3: num  -0.1512 -0.4591 0.2282 -0.0503 -0.0248 ...
 $ Channel_6_Channel_4: num  0.1177 -0.4061 0.3255 -0.1352 0.0864 ...
 $ Channel_6_Channel_5: num  0.442 0.948 0.874 -0.114 0.588 ...
 $ Channel_7_Channel_1: num  0.70067 -0.18438 -0.00831 0.17769 0.13275 ...
 $ Channel_7_Channel_2: num  0.1746 -0.5087 -0.0524 -0.0345 -0.2864 ...
 $ Channel_7_Channel_3: num  0.6 0.534 0.505 0.058 0.27 ...
 $ Channel_7_Channel_4: num  0.8218 0.6002 0.44 0.0555 0.8157 ...
 $ Channel_7_Channel_5: num  0.6179 -0.2584 0.2951 0.0499 0.0664 ...
 $ Channel_7_Channel_6: num  0.272 -0.354 0.107 -0.208 -0.252 ...
 $ Channel_8_Channel_1: num  0.3583 -0.2319 -0.0919 0.4151 0.3768 ...
 $ Channel_8_Channel_2: num  -0.141 -0.36118 -0.00897 0.0775 0.0612 ...
 $ Channel_8_Channel_3: num  0.296 0.2906 0.2069 -0.064 0.0494 ...
 $ Channel_8_Channel_4: num  0.679 0.631 0.748 0.331 0.706 ...
 $ Channel_8_Channel_5: num  0.333 -0.205 0.167 -0.34 0.377 ...
 $ Channel_8_Channel_6: num  -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
 $ Channel_8_Channel_7: num  0.72 0.573 0.506 -0.108 0.523 ...
 $ Word               : int  125 102 57 88 114 74 30 112 69 99 ...

dta2 : Classification

dim(dta2)
[1] 19 31
str(dta2)
'data.frame':   19 obs. of  31 variables:
 $ Sex                : Factor w/ 2 levels "F","M": 2 2 2 2 1 1 2 2 2 1 ...
 $ Age                : num  10.41 9.95 10.22 9.68 10.35 ...
 $ Channel_2_Channel_1: num  0.249 0.772 0.871 0.221 0.385 ...
 $ Channel_3_Channel_1: num  0.4629 -0.2038 0.2337 0.0604 0.2966 ...
 $ Channel_3_Channel_2: num  0.2118 -0.2993 0.2036 0.0802 0.1537 ...
 $ Channel_4_Channel_1: num  0.704 -0.236 0.132 -0.129 0.444 ...
 $ Channel_4_Channel_2: num  -0.0493 -0.4176 0.0857 0.0466 0.0346 ...
 $ Channel_4_Channel_3: num  0.46 0.763 0.316 0.2 0.363 ...
 $ Channel_5_Channel_1: num  0.701 0.772 0.844 -0.101 0.646 ...
 $ Channel_5_Channel_2: num  0.204 0.809 0.878 0.392 0.52 ...
 $ Channel_5_Channel_3: num  0.514 -0.4901 0.284 -0.0392 0.384 ...
 $ Channel_5_Channel_4: num  0.411 -0.404 0.265 -0.238 0.399 ...
 $ Channel_6_Channel_1: num  0.41166 0.61328 0.67807 0.00797 0.26021 ...
 $ Channel_6_Channel_2: num  0.4969 0.8346 0.875 -0.0943 0.5515 ...
 $ Channel_6_Channel_3: num  -0.1512 -0.4591 0.2282 -0.0503 -0.0248 ...
 $ Channel_6_Channel_4: num  0.1177 -0.4061 0.3255 -0.1352 0.0864 ...
 $ Channel_6_Channel_5: num  0.442 0.948 0.874 -0.114 0.588 ...
 $ Channel_7_Channel_1: num  0.70067 -0.18438 -0.00831 0.17769 0.13275 ...
 $ Channel_7_Channel_2: num  0.1746 -0.5087 -0.0524 -0.0345 -0.2864 ...
 $ Channel_7_Channel_3: num  0.6 0.534 0.505 0.058 0.27 ...
 $ Channel_7_Channel_4: num  0.8218 0.6002 0.44 0.0555 0.8157 ...
 $ Channel_7_Channel_5: num  0.6179 -0.2584 0.2951 0.0499 0.0664 ...
 $ Channel_7_Channel_6: num  0.272 -0.354 0.107 -0.208 -0.252 ...
 $ Channel_8_Channel_1: num  0.3583 -0.2319 -0.0919 0.4151 0.3768 ...
 $ Channel_8_Channel_2: num  -0.141 -0.36118 -0.00897 0.0775 0.0612 ...
 $ Channel_8_Channel_3: num  0.296 0.2906 0.2069 -0.064 0.0494 ...
 $ Channel_8_Channel_4: num  0.679 0.631 0.748 0.331 0.706 ...
 $ Channel_8_Channel_5: num  0.333 -0.205 0.167 -0.34 0.377 ...
 $ Channel_8_Channel_6: num  -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
 $ Channel_8_Channel_7: num  0.72 0.573 0.506 -0.108 0.523 ...
 $ Word_C             : Factor w/ 2 levels "H","L": 1 2 2 2 1 2 2 1 2 2 ...

Regression Problem : dta1

Feature Selection

First of all, we do the classical linear model for all variables.
Each model will contain a X variable and a Y variable. (Y~X1)
We report the R-squared, adjusted-R-Squared and p value from each variable.
fit_lm <- list()
R2_lm <- data.frame(R2 = NA, variable = NA)
aR2_lm <- data.frame(aR2 = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
for (i in 1:30){
        fit_lm[[i]] <- summary(fit <- lm(Word~.,
                                          data = dta1[,c(i,31)]))
        names(fit_lm)[i] <- colnames(dta1)[i]
        yhat <- predict(fit,data = dta1[,c(i,31)],type = "response")
        R2_lm <- rbind(R2_lm,
                        data.frame(R2 = fit_lm[[i]]$r.squared,
                                   variable = colnames(dta1)[i]))
        aR2_lm <- rbind(aR2_lm,
                        data.frame(aR2 = fit_lm[[i]]$adj.r.squared,
                                   variable = colnames(dta1)[i]))
        p_lm <- rbind(p_lm, data.frame(pvalue = fit_lm[[i]]$coefficient[2,4],
                                       variable = colnames(dta1)[i]))
}

Feature Selection

Following code provided the final data that was used for analysis
We select the variable whose R-Squared > 0.07, so the final data have 6 variable.
vs <- full_join(p_lm,R2_lm)
vs <- full_join(vs,aR2_lm)
vs <- na.omit(vs[order(vs$pvalue),c(2,1,3,4)])
head(vs,10)
              variable     pvalue         R2          aR2
30 Channel_8_Channel_6 0.04796165 0.21085134  0.164430832
29 Channel_8_Channel_5 0.14345477 0.12157590  0.069903893
31 Channel_8_Channel_7 0.24141640 0.07975418  0.025622076
13 Channel_5_Channel_4 0.24694482 0.07798054  0.023744103
12 Channel_5_Channel_3 0.26345264 0.07294288  0.018410111
9  Channel_4_Channel_3 0.26976994 0.07111060  0.016470050
28 Channel_8_Channel_4 0.28794471 0.06610719  0.011172321
23 Channel_7_Channel_5 0.32297046 0.05745242  0.002008439
4  Channel_2_Channel_1 0.36667231 0.04816139 -0.007829121
7  Channel_4_Channel_1 0.37525910 0.04650344 -0.009584594
dta1_vs<- dta1[,c("Word",filter(vs, R2>0.07)$variable)]
str(dta1_vs)
'data.frame':   19 obs. of  7 variables:
 $ Word               : int  125 102 57 88 114 74 30 112 69 99 ...
 $ Channel_8_Channel_6: num  -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
 $ Channel_8_Channel_5: num  0.333 -0.205 0.167 -0.34 0.377 ...
 $ Channel_8_Channel_7: num  0.72 0.573 0.506 -0.108 0.523 ...
 $ Channel_5_Channel_4: num  0.411 -0.404 0.265 -0.238 0.399 ...
 $ Channel_5_Channel_3: num  0.514 -0.4901 0.284 -0.0392 0.384 ...
 $ Channel_4_Channel_3: num  0.46 0.763 0.316 0.2 0.363 ...

Linear Model

We used best subset selection in order to select the best model of current data in
linear context. The limition of numers of variables was 6 (all variables).
The best subset method selected the 1 variable model.
Channel8_Channel6 is the most important variable in this model.
However, the cross validation R-Squared was < 0, so we have to reconsider the reliablity
of the present setting.
nmax = 6 
bs_r2 <- matrix(NA,1000,nmax,dimnames =list(NULL , paste (1:nmax) ))
for (k in 1:1000){
        set.seed(k)
        idc_test <- sample(1:nrow(dta1_vs),6)
        idc_train <- -idc_test
        best.fit <- regsubsets(Word~. ,data=dta1_vs[idc_train,],
                               method ="exhaustive",nvmax= nmax)
        for(i in 1:nmax) {
                yhat <- predict(best.fit,newdata = dta1_vs[idc_test,],
                                id=i,type = "response")
                y <- dta1_vs[idc_test,]$Word
                bs_r2 [k,i] = 1 - mean((y - yhat)^2)/var(y)
        }
}
sort(apply(bs_r2 ,2, mean))
        4         6         5         3         2         1 
-2.302816 -2.286349 -2.227113 -1.985276 -1.398768 -1.084220 
best.fit <- regsubsets(Word~. ,data=dta1_vs,method ="exhaustive",nvmax= nmax)
coef(best.fit,1)
        (Intercept) Channel_8_Channel_6 
          105.07076           -53.16316 

Linear Model

The following codes used 1000 replications to gain the cross validation R-Squared
on the best LM.
rs_mat_lm <- matrix(NA,1000,1)
for (k in 1:1000){
        set.seed(k)
        idc_test <- sample(1:nrow(dta1_vs),6)
        idc_train <- -idc_test
        fit <- lm(Word~Channel_8_Channel_6,data = dta1_vs[idc_train,])
        yhat <- predict(fit,newdata = dta1_vs[idc_test,],type = "response")
        y <- dta1_vs[idc_test,]$Word
        rs_mat_lm[k,] = 1 - mean((y - yhat)^2)/var(y)
}
mean(rs_mat_lm)
[1] -0.6092389
quantile(rs_mat_lm,probs = c(0.025,0.975))
      2.5%      97.5% 
-5.5763401  0.4659623 

Linear Model

Because of the low R-Squared, the present study used “current variables” (Ch8_Ch6)
to predict Word_c (Classification Problem).
dta1_vs_c <- dta1_vs
dta1_vs_c$Word_c <- as.factor(ifelse(dta1_vs$Word > median(dta1_vs$Word), "H","L"))
dta1_vs_c <- dta1_vs_c[,-1]
auc_mat_glm_c <- matrix(NA,1000,1)
for (k in 1:1000){
        set.seed(k)
        idc_test <- c(sample(which(dta1_vs_c$Word_c == "H"), 3),
                       sample(which(dta1_vs_c$Word_c == "L"), 3))
        idc_train <- -idc_test
        fit <- glm(Word_c~Channel_8_Channel_6,data = dta1_vs_c[idc_train,],family = "binomial")
        yhat <- predict(fit,newdata = dta1_vs_c[idc_test,],type = "response")
        y <- dta1_vs_c[idc_test,]$Word
        auc_mat_glm_c[k,] = pROC::auc(y,yhat)
}
mean( auc_mat_glm_c)
[1] 0.6695556
quantile( auc_mat_glm_c,probs = c(0.025,0.975))
     2.5%     97.5% 
0.4444444 1.0000000 

RF

In RF model, we used full 6 variables, default metry and default nodesize to
construct the RF model.
R-Squared was not really good as well.
rs_mat_rf <- matrix(NA,1000,1)
for (k in 1:1000){
        set.seed(k)
        idc_test <- sample(1:nrow(dta1_vs),6)
        idc_train <- -idc_test
        fit <- randomForest(Word~. ,
                            data = dta1_vs[idc_train,],
                            ntree = 10001,importance = F)
        yhat <- predict(fit,newdata = dta1_vs[idc_test,],type = "response")
        y <- dta1_vs[idc_test,]$Word
        rs_mat_rf[k,] = 1 - mean((y - yhat)^2)/var(y)
}
mean(rs_mat_rf)
[1] -0.5409942
quantile(rs_mat_rf,probs = c(0.025,0.975))
      2.5%      97.5% 
-4.0545783  0.3578068 

RF

Because of the low R-Squared, the present study used “current variables” (6 variables)
to predict Word_c (Classification Problem).
Default mrty and nodesize was used as well.
dta1_vs_c <- dta1_vs
dta1_vs_c$Word_c <- as.factor(ifelse(dta1_vs$Word > median(dta1_vs$Word), "H","L"))
dta1_vs_c <- dta1_vs_c[,-1]
auc_mat_rf_c <- matrix(NA,1000,1)
for (k in 1:1000){
        set.seed(k)
        idc_test <- c(sample(which(dta1_vs_c$Word_c == "H"), 3),
                       sample(which(dta1_vs_c$Word_c == "L"), 3))
        idc_train <- -idc_test
        fit <- randomForest(Word_c~. ,
                            data = dta1_vs_c[idc_train,],
                            ntree = 10001,importance = F)
        yhat <- predict(fit,newdata = dta1_vs_c[idc_test,],type = "prob")[,2]
        y <- dta1_vs_c[idc_test,]$Word
        auc_mat_rf_c[k,] = pROC::auc(y,yhat)
}
mean(auc_mat_rf_c)
[1] 0.7144444
quantile(auc_mat_rf_c,probs = c(0.025,0.975))
     2.5%     97.5% 
0.4444444 1.0000000 

Comparison

pdta <- data.frame(RS_LM = rs_mat_lm,AUC_LM = auc_mat_glm_c,
                   RS_RF = rs_mat_rf, AUC_RF = auc_mat_rf_c)
apply(pdta, 2, mean)
     RS_LM     AUC_LM      RS_RF     AUC_RF 
-0.6092389  0.6695556 -0.5409942  0.7144444 
apply(pdta, 2 , quantile,probs = c(0.025,0.975))
           RS_LM    AUC_LM      RS_RF    AUC_RF
2.5%  -5.5763401 0.4444444 -4.0545783 0.4444444
97.5%  0.4659623 1.0000000  0.3578068 1.0000000
pdta <- melt(pdta)
ggplot(pdta, aes(value))+
        geom_density()+
        facet_wrap(~variable,scales = "free")+
        theme_bw()

Classification : dta2

Because of the low reliablity of regression problem,
the present study used classification approach to gain insight into this data.

Feature selection

We provided feature selection on classification problem as well.
1 by 1 model that like the method is session1 was also used here.
The index of p_value and AUC were used for selecting variables.
fit_lm <- list()
auc_lm <- data.frame(AUC = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
for (i in 1:30){
        fit_lm[[i]] <- summary(fit <- glm(Word_C~.,
                                          data = dta2[,c(i,31)],family = "binomial"))
        names(fit_lm)[i] <- colnames(dta2)[i]
        yhat <- predict(fit,data = dta2[,c(i,31)],type = "response")
        auc_lm <- rbind(auc_lm,
                        data.frame(AUC = pROC::auc(dta2$Word_C,yhat),
                                   variable = colnames(dta2)[i]))
        p_lm <- rbind(p_lm, data.frame(pvalue = fit_lm[[i]]$coefficient[2,4], variable = colnames(dta2)[i]))
}

Feature selection

We select the variables that AUC > 0.75 on the current setting.
The highest 3 variables were chosen.
vs <- full_join(p_lm,auc_lm)
head(na.omit(vs[order(-vs$AUC),c(2,1,3)]),10)
              variable     pvalue       AUC
7  Channel_4_Channel_1 0.05937883 0.7777778
29 Channel_8_Channel_5 0.04705111 0.7777778
13 Channel_5_Channel_4 0.09553314 0.7555556
5  Channel_3_Channel_1 0.11573316 0.7444444
19 Channel_7_Channel_1 0.10024331 0.7444444
12 Channel_5_Channel_3 0.09629423 0.7333333
8  Channel_4_Channel_2 0.08392544 0.7222222
4  Channel_2_Channel_1 0.18213073 0.7111111
23 Channel_7_Channel_5 0.14191010 0.7111111
20 Channel_7_Channel_2 0.21802741 0.6888889
dta2_vs<- dta2[,c("Word_C",filter(vs,AUC > 0.75)$variable)] 
head(dta2_vs)
  Word_C Channel_4_Channel_1 Channel_5_Channel_4 Channel_8_Channel_5
1      H           0.7039091           0.4112742           0.3329433
2      L          -0.2355694          -0.4043635          -0.2051132
3      L           0.1320238           0.2647561           0.1667268
4      L          -0.1288285          -0.2381991          -0.3403798
5      H           0.4443495           0.3985602           0.3773816
6      L           0.3856371           0.4010637           0.4028124

LM

We performed the logistic model that was chosen from best subset selection method.
the package “bestglm” was used due to the lack of glm model seletion in “leaps” package.
However, we need to reconsider the code below.
#bs_auc_i <- matrix(NA,10000,3,dimnames =list(NULL , paste (1:3)))
#for (k in 1:10000){
#        set.seed(k)
#        print(k)
#        idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
#                       sample(which(dta2_vs$Word_C == "L"), 3))
#        idc_train <- -idc_test
#        best.fit <- bestglm(dta2_vs[idc_train,c(2:4,1)],
#                            IC="AIC",method ="exhaustive",family=binomial,nvmax = 3)
#        yhat <- predict(best.fit$BestModel,newdata = dta2_vs[idc_test,c(2:4,1)],
#                        type = "response")
#        n = nrow(as.data.frame(best.fit$BestModel$coefficients)) - 1
#        bs_auc_i[k,n] = pROC::auc(dta2_vs[idc_test,]$Word_C,as.numeric(yhat))
#}
#sort(apply(bs_auc_i,2, mean,na.rm =T))

LM

After Best subset selection, the best model contain 1 variable.
The selected variable was Ch8_Ch5.
best.fit <- bestglm(dta2_vs[,c(2:4,1)],
                    IC="AIC",method ="exhaustive",family=binomial,nvmax = 1)##???)
Morgan-Tatar search since family is non-gaussian.
summary(best.fit$BestModel)

Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7242  -0.9143   0.3820   0.8524   2.0069  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)  
(Intercept)           0.8311     0.6805   1.221   0.2220  
Channel_8_Channel_5  -3.8347     1.9310  -1.986   0.0471 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 26.287  on 18  degrees of freedom
Residual deviance: 20.690  on 17  degrees of freedom
AIC: 24.69

Number of Fisher Scoring iterations: 4

LM

We provided 1000 replications to gain the ross validation AUC on present data fitting.
auc_mat_lm <- matrix(NA,1000,1)
for (k in 1:1000){
        set.seed(k)
        idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                       sample(which(dta2_vs$Word_C == "L"), 3))
        idc_train <- -idc_test
        # Channel_8_Channel_5
        fit <- glm(Word_C~Channel_8_Channel_5,
                    data = dta2_vs[idc_train,],family = "binomial")
        yhat <- predict(fit,newdata = dta2_vs[idc_test,],type = "response")
        auc_mat_lm[k,] <- pROC::auc(dta2_vs[idc_test,]$Word_C,yhat)
}
mean(auc_mat_lm)  # 0.791
[1] 0.7911667
quantile(auc_mat_lm,probs = c(0.025,0.975)) # 0.5555556 1.0000000 
     2.5%     97.5% 
0.5555556 1.0000000 

RF

In RF model, we used full 3 variables, default metry and default nodesize to
construct the RF model.
AUC was a little higher than GLM.
auc_mat_rf_3 <- matrix(NA,1000,1)
for (k in 1:1000){
      set.seed(k)
      idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                     sample(which(dta2_vs$Word_C == "L"), 3))
      idc_train <- -idc_test
      fit <- randomForest(Word_C~. ,
                          data = dta2_vs[idc_train,],
                          ntree = 10001,importance = F)
      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
      auc_mat_rf_3[k,] <- pROC::auc(dta2_vs[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf_3) 
[1] 0.7997222
quantile(auc_mat_rf_3,probs = c(0.025,0.975))
 2.5% 97.5% 
  0.5   1.0 

RF

If we tune the RF model,
we could gain a much better AUC using RF model (mtry = 1, nodesize = 7)
auc_mat_rf_3_tune <- matrix(NA,1000,1)
for (k in 1:1000){
      set.seed(k)
      idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                     sample(which(dta2_vs$Word_C == "L"), 3))
      idc_train <- -idc_test
      fit <- randomForest(Word_C~. ,
                          data = dta2_vs[idc_train,],
                          mtry = 1 ,nodesize = 7,
                          ntree = 10001,importance = F)
      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
      auc_mat_rf_3_tune[k,] <- pROC::auc(dta2_vs[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf_3_tune) 
[1] 0.8253889
quantile(auc_mat_rf_3_tune,probs = c(0.025,0.975))
     2.5%     97.5% 
0.5555556 1.0000000 
# 9 var = 0.729
# 7 var = 0.726
# 6 var = 0.748
# 5 var = 0.779 #### mtry = 1 ,nodesize = 4 ; 0.797
# 3 var = 0.799 #### mtry = 1 ,nodesize = 7 ; 0.825
# 2 var =  0.7333889
I also try the top 5 variables in GLM and RF model
GLM : the results was as same as 3 variables, because it would only select Ch8_Ch5
RF : demonstrate below

RF_2

AUC > 0.74 : 5 variables
Default metry and nodesize
head(na.omit(cbind(p_lm[order(p_lm$pvalue),c(2,1)],
                        vs <-  auc_lm[order(-auc_lm$AUC),c(2,1)])),10)
              variable     pvalue            variable       AUC
29 Channel_8_Channel_5 0.04705111 Channel_4_Channel_1 0.7777778
7  Channel_4_Channel_1 0.05937883 Channel_8_Channel_5 0.7777778
8  Channel_4_Channel_2 0.08392544 Channel_5_Channel_4 0.7555556
13 Channel_5_Channel_4 0.09553314 Channel_3_Channel_1 0.7444444
12 Channel_5_Channel_3 0.09629423 Channel_7_Channel_1 0.7444444
19 Channel_7_Channel_1 0.10024331 Channel_5_Channel_3 0.7333333
5  Channel_3_Channel_1 0.11573316 Channel_4_Channel_2 0.7222222
23 Channel_7_Channel_5 0.14191010 Channel_2_Channel_1 0.7111111
6  Channel_3_Channel_2 0.16463977 Channel_7_Channel_5 0.7111111
4  Channel_2_Channel_1 0.18213073 Channel_7_Channel_2 0.6888889
dta2_vs<- dta2[,c("Word_C",filter(vs,AUC > 0.74)$variable)] 
auc_mat_rf_5 <- matrix(NA,1000,1)
for (k in 1:1000){
      set.seed(k)
      idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                     sample(which(dta2_vs$Word_C == "L"), 3))
      idc_train <- -idc_test
      fit <- randomForest(Word_C~. ,
                          data = dta2_vs[idc_train,],
                          ntree = 10001,importance = F)
      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
      auc_mat_rf_5[k,] <- pROC::auc(dta2_vs[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf_5) 
[1] 0.7790556
quantile(auc_mat_rf_5,probs = c(0.025,0.975))
     2.5%     97.5% 
0.4444444 1.0000000 

RF_2

AUC > 0.74 : 5 variables
mtry = 1 ,nodesize = 4
auc_mat_rf_5_tune <- matrix(NA,1000,1)
for (k in 1:1000){
      set.seed(k)
      idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                     sample(which(dta2_vs$Word_C == "L"), 3))
      idc_train <- -idc_test
      fit <- randomForest(Word_C~. ,
                          data = dta2_vs[idc_train,],
                          mtry = 1 ,nodesize = 4,
                          ntree = 10001,importance = F)
      yhat <- predict(fit,newdata = dta2[idc_test,],type = "prob")[,2]
      auc_mat_rf_5_tune[k,] <- pROC::auc(dta2_vs[idc_test,]$Word_C,yhat)
}
mean(auc_mat_rf_5_tune) 
[1] 0.797
quantile(auc_mat_rf_5_tune,probs = c(0.025,0.975))
     2.5%     97.5% 
0.4444444 1.0000000 

Comparsion

Now we could compare 5 models
GLM
RF : 3 variable, default
RF : 3 variable, best
RF : 5 variable, default
RF : 5 variable, best
plotdta <- data.frame(GLM_1 = auc_mat_lm,
                      RF_3 = auc_mat_rf_3,
                      RF_3t = auc_mat_rf_3_tune,
                      RF_5 = auc_mat_rf_5,
                      RF_5t = auc_mat_rf_5_tune)
apply(plotdta,2,mean)
    GLM_1      RF_3     RF_3t      RF_5     RF_5t 
0.7911667 0.7997222 0.8253889 0.7790556 0.7970000 
apply(plotdta,2,quantile,probs = c(0.025,0.975))
          GLM_1 RF_3     RF_3t      RF_5     RF_5t
2.5%  0.5555556  0.5 0.5555556 0.4444444 0.4444444
97.5% 1.0000000  1.0 1.0000000 1.0000000 1.0000000

Comparsion

p <- melt(plotdta)
colnames(p)[1:2] <- c("Model","AUC")
Fig1 <- ggplot(p,aes(AUC,fill = Model, col = Model))+
        geom_density(alpha = 0.3)+
        theme_bw()+
        xlab("AUC")
mdta <- ddply(p, "Model", summarise, average=mean(AUC))
prodta <- ddply(p,"Model",summarise, 
                Q.025 = quantile(AUC,probs = c(0.025)),
                Q.975 = quantile(AUC,probs = c(0.975)))
Fig2 <- Fig1+
        facet_grid(Model~.)+
        geom_vline(data=mdta , aes(xintercept=average),
                   linetype="dashed", size=1, colour="red")+
        geom_vline(data = prodta,aes(xintercept=Q.025),
                   linetype="dashed", size=1, colour="darkgoldenrod")+
        geom_vline(data = prodta,aes(xintercept=Q.975),
                   linetype="dashed", size=1, colour="darkgoldenrod")
Fig1 is an interactive plot. We could click the variables to gain
the best visual expereince.
ggplotly(Fig1)
Fig2

Final Model

Other information and Dependence Plot
finalfit <- randomForest(Word_C~Channel_4_Channel_1+
                                 Channel_5_Channel_4+Channel_8_Channel_5 ,
             data = dta2_vs,
             mtry = 1 ,nodesize = 7,
             ntree = 10001,importance = T,keep.forest=T)
par(mfrow = c(1, 1))
varImpPlot(finalfit)

plot(finalfit)

par(mfrow = c(1, 3))
partialPlot(finalfit, pred.data = dta2_vs, x.var = "Channel_4_Channel_1")
partialPlot(finalfit, pred.data = dta2_vs, x.var = "Channel_5_Channel_4")
partialPlot(finalfit, pred.data = dta2_vs, x.var = "Channel_8_Channel_5")

Conclusion

1. Regression Problem might be hard for current small sample data to gain a stable
cross validation R-squared
2. From Classification Problem, we could know that
GLM have a good (0.791) AUC.
Default RF_3variables have a little bit better (0.799) AUC
Best RF_3variables have the highest (0.825) AUC.
3. We could explain two cultures of the method on Resting-State Connectivity data,
and also promote or encourage NIRS users to use both GLM and
RF approach rather than single LM model to analyze their resting state data.