The packages and functions that were used for the demonstration are shown below.
library(randomForest)
library(leaps)
library(pROC)
library(ggplot2)
library(reshape2)
library(plotly)
library(ltm)
library(dplyr)
library(plyr)
library(bestglm)
predict.regsubsets <- function(object, newdata, id, ...) {
  # Rebuild the model formula from the original regsubsets() call
  form <- as.formula(object$call[[2]])
  # Design matrix for the new data
  mat <- model.matrix(form, newdata)
  # Coefficients of the id-th (i.e., id-variable) model
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
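leaps::regsubsets() provides no predict() method, so this helper rebuilds the design matrix and multiplies it by the coefficients of the id-th model. A minimal usage sketch, assuming a fitted regsubsets object fit and a data frame newdat with matching columns (both hypothetical names):
yhat <- predict(fit, newdata = newdat, id = 2) # predictions from the best 2-variable model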
Data
Subject 2 is an outlier, so we deleted it.
The following analysis has two sessions.
Session 1 is a regression problem, so we use the numeric Word variable for the analysis.
However, because the regression results lacked stability,
we also created a new factor variable, "Word_C":
subjects whose Word scores were higher than median(Word) are classified as "H",
and the others as "L".
dta1 is used for the regression problem (outcome: Word).
dta2 is used for the classification problem (outcome: Word_C).
The analysis uses two models: a linear model (LM) and a random forest (RF).
dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")[-12, c(2, 10:39)] # drop the outlier (row 12); keep Word and 30 other variables
dta1 <- dta
dta$Word_C <-as.factor(ifelse(dta$Word > median(dta$Word), "H","L"))
head(dta[,c(1:4,32)])
Word Sex Age Channel_2_Channel_1 Word_C
1 125 M 10.41 0.2485940 H
2 102 M 9.95 0.7715194 L
3 57 M 10.22 0.8711941 L
4 88 M 9.68 0.2212408 L
5 114 F 10.35 0.3854612 H
6 74 F 10.37 0.7481600 L
dta2 <- dta[, -1] # classification data: drop the numeric Word, keep Word_C
rownames(dta2) <- 1:nrow(dta2)
dta1 <- dta1[, c(2:31, 1)] # regression data: move Word to the last column
dta1 : Regression
dim(dta1)
[1] 19 31
str(dta1)
'data.frame': 19 obs. of 31 variables:
$ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 1 2 2 2 1 ...
$ Age : num 10.41 9.95 10.22 9.68 10.35 ...
$ Channel_2_Channel_1: num 0.249 0.772 0.871 0.221 0.385 ...
$ Channel_3_Channel_1: num 0.4629 -0.2038 0.2337 0.0604 0.2966 ...
$ Channel_3_Channel_2: num 0.2118 -0.2993 0.2036 0.0802 0.1537 ...
$ Channel_4_Channel_1: num 0.704 -0.236 0.132 -0.129 0.444 ...
$ Channel_4_Channel_2: num -0.0493 -0.4176 0.0857 0.0466 0.0346 ...
$ Channel_4_Channel_3: num 0.46 0.763 0.316 0.2 0.363 ...
$ Channel_5_Channel_1: num 0.701 0.772 0.844 -0.101 0.646 ...
$ Channel_5_Channel_2: num 0.204 0.809 0.878 0.392 0.52 ...
$ Channel_5_Channel_3: num 0.514 -0.4901 0.284 -0.0392 0.384 ...
$ Channel_5_Channel_4: num 0.411 -0.404 0.265 -0.238 0.399 ...
$ Channel_6_Channel_1: num 0.41166 0.61328 0.67807 0.00797 0.26021 ...
$ Channel_6_Channel_2: num 0.4969 0.8346 0.875 -0.0943 0.5515 ...
$ Channel_6_Channel_3: num -0.1512 -0.4591 0.2282 -0.0503 -0.0248 ...
$ Channel_6_Channel_4: num 0.1177 -0.4061 0.3255 -0.1352 0.0864 ...
$ Channel_6_Channel_5: num 0.442 0.948 0.874 -0.114 0.588 ...
$ Channel_7_Channel_1: num 0.70067 -0.18438 -0.00831 0.17769 0.13275 ...
$ Channel_7_Channel_2: num 0.1746 -0.5087 -0.0524 -0.0345 -0.2864 ...
$ Channel_7_Channel_3: num 0.6 0.534 0.505 0.058 0.27 ...
$ Channel_7_Channel_4: num 0.8218 0.6002 0.44 0.0555 0.8157 ...
$ Channel_7_Channel_5: num 0.6179 -0.2584 0.2951 0.0499 0.0664 ...
$ Channel_7_Channel_6: num 0.272 -0.354 0.107 -0.208 -0.252 ...
$ Channel_8_Channel_1: num 0.3583 -0.2319 -0.0919 0.4151 0.3768 ...
$ Channel_8_Channel_2: num -0.141 -0.36118 -0.00897 0.0775 0.0612 ...
$ Channel_8_Channel_3: num 0.296 0.2906 0.2069 -0.064 0.0494 ...
$ Channel_8_Channel_4: num 0.679 0.631 0.748 0.331 0.706 ...
$ Channel_8_Channel_5: num 0.333 -0.205 0.167 -0.34 0.377 ...
$ Channel_8_Channel_6: num -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
$ Channel_8_Channel_7: num 0.72 0.573 0.506 -0.108 0.523 ...
$ Word : int 125 102 57 88 114 74 30 112 69 99 ...
dta2 : Classification
dim(dta2)
[1] 19 31
str(dta2)
'data.frame': 19 obs. of 31 variables:
$ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 1 2 2 2 1 ...
$ Age : num 10.41 9.95 10.22 9.68 10.35 ...
$ Channel_2_Channel_1: num 0.249 0.772 0.871 0.221 0.385 ...
$ Channel_3_Channel_1: num 0.4629 -0.2038 0.2337 0.0604 0.2966 ...
$ Channel_3_Channel_2: num 0.2118 -0.2993 0.2036 0.0802 0.1537 ...
$ Channel_4_Channel_1: num 0.704 -0.236 0.132 -0.129 0.444 ...
$ Channel_4_Channel_2: num -0.0493 -0.4176 0.0857 0.0466 0.0346 ...
$ Channel_4_Channel_3: num 0.46 0.763 0.316 0.2 0.363 ...
$ Channel_5_Channel_1: num 0.701 0.772 0.844 -0.101 0.646 ...
$ Channel_5_Channel_2: num 0.204 0.809 0.878 0.392 0.52 ...
$ Channel_5_Channel_3: num 0.514 -0.4901 0.284 -0.0392 0.384 ...
$ Channel_5_Channel_4: num 0.411 -0.404 0.265 -0.238 0.399 ...
$ Channel_6_Channel_1: num 0.41166 0.61328 0.67807 0.00797 0.26021 ...
$ Channel_6_Channel_2: num 0.4969 0.8346 0.875 -0.0943 0.5515 ...
$ Channel_6_Channel_3: num -0.1512 -0.4591 0.2282 -0.0503 -0.0248 ...
$ Channel_6_Channel_4: num 0.1177 -0.4061 0.3255 -0.1352 0.0864 ...
$ Channel_6_Channel_5: num 0.442 0.948 0.874 -0.114 0.588 ...
$ Channel_7_Channel_1: num 0.70067 -0.18438 -0.00831 0.17769 0.13275 ...
$ Channel_7_Channel_2: num 0.1746 -0.5087 -0.0524 -0.0345 -0.2864 ...
$ Channel_7_Channel_3: num 0.6 0.534 0.505 0.058 0.27 ...
$ Channel_7_Channel_4: num 0.8218 0.6002 0.44 0.0555 0.8157 ...
$ Channel_7_Channel_5: num 0.6179 -0.2584 0.2951 0.0499 0.0664 ...
$ Channel_7_Channel_6: num 0.272 -0.354 0.107 -0.208 -0.252 ...
$ Channel_8_Channel_1: num 0.3583 -0.2319 -0.0919 0.4151 0.3768 ...
$ Channel_8_Channel_2: num -0.141 -0.36118 -0.00897 0.0775 0.0612 ...
$ Channel_8_Channel_3: num 0.296 0.2906 0.2069 -0.064 0.0494 ...
$ Channel_8_Channel_4: num 0.679 0.631 0.748 0.331 0.706 ...
$ Channel_8_Channel_5: num 0.333 -0.205 0.167 -0.34 0.377 ...
$ Channel_8_Channel_6: num -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
$ Channel_8_Channel_7: num 0.72 0.573 0.506 -0.108 0.523 ...
$ Word_C : Factor w/ 2 levels "H","L": 1 2 2 2 1 2 2 1 2 2 ...
Regression Problem : dta1
Feature Selection
First, we fit a classical linear model for each variable separately.
Each model contains one predictor and the outcome (Word ~ X).
We report the R-squared, adjusted R-squared, and p-value for each variable.
fit_lm <- list()
R2_lm <- data.frame(R2 = NA, variable = NA)
aR2_lm <- data.frame(aR2 = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
for (i in 1:30){
  # Simple regression of Word on the i-th predictor
  fit_lm[[i]] <- summary(fit <- lm(Word ~ ., data = dta1[, c(i, 31)]))
  names(fit_lm)[i] <- colnames(dta1)[i]
  R2_lm <- rbind(R2_lm,
                 data.frame(R2 = fit_lm[[i]]$r.squared,
                            variable = colnames(dta1)[i]))
  aR2_lm <- rbind(aR2_lm,
                  data.frame(aR2 = fit_lm[[i]]$adj.r.squared,
                             variable = colnames(dta1)[i]))
  p_lm <- rbind(p_lm,
                data.frame(pvalue = fit_lm[[i]]$coefficients[2, 4],
                           variable = colnames(dta1)[i]))
}
Feature Selection
The following code produces the final data set used for the analysis.
We select the variables with R-squared > 0.07, leaving 6 variables in the final data.
vs <- full_join(p_lm, R2_lm)  # joins by "variable"
vs <- full_join(vs, aR2_lm)
vs <- na.omit(vs[order(vs$pvalue), c(2, 1, 3, 4)])  # sort by p-value, drop the NA seed rows
head(vs,10)
variable pvalue R2 aR2
30 Channel_8_Channel_6 0.04796165 0.21085134 0.164430832
29 Channel_8_Channel_5 0.14345477 0.12157590 0.069903893
31 Channel_8_Channel_7 0.24141640 0.07975418 0.025622076
13 Channel_5_Channel_4 0.24694482 0.07798054 0.023744103
12 Channel_5_Channel_3 0.26345264 0.07294288 0.018410111
9 Channel_4_Channel_3 0.26976994 0.07111060 0.016470050
28 Channel_8_Channel_4 0.28794471 0.06610719 0.011172321
23 Channel_7_Channel_5 0.32297046 0.05745242 0.002008439
4 Channel_2_Channel_1 0.36667231 0.04816139 -0.007829121
7 Channel_4_Channel_1 0.37525910 0.04650344 -0.009584594
dta1_vs <- dta1[, c("Word", filter(vs, R2 > 0.07)$variable)]
str(dta1_vs)
'data.frame': 19 obs. of 7 variables:
$ Word : int 125 102 57 88 114 74 30 112 69 99 ...
$ Channel_8_Channel_6: num -0.00803 -0.18494 0.3446 0.07539 0.19827 ...
$ Channel_8_Channel_5: num 0.333 -0.205 0.167 -0.34 0.377 ...
$ Channel_8_Channel_7: num 0.72 0.573 0.506 -0.108 0.523 ...
$ Channel_5_Channel_4: num 0.411 -0.404 0.265 -0.238 0.399 ...
$ Channel_5_Channel_3: num 0.514 -0.4901 0.284 -0.0392 0.384 ...
$ Channel_4_Channel_3: num 0.46 0.763 0.316 0.2 0.363 ...
Linear Model
We used best subset selection to choose the best linear model for the current data.
The maximum number of variables was 6 (all of them).
The best subset method selected the 1-variable model;
Channel_8_Channel_6 is the most important variable in this model.
However, the cross-validated R-squared was < 0, so we have to reconsider the reliability
of the present setting.
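Throughout this report, the cross-validated R-squared is computed as 1 - MSE_test / Var(y_test), so a value below 0 means the model predicts worse than simply guessing the test-set mean. A minimal sketch of the metric as a standalone helper (cv_r2 is a hypothetical name, not used elsewhere in this report):
cv_r2 <- function(y, yhat) 1 - mean((y - yhat)^2) / var(y)
cv_r2(y = c(50, 80, 110), yhat = c(60, 85, 100)) # ~0.92; negative when predictions are worse than mean(y)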
nmax <- 6
bs_r2 <- matrix(NA, 1000, nmax, dimnames = list(NULL, paste(1:nmax)))
for (k in 1:1000){
  set.seed(k)
  # Hold out 6 of the 19 subjects as a test set
  idc_test <- sample(1:nrow(dta1_vs), 6)
  idc_train <- -idc_test
  best.fit <- regsubsets(Word ~ ., data = dta1_vs[idc_train, ],
                         method = "exhaustive", nvmax = nmax)
  for (i in 1:nmax) {
    # Uses the predict.regsubsets() helper defined above
    yhat <- predict(best.fit, newdata = dta1_vs[idc_test, ], id = i)
    y <- dta1_vs[idc_test, ]$Word
    bs_r2[k, i] <- 1 - mean((y - yhat)^2) / var(y)
  }
}
sort(apply(bs_r2 ,2, mean))
4 6 5 3 2 1
-2.302816 -2.286349 -2.227113 -1.985276 -1.398768 -1.084220
best.fit <- regsubsets(Word ~ ., data = dta1_vs, method = "exhaustive", nvmax = nmax)
coef(best.fit,1)
(Intercept) Channel_8_Channel_6
105.07076 -53.16316
Linear Model
The following code uses 1000 replications to obtain the cross-validated R-squared
of the best LM.
rs_mat_lm <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- sample(1:nrow(dta1_vs), 6)
  idc_train <- -idc_test
  fit <- lm(Word ~ Channel_8_Channel_6, data = dta1_vs[idc_train, ])
  yhat <- predict(fit, newdata = dta1_vs[idc_test, ], type = "response")
  y <- dta1_vs[idc_test, ]$Word
  rs_mat_lm[k, ] <- 1 - mean((y - yhat)^2) / var(y)
}
mean(rs_mat_lm)
[1] -0.6092389
quantile(rs_mat_lm,probs = c(0.025,0.975))
2.5% 97.5%
-5.5763401 0.4659623
Linear Model
Because of the low R-squared, the present study next used the selected variable
(Channel_8_Channel_6) to predict the median-split class Word_c (a classification problem).
dta1_vs_c <- dta1_vs
dta1_vs_c$Word_c <- as.factor(ifelse(dta1_vs$Word > median(dta1_vs$Word), "H", "L"))
dta1_vs_c <- dta1_vs_c[, -1] # drop the numeric Word column
auc_mat_glm_c <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  # Stratified test set: 3 "H" and 3 "L" subjects
  idc_test <- c(sample(which(dta1_vs_c$Word_c == "H"), 3),
                sample(which(dta1_vs_c$Word_c == "L"), 3))
  idc_train <- -idc_test
  fit <- glm(Word_c ~ Channel_8_Channel_6, data = dta1_vs_c[idc_train, ], family = "binomial")
  yhat <- predict(fit, newdata = dta1_vs_c[idc_test, ], type = "response")
  y <- dta1_vs_c[idc_test, ]$Word_c
  auc_mat_glm_c[k, ] <- pROC::auc(y, yhat)
}
mean( auc_mat_glm_c)
[1] 0.6695556
quantile( auc_mat_glm_c,probs = c(0.025,0.975))
2.5% 97.5%
0.4444444 1.0000000
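pROC::auc() takes the factor response first and the predicted scores second, and by default auto-detects which direction yields AUC >= 0.5 (printing a message about the chosen levels and direction). A minimal sketch with hypothetical labels and scores:
pROC::auc(factor(c("H", "H", "L", "L")), c(0.9, 0.6, 0.4, 0.2)) # 1: the scores separate the two classes perfectly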
RF
For the RF model, we used all 6 variables with the default mtry and nodesize
(for regression, randomForest's defaults are mtry = floor(p/3) and nodesize = 5).
The cross-validated R-squared was not very good either.
rs_mat_rf <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- sample(1:nrow(dta1_vs), 6)
  idc_train <- -idc_test
  fit <- randomForest(Word ~ .,
                      data = dta1_vs[idc_train, ],
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta1_vs[idc_test, ], type = "response")
  y <- dta1_vs[idc_test, ]$Word
  rs_mat_rf[k, ] <- 1 - mean((y - yhat)^2) / var(y)
}
mean(rs_mat_rf)
[1] -0.5409942
quantile(rs_mat_rf,probs = c(0.025,0.975))
2.5% 97.5%
-4.0545783 0.3578068
RF
Because of the low R-squared, the present study also used the same 6 variables
to predict the median-split class Word_c (a classification problem).
The default mtry and nodesize were used as well
(for classification, randomForest's defaults are mtry = floor(sqrt(p)) and nodesize = 1).
dta1_vs_c <- dta1_vs
dta1_vs_c$Word_c <- as.factor(ifelse(dta1_vs$Word > median(dta1_vs$Word), "H", "L"))
dta1_vs_c <- dta1_vs_c[, -1] # drop the numeric Word column
auc_mat_rf_c <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  # Stratified test set: 3 "H" and 3 "L" subjects
  idc_test <- c(sample(which(dta1_vs_c$Word_c == "H"), 3),
                sample(which(dta1_vs_c$Word_c == "L"), 3))
  idc_train <- -idc_test
  fit <- randomForest(Word_c ~ .,
                      data = dta1_vs_c[idc_train, ],
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta1_vs_c[idc_test, ], type = "prob")[, 2] # P(class = "L")
  y <- dta1_vs_c[idc_test, ]$Word_c
  auc_mat_rf_c[k, ] <- pROC::auc(y, yhat)
}
mean(auc_mat_rf_c)
[1] 0.7144444
quantile(auc_mat_rf_c,probs = c(0.025,0.975))
2.5% 97.5%
0.4444444 1.0000000
Comparison
pdta <- data.frame(RS_LM = rs_mat_lm,AUC_LM = auc_mat_glm_c,
RS_RF = rs_mat_rf, AUC_RF = auc_mat_rf_c)
apply(pdta, 2, mean)
RS_LM AUC_LM RS_RF AUC_RF
-0.6092389 0.6695556 -0.5409942 0.7144444
apply(pdta, 2 , quantile,probs = c(0.025,0.975))
RS_LM AUC_LM RS_RF AUC_RF
2.5% -5.5763401 0.4444444 -4.0545783 0.4444444
97.5% 0.4659623 1.0000000 0.3578068 1.0000000
pdta <- melt(pdta)
ggplot(pdta, aes(value))+
geom_density()+
facet_wrap(~variable,scales = "free")+
theme_bw()

Classification : dta2
Because of the low reliability of the regression results,
the present study used a classification approach to gain insight into these data.
Feature selection
We performed feature selection for the classification problem as well,
fitting one-predictor models one at a time, as in Session 1.
The p-value and the AUC were used to select variables.
fit_lm <- list()
auc_lm <- data.frame(AUC = NA, variable = NA)
p_lm <- data.frame(pvalue = NA, variable = NA)
for (i in 1:30){
  # Simple logistic regression of Word_C on the i-th predictor
  fit_lm[[i]] <- summary(fit <- glm(Word_C ~ .,
                                    data = dta2[, c(i, 31)], family = "binomial"))
  names(fit_lm)[i] <- colnames(dta2)[i]
  # In-sample fitted probabilities, used for the in-sample AUC
  yhat <- predict(fit, type = "response")
  auc_lm <- rbind(auc_lm,
                  data.frame(AUC = pROC::auc(dta2$Word_C, yhat),
                             variable = colnames(dta2)[i]))
  p_lm <- rbind(p_lm,
                data.frame(pvalue = fit_lm[[i]]$coefficients[2, 4],
                           variable = colnames(dta2)[i]))
}
Feature selection
We selected the variables with AUC > 0.75 under the current setting,
which keeps the top 3 variables.
vs <- full_join(p_lm,auc_lm)
head(na.omit(vs[order(-vs$AUC),c(2,1,3)]),10)
variable pvalue AUC
7 Channel_4_Channel_1 0.05937883 0.7777778
29 Channel_8_Channel_5 0.04705111 0.7777778
13 Channel_5_Channel_4 0.09553314 0.7555556
5 Channel_3_Channel_1 0.11573316 0.7444444
19 Channel_7_Channel_1 0.10024331 0.7444444
12 Channel_5_Channel_3 0.09629423 0.7333333
8 Channel_4_Channel_2 0.08392544 0.7222222
4 Channel_2_Channel_1 0.18213073 0.7111111
23 Channel_7_Channel_5 0.14191010 0.7111111
20 Channel_7_Channel_2 0.21802741 0.6888889
dta2_vs <- dta2[, c("Word_C", filter(vs, AUC > 0.75)$variable)]
head(dta2_vs)
Word_C Channel_4_Channel_1 Channel_5_Channel_4 Channel_8_Channel_5
1 H 0.7039091 0.4112742 0.3329433
2 L -0.2355694 -0.4043635 -0.2051132
3 L 0.1320238 0.2647561 0.1667268
4 L -0.1288285 -0.2381991 -0.3403798
5 H 0.4443495 0.3985602 0.3773816
6 L 0.3856371 0.4010637 0.4028124
LM
We fit the logistic model chosen by the best subset selection method.
The package "bestglm" was used because the "leaps" package offers no GLM model selection.
However, we need to reconsider the code below, so it is left commented out.
#bs_auc_i <- matrix(NA,10000,3,dimnames =list(NULL , paste (1:3)))
#for (k in 1:10000){
# set.seed(k)
# print(k)
# idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
# sample(which(dta2_vs$Word_C == "L"), 3))
# idc_train <- -idc_test
# best.fit <- bestglm(dta2_vs[idc_train,c(2:4,1)],
# IC="AIC",method ="exhaustive",family=binomial,nvmax = 3)
# yhat <- predict(best.fit$BestModel,newdata = dta2_vs[idc_test,c(2:4,1)],
# type = "response")
# n = nrow(as.data.frame(best.fit$BestModel$coefficients)) - 1
# bs_auc_i[k,n] = pROC::auc(dta2_vs[idc_test,]$Word_C,as.numeric(yhat))
#}
#sort(apply(bs_auc_i,2, mean,na.rm =T))
LM
After best subset selection, the best model contains 1 variable:
Channel_8_Channel_5.
best.fit <- bestglm(dta2_vs[, c(2:4, 1)],
                    IC = "AIC", method = "exhaustive", family = binomial, nvmax = 1)
Morgan-Tatar search since family is non-gaussian.
summary(best.fit$BestModel)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7242 -0.9143 0.3820 0.8524 2.0069
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8311 0.6805 1.221 0.2220
Channel_8_Channel_5 -3.8347 1.9310 -1.986 0.0471 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 26.287 on 18 degrees of freedom
Residual deviance: 20.690 on 17 degrees of freedom
AIC: 24.69
Number of Fisher Scoring iterations: 4
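As a check on the output above, for a GLM the AIC equals the residual deviance plus twice the number of estimated coefficients: 20.690 + 2 x 2 = 24.69, matching the reported AIC.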
LM
We ran 1000 replications to obtain the cross-validated AUC of the present model.
auc_mat_lm <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                sample(which(dta2_vs$Word_C == "L"), 3))
  idc_train <- -idc_test
  fit <- glm(Word_C ~ Channel_8_Channel_5,
             data = dta2_vs[idc_train, ], family = "binomial")
  yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "response")
  auc_mat_lm[k, ] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
}
mean(auc_mat_lm) # 0.791
[1] 0.7911667
quantile(auc_mat_lm,probs = c(0.025,0.975)) # 0.5555556 1.0000000
2.5% 97.5%
0.5555556 1.0000000
RF
For the RF model, we used all 3 variables with the default mtry and nodesize to
construct the model.
The AUC was slightly higher than the GLM's.
auc_mat_rf_3 <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                sample(which(dta2_vs$Word_C == "L"), 3))
  idc_train <- -idc_test
  fit <- randomForest(Word_C ~ .,
                      data = dta2_vs[idc_train, ],
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "prob")[, 2]
  auc_mat_rf_3[k, ] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
}
mean(auc_mat_rf_3)
[1] 0.7997222
quantile(auc_mat_rf_3,probs = c(0.025,0.975))
2.5% 97.5%
0.5 1.0
RF
If we tune the RF model (mtry = 1, nodesize = 7),
we obtain a noticeably better AUC.
auc_mat_rf_3_tune <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                sample(which(dta2_vs$Word_C == "L"), 3))
  idc_train <- -idc_test
  fit <- randomForest(Word_C ~ .,
                      data = dta2_vs[idc_train, ],
                      mtry = 1, nodesize = 7,
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "prob")[, 2]
  auc_mat_rf_3_tune[k, ] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
}
mean(auc_mat_rf_3_tune)
[1] 0.8253889
quantile(auc_mat_rf_3_tune,probs = c(0.025,0.975))
2.5% 97.5%
0.5555556 1.0000000
# Mean CV AUC by number of selected variables (default settings unless noted):
# 9 var = 0.729
# 7 var = 0.726
# 6 var = 0.748
# 5 var = 0.779 (mtry = 1, nodesize = 4: 0.797)
# 3 var = 0.799 (mtry = 1, nodesize = 7: 0.825)
# 2 var = 0.7333889
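The tuned settings above were presumably found by trying a few values by hand; a minimal grid-search sketch under the same stratified CV-AUC scheme (the grid values are illustrative, and fewer trees and replications are used to keep it fast):
grid <- expand.grid(mtry = 1:3, nodesize = c(1, 4, 7))
grid$auc <- NA
for (g in 1:nrow(grid)) {
  aucs <- numeric(100)
  for (k in 1:100) {
    set.seed(k)
    idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                  sample(which(dta2_vs$Word_C == "L"), 3))
    fit <- randomForest(Word_C ~ ., data = dta2_vs[-idc_test, ],
                        mtry = grid$mtry[g], nodesize = grid$nodesize[g],
                        ntree = 1001)
    yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "prob")[, 2]
    aucs[k] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
  }
  grid$auc[g] <- mean(aucs)
}
grid[order(-grid$auc), ] # settings ranked by mean CV AUC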
We also tried the top 5 variables in the GLM and RF models.
GLM: the result was the same as with 3 variables, because best subset selection
again chose only Channel_8_Channel_5.
RF: demonstrated below.
RF_2
AUC > 0.74 : 5 variables
Default mtry and nodesize
# Side-by-side ranking of the variables by p-value and by AUC (note: vs is reassigned here)
head(na.omit(cbind(p_lm[order(p_lm$pvalue), c(2, 1)],
                   vs <- auc_lm[order(-auc_lm$AUC), c(2, 1)])), 10)
variable pvalue variable AUC
29 Channel_8_Channel_5 0.04705111 Channel_4_Channel_1 0.7777778
7 Channel_4_Channel_1 0.05937883 Channel_8_Channel_5 0.7777778
8 Channel_4_Channel_2 0.08392544 Channel_5_Channel_4 0.7555556
13 Channel_5_Channel_4 0.09553314 Channel_3_Channel_1 0.7444444
12 Channel_5_Channel_3 0.09629423 Channel_7_Channel_1 0.7444444
19 Channel_7_Channel_1 0.10024331 Channel_5_Channel_3 0.7333333
5 Channel_3_Channel_1 0.11573316 Channel_4_Channel_2 0.7222222
23 Channel_7_Channel_5 0.14191010 Channel_2_Channel_1 0.7111111
6 Channel_3_Channel_2 0.16463977 Channel_7_Channel_5 0.7111111
4 Channel_2_Channel_1 0.18213073 Channel_7_Channel_2 0.6888889
dta2_vs <- dta2[, c("Word_C", filter(vs, AUC > 0.74)$variable)]
auc_mat_rf_5 <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                sample(which(dta2_vs$Word_C == "L"), 3))
  idc_train <- -idc_test
  fit <- randomForest(Word_C ~ .,
                      data = dta2_vs[idc_train, ],
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "prob")[, 2]
  auc_mat_rf_5[k, ] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
}
mean(auc_mat_rf_5)
[1] 0.7790556
quantile(auc_mat_rf_5,probs = c(0.025,0.975))
2.5% 97.5%
0.4444444 1.0000000
RF_2
AUC > 0.74 : 5 variables
mtry = 1, nodesize = 4
auc_mat_rf_5_tune <- matrix(NA,1000,1)
for (k in 1:1000){
  set.seed(k)
  idc_test <- c(sample(which(dta2_vs$Word_C == "H"), 3),
                sample(which(dta2_vs$Word_C == "L"), 3))
  idc_train <- -idc_test
  fit <- randomForest(Word_C ~ .,
                      data = dta2_vs[idc_train, ],
                      mtry = 1, nodesize = 4,
                      ntree = 10001, importance = FALSE)
  yhat <- predict(fit, newdata = dta2_vs[idc_test, ], type = "prob")[, 2]
  auc_mat_rf_5_tune[k, ] <- pROC::auc(dta2_vs[idc_test, ]$Word_C, yhat)
}
mean(auc_mat_rf_5_tune)
[1] 0.797
quantile(auc_mat_rf_5_tune,probs = c(0.025,0.975))
2.5% 97.5%
0.4444444 1.0000000
Comparison
Now we can compare the 5 models:
GLM
RF : 3 variables, default
RF : 3 variables, tuned
RF : 5 variables, default
RF : 5 variables, tuned
plotdta <- data.frame(GLM_1 = auc_mat_lm,
RF_3 = auc_mat_rf_3,
RF_3t = auc_mat_rf_3_tune,
RF_5 = auc_mat_rf_5,
RF_5t = auc_mat_rf_5_tune)
apply(plotdta,2,mean)
GLM_1 RF_3 RF_3t RF_5 RF_5t
0.7911667 0.7997222 0.8253889 0.7790556 0.7970000
apply(plotdta,2,quantile,probs = c(0.025,0.975))
GLM_1 RF_3 RF_3t RF_5 RF_5t
2.5% 0.5555556 0.5 0.5555556 0.4444444 0.4444444
97.5% 1.0000000 1.0 1.0000000 1.0000000 1.0000000
Comparison
p <- melt(plotdta)
colnames(p)[1:2] <- c("Model","AUC")
Fig1 <- ggplot(p,aes(AUC,fill = Model, col = Model))+
geom_density(alpha = 0.3)+
theme_bw()+
xlab("AUC")
mdta <- ddply(p, "Model", summarise, average=mean(AUC))
prodta <- ddply(p,"Model",summarise,
Q.025 = quantile(AUC,probs = c(0.025)),
Q.975 = quantile(AUC,probs = c(0.975)))
Fig2 <- Fig1+
facet_grid(Model~.)+
geom_vline(data=mdta , aes(xintercept=average),
linetype="dashed", size=1, colour="red")+
geom_vline(data = prodta,aes(xintercept=Q.025),
linetype="dashed", size=1, colour="darkgoldenrod")+
geom_vline(data = prodta,aes(xintercept=Q.975),
linetype="dashed", size=1, colour="darkgoldenrod")
Fig1 is an interactive plot; we can click the models in the legend to gain
the best visual experience.
ggplotly(Fig1)
Fig2

Conclusion
1. With the current small sample, the regression problem can hardly yield a stable
cross-validated R-squared.
2. From the classification problem, we learn that
the GLM has a good AUC (0.791),
the default 3-variable RF is slightly better (0.799),
and the tuned 3-variable RF has the highest AUC (0.825).
3. These results illustrate the two cultures of modeling on resting-state connectivity data,
and should encourage NIRS users to apply both the GLM and
RF approaches, rather than a single LM, to analyze their resting-state data.