Outline

Data

1. Resting State Signal

Preprocessing

1. Summarize Data
2. Connectivity

Model Building

1. SVM Method
2. Tree-Based
3. Random Forest
4. Boosting

Summary

1. Results Reporting

Data

  Subject Sex   Age Peabody      Time 1      Time 2      Time 3
1    Sub1   M 10.41     113 1.67935e-05 1.66073e-05 1.64011e-05
2    Sub1   M 10.41     113 1.21074e-05 1.20908e-05 1.20587e-05
3    Sub1   M 10.41     113 5.22465e-06 5.05071e-06 4.86925e-06
'data.frame':   160 obs. of  7 variables:
 $ Subject: Factor w/ 20 levels "Sub1","Sub10",..: 1 1 1 1 1 1 1 1 2 2 ...
 $ Sex    : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
 $ Age    : num  10.4 10.4 10.4 10.4 10.4 ...
 $ Peabody: int  113 113 113 113 113 113 113 113 119 119 ...
 $ Time 1 : num  1.68e-05 1.21e-05 5.22e-06 3.50e-06 9.27e-06 ...
 $ Time 2 : num  1.66e-05 1.21e-05 5.05e-06 3.21e-06 9.44e-06 ...
 $ Time 3 : num  1.64e-05 1.21e-05 4.87e-06 2.92e-06 9.61e-06 ...

Data Exploration

Data Exploration : Subject 2

Data Summary

  Peabody Channel_2_Channel_1 Channel_3_Channel_1 Channel_3_Channel_2
1     113           0.2485940          0.46288574          0.21175610
2     119           0.7715194         -0.20377880         -0.29934220
3      77           0.8711941          0.23371935          0.20358104
4     112           0.2212408          0.06037749          0.08020694
5     115           0.3854612          0.29656393          0.15374033
6      93           0.7481600          0.46951380          0.23374250
'data.frame':   20 obs. of  6 variables:
 $ Peabody            : int  113 119 77 112 115 93 105 125 109 86 ...
 $ Channel_2_Channel_1: num  0.249 0.772 0.871 0.221 0.385 ...
 $ Channel_3_Channel_1: num  0.4629 -0.2038 0.2337 0.0604 0.2966 ...
 $ Channel_3_Channel_2: num  0.2118 -0.2993 0.2036 0.0802 0.1537 ...
 $ Ch2_mean           : num  1.09e-06 -4.08e-07 2.94e-07 -6.49e-07 3.64e-08 ...
 $ Ch2_var            : num  6.07e-11 9.48e-11 4.13e-11 1.19e-10 1.35e-11 ...
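
The channel-pair columns above appear to be per-subject correlations between channel time courses, with Ch2_mean / Ch2_var as per-channel summaries. A minimal sketch of how such a table could be built from the long-format data; the raw-data name raw_dta and the assumption that rows within a subject are channels 1, 2, 3, ... in order are mine:

time_cols <- grep("^Time", names(raw_dta), value = TRUE)   # "Time 1", "Time 2", ...
conn <- do.call(rbind, lapply(split(raw_dta, raw_dta$Subject), function(d) {
        ts <- t(as.matrix(d[, time_cols]))   # time points x channels
        r  <- cor(ts)                        # channel-by-channel correlation matrix
        data.frame(Peabody = d$Peabody[1],
                   Channel_2_Channel_1 = r[2, 1],
                   Channel_3_Channel_1 = r[3, 1],
                   Channel_3_Channel_2 = r[3, 2],
                   Ch2_mean = mean(ts[, 2]),
                   Ch2_var  = var(ts[, 2]))
}))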

Data Summary

Data Summary

Preparation

library(e1071)          # svm(), tune()
library(tree)           # tree(), prune.tree()
library(randomForest)   # randomForest(), tuneRF()
library(gbm)            # gbm()
library(leaps)          # regsubsets()
library(lm.beta)        # standardized lm coefficients

dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")
Peabodydta <- dta[-12, -c(1, 2, 3, 5:11)]   # drop one observation and unused columns
# Log transformation did not improve performance
randomnum <- 10 ; kf <- c(3, 15)            # 10 random seeds; fold counts from 3 to 15
cv.errors.svm   <- matrix(NA, kf[2] - kf[1] + 1, randomnum)
cv.errors.tree  <- matrix(NA, kf[2] - kf[1] + 1, randomnum)
cv.errors.rf    <- matrix(NA, kf[2] - kf[1] + 1, randomnum)
cv.errors.boost <- matrix(NA, kf[2] - kf[1] + 1, randomnum)

Model 1 : SVM Model

svm_tune <- tune(svm,Peabody~., data =Peabodydta, 
                 kernel="radial",
                 ranges = list(cost = seq(3, 10, 0.1),
                               gamma =seq(0, 1, 0.1)),
                 tunecontrol = tune.control(sampling = "cross", 
                                            cross = 10))
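
The grid search returns a tune object; the selected cost and gamma (reused in the CV loop below) and the corresponding cross-validation error can be read off directly:

svm_tune$best.parameters    # cost and gamma chosen by the grid search
svm_tune$best.performance   # 10-fold CV error at those parameters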

Model 1 : SVM Model

for (i in 1:randomnum){
        set.seed(i)
        for (k in kf[1]:kf[2]){
                folds.svm <- sample(1:k,nrow(Peabodydta),replace =T) 
                mat <- matrix(NA,k,1)
                for (j in 1:k){
                        rst_svm <- svm(Peabody~., data = Peabodydta[folds.svm != j,],
                                       type = "eps-regression",
                                       cost = svm_tune$best.model$cost, kernel = "radial",
                                       gamma = svm_tune$best.model$gamma)
                        pred <- predict(rst_svm,Peabodydta[folds.svm == j,])
                        mat[j,1] = mean((Peabodydta$Peabody[folds.svm == j]-pred)^2,
                                        na.rm = T)
                }
                cv.errors.svm[k-2,i] <- mean(mat[,1], na.rm = T)   # average over the k folds
        }
}

Model 1 : SVM Model

mean.cv.errors.svm <- mean(cv.errors.svm,na.rm = T)
mean.cv.errors.svm
[1] 151.4226

Model 2 : Tree Based

for (i in 1:randomnum){ 
        set.seed(i+105)
        for (k in kf[1]:kf[2]){
                folds.tree <- sample(1:k, nrow(Peabodydta), replace = T)
                mat <- matrix(NA, k, 1)
                for (j in 1:k){
                       treemodel <- tree(Peabody~. ,data  = Peabodydta[folds.tree != j,])
                       if (! inherits(treemodel, "singlenode")){
                               treemodel2 <- prune.tree(treemodel,
                                                        newdata =Peabodydta[folds.tree != j,])
                               treemodel3 <- prune.tree(treemodel,
                                                        newdata = Peabodydta[folds.tree != j,],
                                                        best = treemodel2$size[which.min(treemodel2$dev)])
                               pred <- predict(treemodel3,Peabodydta[folds.tree == j,])
                               mat[j,1] = mean((Peabodydta$Peabody[folds.tree == j]-pred)^2,
                                               na.rm = T)
                       } else {
                               pred <- predict(treemodel,Peabodydta[folds.tree == j,])
                               mat[j,1] = mean((Peabodydta$Peabody[folds.tree == j]-pred)^2,
                                               na.rm = T)
                       }
                }
                cv.errors.tree[k-2,i] <- mean(mat[,1], na.rm = T)   # average over the k folds
        }
}

Model 2 : Tree Based

mean.cv.errors.tree <- mean(cv.errors.tree,na.rm = T)
mean.cv.errors.tree
[1] 239.9233

Model 3 : Random Forest : Determine Parameter

TRF <- tuneRF(Peabodydta[,-1],Peabodydta[,1], ntreeTry=100001,stepFactor=1.5,trace = F)
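
tuneRF returns a matrix of candidate mtry values against their OOB error; the mtry = 14 used below is presumably the value that minimizes OOB error. Assuming the usual column names of the returned matrix:

TRF                                                      # mtry vs. OOB error
best_mtry <- TRF[which.min(TRF[, "OOBError"]), "mtry"]
best_mtry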

Model 3 : Random Forest

for (i in 1:randomnum){
        set.seed(i+105)
        for (k in kf[1]:kf[2]){
                folds.rf <- sample(1:k,nrow(Peabodydta),replace =T) 
                mat <- matrix(NA,k,1)
                for (j in 1:k){
                        rst_rf <- randomForest(Peabody ~.,
                                               data = Peabodydta[folds.rf != j,],
                                               mtry = 14,ntree = 1000,
                                               nodesize = 1, 
                                               importance = F)
                        pred <- predict(rst_rf,Peabodydta[folds.rf == j,])
                        mat[j,1] = mean((Peabodydta$Peabody[folds.rf == j]-pred)^2,
                                        na.rm = T)
                }
                cv.errors.rf[k-2,i] <- mean(mat[,1], na.rm = T)   # average over the k folds
        }
}

Model 3 : Random Forest

mean.cv.errors.rf <- mean(cv.errors.rf,na.rm = T)
mean.cv.errors.rf
[1] 147.9699

Model 4 : Boosting

randomnum <- 2   # fewer repetitions (and only 10-15 folds below) to limit run time
for (i in 1:randomnum){
        set.seed(i)
        for (k in (kf[1]+7):kf[2]){
                folds.boost <- sample(1:k,nrow(Peabodydta),replace =T) 
                mat <- matrix(NA,k,1)
                for (j in 1:k){
                        rst_gbm <- gbm(Peabody ~ . ,
                                       data = Peabodydta[folds.boost != j,],
                                       distribution = "gaussian",
                                       n.trees = 1000, cv.folds = 2,
                                       interaction.depth = 10, 
                                       n.minobsinnode = 1, shrinkage = 0.001)
                        pred <- predict(rst_gbm,Peabodydta[folds.boost == j,])
                        mat[j,1] = mean((Peabodydta$Peabody[folds.boost== j]-pred)^2,
                                        na.rm = T)
                }
                cv.errors.boost[k-2,i] <- mean(mat[,1], na.rm = T)   # average over the k folds
        }
}
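
Since cv.folds = 2 is set, a safer pattern is to make the number of trees used for prediction explicit instead of relying on predict()'s behaviour when n.trees is missing; a sketch, reusing the last model fitted in the loop:

best_iter <- gbm.perf(rst_gbm, method = "cv", plot.it = FALSE)   # CV-selected iteration
pred <- predict(rst_gbm, Peabodydta[folds.boost == j, ], n.trees = best_iter)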

Model 4 : Boosting

mean.cv.errors.boost <- mean(cv.errors.boost,na.rm = T)
mean.cv.errors.boost
[1] 126.5121

Model Selection

Best Results : ???

# Boosting has the lowest mean CV error so far, but:
# how do we get variable information, and how do we report an R-squared (or an R-squared interval)?
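
One way to answer the question is to line up the repeated-CV estimates computed above (object names as defined earlier):

cv_summary <- data.frame(Model = c("SVM", "Tree", "Random Forest", "Boosting"),
                         Mean_CV_MSE = c(mean.cv.errors.svm, mean.cv.errors.tree,
                                         mean.cv.errors.rf, mean.cv.errors.boost))
cv_summary[order(cv_summary$Mean_CV_MSE), ]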

Model 5 : Linear Model → Variable Selection

k <- 10 ; nvmax <- 10 ; seed <- 50
mean.cv.errorsP <- matrix(NA, seed, nvmax, dimnames = list(NULL, paste(1:nvmax)))
for (m in 1:seed){
        set.seed(m+105)
        foldsP <- sample(1:k,nrow(Peabodydta),replace =T)
        cv.errorsP <- matrix (NA ,k ,nvmax)
        for(j in 1:k){
                best.fit <- regsubsets(Peabody~. ,data=Peabodydta[foldsP != j,],
                                       method ="forward",nvmax= nvmax)
                for(i in 1:nvmax) {
                        pred <- predict(best.fit ,Peabodydta[foldsP == j,] , id=i)
                        cv.errorsP [j,i]=mean( (Peabodydta$Peabody[foldsP == j]-pred)^2 ,
                                               na.rm = T)
                }
        }
        mean.cv.errorsP[m,] <- apply(cv.errorsP ,2, mean,na.rm =T)
}
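
regsubsets objects have no predict() method in leaps, so the loop above assumes a helper defined beforehand, along the lines of the one used in the ISLR lab:

predict.regsubsets <- function(object, newdata, id, ...) {
        form  <- as.formula(object$call[[2]])   # formula passed to regsubsets()
        mat   <- model.matrix(form, newdata)
        coefi <- coef(object, id = id)
        drop(mat[, names(coefi)] %*% coefi)
}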

Model 5 : Linear Model → Variable Selection

sort(apply(mean.cv.errorsP,2, mean,na.rm =T))[1:10]
        1         2         3         4         5         6         7 
 459.4132  629.5945  853.4165 1191.0588 1498.7264 1787.4391 1831.0421 
        8         9        10 
1950.2570 2153.6866 2208.4213 

1 Variable

The forward-selection model with a single predictor has the lowest CV error above, so it is refit on the full data and cross-validated below.

reg.bestP <- regsubsets(Peabody~. ,data=Peabodydta, nvmax =1)
coef(reg.bestP,1)
  (Intercept)       Ch4_var 
 1.171425e+02 -2.009040e+11 
cv.errors.lm <- matrix(NA, kf[2] - kf[1] + 1, randomnum)   # note: randomnum is still 2 here (set on the boosting slide)
Rs.lm <- matrix(NA, kf[2] - kf[1] + 1, randomnum)          # allocated but not filled in the loop below
for (i in 1:randomnum){
        set.seed(i)
        for (k in kf[1]:kf[2]){
                folds.lm <- sample(1:k, nrow(Peabodydta), replace = T)
                mat  <- matrix(NA, k, 1)
                mat2 <- matrix(NA, k, 1)
                for (j in 1:k){
                        best.fit <- lm(Peabody ~ Ch4_var, data = Peabodydta[folds.lm != j,])
                        pred <- predict(best.fit, Peabodydta[folds.lm == j,])
                        mat[j,1] = mean((Peabodydta$Peabody[folds.lm == j] - pred)^2, na.rm = T)
                }
                cv.errors.lm[k-2,i] <- mean(mat[,1], na.rm = T)   # average over the k folds
        }
}

Model 5 : Linear Model

mean.cv.errors.lm <- mean(cv.errors.lm,na.rm = T)
mean.cv.errors.lm
[1] 79.22216

Model Comparison

LM versus ???

fit1 <- summary(lm.beta(lm(Peabody~Ch4_var,data=Peabodydta)))
fit1$r.squared
[1] 0.4390768
fit1$adj.r.squared
[1] 0.4060813
# How can we get a fitted R-squared from the other (non-linear) models?
# Or: how can we get an R-squared interval from the LM?
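
Two possible answers, sketched with the objects used above: a bootstrap percentile interval for the LM's R-squared (via the boot package), and a pseudo R-squared computed from any model's held-out predictions.

library(boot)
rsq_fun <- function(data, idx) {
        summary(lm(Peabody ~ Ch4_var, data = data[idx, ]))$r.squared
}
set.seed(1)
rsq_boot <- boot(Peabodydta, rsq_fun, R = 2000)
boot.ci(rsq_boot, type = "perc")
# For the other models: pseudo R-squared from held-out predictions,
# 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)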