Subject Sex Age Peabody Time 1 Time 2 Time 3 1 Sub1 M 10.41 113 1.67935e-05 1.66073e-05 1.64011e-05 2 Sub1 M 10.41 113 1.21074e-05 1.20908e-05 1.20587e-05 3 Sub1 M 10.41 113 5.22465e-06 5.05071e-06 4.86925e-06
'data.frame': 160 obs. of 7 variables: $ Subject: Factor w/ 20 levels "Sub1","Sub10",..: 1 1 1 1 1 1 1 1 2 2 ... $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ... $ Age : num 10.4 10.4 10.4 10.4 10.4 ... $ Peabody: int 113 113 113 113 113 113 113 113 119 119 ... $ Channel: Factor w/ 8 levels "Channel_1","Channel_2",..: 1 2 3 4 5 6 7 8 1 2 ... $ Time 1 : num 1.68e-05 1.21e-05 5.22e-06 3.50e-06 9.27e-06 ... $ Time 2 : num 1.66e-05 1.21e-05 5.05e-06 3.21e-06 9.44e-06 ...
Peabody Channel_2_Channel_1 Channel_3_Channel_1 Channel_3_Channel_2 1 113 0.2485940 0.46288574 0.21175610 2 119 0.7715194 -0.20377880 -0.29934220 3 77 0.8711941 0.23371935 0.20358104 4 112 0.2212408 0.06037749 0.08020694 5 115 0.3854612 0.29656393 0.15374033 6 93 0.7481600 0.46951380 0.23374250
'data.frame': 20 obs. of 6 variables: $ Peabody : int 113 119 77 112 115 93 105 125 109 86 ... $ Channel_2_Channel_1: num 0.249 0.772 0.871 0.221 0.385 ... $ Channel_3_Channel_1: num 0.4629 -0.2038 0.2337 0.0604 0.2966 ... $ Channel_3_Channel_2: num 0.2118 -0.2993 0.2036 0.0802 0.1537 ... $ Ch2_mean : num 1.09e-06 -4.08e-07 2.94e-07 -6.49e-07 3.64e-08 ... $ Ch2_var : num 6.07e-11 9.48e-11 4.13e-11 1.19e-10 1.35e-11 ...
dta <- readRDS("ML_NIRS_Connectivity_MeanVar.Rdata")
Peabodydta<- dta[-12,-c(1,2,3,5:9)]
Peabodydta[,-c(1:31)] <- scale(Peabodydta[,-c(1:31)])
set.seed(105);Peabodydta$Dummy <- rnorm(19,1)
head(Peabodydta[,c(1:4,47:48)],5)
Peabody Sex Age Channel_2_Channel_1 Ch8_var Dummy 1 113 M 10.41 0.2485940 1.2503216 -0.2935239 2 119 M 9.95 0.7715194 -0.3490003 0.5338457 3 77 M 10.22 0.8711941 0.1119929 1.1338142 4 112 M 9.68 0.2212408 -0.4687394 2.2944794 5 115 F 10.35 0.3854612 -0.2651453 1.1178403
set.seed(123);TRF <- tuneRF(Peabodydta[,-1],Peabodydta[,1],
ntreeTry=1000001,stepFactor=1.5,trace = F)
set.seed(123)
varsel <- randomForest(Peabody ~.,
data = Peabodydta,
mtry = 22,ntree = 1000001,
nodesize = 1,
importance = T)
varsel$importance[order(varsel$importance[,1]),]
%IncMSE IncNodePurity Channel_5_Channel_2 -1.26692012 45.020869 Channel_8_Channel_4 -1.14270707 23.996376 Channel_8_Channel_7 -1.13758886 47.267152 Ch3_mean -0.95040574 44.851354 Channel_5_Channel_1 -0.93381523 140.873086 Channel_7_Channel_4 -0.88307192 18.878355 Channel_7_Channel_3 -0.82524806 25.899720 Channel_6_Channel_2 -0.79793277 28.887566 Channel_5_Channel_3 -0.73858884 38.803758 Ch6_var -0.68543562 25.137874 Channel_2_Channel_1 -0.64943523 51.395019 Ch7_mean -0.56183720 14.197490 Channel_8_Channel_2 -0.53611799 41.885418 Ch1_mean -0.53125383 14.326344 Channel_6_Channel_1 -0.52952659 36.316023 Channel_8_Channel_3 -0.50709153 25.096908 Dummy -0.42222303 18.498763 Ch4_mean -0.40603089 24.584581 Channel_6_Channel_5 -0.36654484 13.256720 Channel_8_Channel_6 -0.35720776 20.174673 Ch6_mean -0.29821483 31.482808 Ch1_var -0.19415619 10.096153 Channel_7_Channel_5 -0.16318440 18.757238 Channel_5_Channel_4 -0.13157746 15.051297 Channel_4_Channel_3 -0.13107029 11.307917 Channel_6_Channel_3 -0.11000326 6.615137 Channel_8_Channel_5 -0.09631405 22.299722 Channel_4_Channel_2 -0.08597172 20.725603 Sex -0.06464237 4.036111 Channel_7_Channel_1 -0.06276849 56.358516 Age -0.04887143 8.172189 Channel_6_Channel_4 -0.04829703 20.888482 Channel_7_Channel_2 -0.02576633 11.681928 Ch8_mean 0.02768164 75.289578 Ch2_var 0.05543804 9.240934 Channel_8_Channel_1 0.05830780 63.252828 Channel_7_Channel_6 0.19000152 9.149676 Channel_3_Channel_2 0.29901188 50.330804 Ch8_var 0.44866303 48.422163 Channel_4_Channel_1 0.68733961 36.212874 Ch3_var 0.92824122 130.204295 Ch2_mean 0.93501184 45.674473 Ch5_var 1.98610028 76.698862 Ch5_mean 2.10173097 230.840079 Channel_3_Channel_1 9.00271448 113.636496 Ch7_var 10.96934208 365.067988 Ch4_var 16.40177844 360.466172
varsel$importance[order(-varsel$importance[,1]),][1:5,]
%IncMSE IncNodePurity Ch4_var 16.401778 360.46617 Ch7_var 10.969342 365.06799 Channel_3_Channel_1 9.002714 113.63650 Ch5_mean 2.101731 230.84008 Ch5_var 1.986100 76.69886
Peabodydta <-Peabodydta[,c("Peabody",
rownames(varsel$importance
[which(varsel$importance[,1] >= 1),]))]
randomnum = 100 ; kf <- c(3,19) cv.errors.svm<- matrix (NA ,kf[2] - kf[1] +1 ,randomnum) cv.errors.tree<- matrix (NA ,kf[2] - kf[1] +1 ,randomnum) cv.errors.rf<- matrix (NA ,kf[2] - kf[1] +1 ,randomnum) cv.errors.boost<- matrix (NA ,kf[2] - kf[1] +1 ,randomnum)
svm_tune <- tune(svm,Peabody~., data =Peabodydta,
kernel="radial",
ranges = list(cost = seq(45,55 ,0.1),
gamma =seq(0, 1, 0.1)),
tunecontrol = tune.control(sampling = "cross",
cross = 19))
for (i in 1:randomnum){
set.seed(i)
for (k in kf[1]:kf[2]){
folds.svm <- sample(1:k,nrow(Peabodydta),replace =T)
mat <- matrix(NA,k,1)
for (j in 1:k){
rst_svm <- svm(Peabody~., data = Peabodydta[folds.svm != j,],
type = "eps-regression",
cost = svm_tune$best.model$cost, kernel = "radial",
gamma = svm_tune$best.model$gamma)
pred <- predict(rst_svm,Peabodydta[folds.svm == j,])
mat[j,1] = mean((Peabodydta$Peabody[folds.svm == j]-pred)^2,
na.rm = T)
}
cv.errors.svm[k-2,i] <- mean(mat[j,1],na.rm = T)
}
}
mean.cv.errors.svm <- mean(cv.errors.svm,na.rm = T) mean.cv.errors.svm
[1] 92.50797
for (i in 1:randomnum){
set.seed(i)
for (k in kf[1]:kf[2]){
folds.tree <- sample(1:k,nrow(Peabodydta),replace =T) ; mat <- matrix(NA,k,1)
for (j in 1:k){
treemodel <- tree(Peabody~. ,data = Peabodydta[folds.tree != j,])
if (! inherits(treemodel, "singlenode")){
treemodel2 <- prune.tree(treemodel,
newdata =Peabodydta[folds.tree != j,])
treemodel3 <- prune.tree(treemodel,
newdata = Peabodydta[folds.tree != j,],
best = treemodel2$size[which.min(treemodel2$dev)])
pred <- predict(treemodel3,Peabodydta[folds.tree == j,])
mat[j,1] = mean((Peabodydta$Peabody[folds.tree == j]-pred)^2,
na.rm = T)
} else {
pred <- predict(treemodel,Peabodydta[folds.tree == j,])
mat[j,1] = mean((Peabodydta$Peabody[folds.tree == j]-pred)^2,
na.rm = T)
}
}
cv.errors.tree[k-2,i] <- mean(mat[j,1],na.rm = T)
}
}
mean.cv.errors.tree <- mean(cv.errors.tree,na.rm = T) mean.cv.errors.tree
[1] 197.1351
TRF <- tuneRF(Peabodydta[,-1],Peabodydta[,1], ntreeTry=100001,stepFactor=0.9,trace = F)
for (i in 1:randomnum){
set.seed(i)
for (k in kf[1]:kf[2]){
folds.rf <- sample(1:k,nrow(Peabodydta),replace =T)
mat <- matrix(NA,k,1)
for (j in 1:k){
rst_rf <- randomForest(Peabody ~.,
data = Peabodydta[folds.rf != j,],
mtry = 1,ntree = 1000,
nodesize = 1,
importance = F)
pred <- predict(rst_rf,Peabodydta[folds.rf == j,])
mat[j,1] = mean((Peabodydta$Peabody[folds.rf == j]-pred)^2,
na.rm = T)
}
cv.errors.rf[k-2,i] <- mean(mat[j,1],na.rm = T)
}
}
mean.cv.errors.rf <- mean(cv.errors.rf,na.rm = T) mean.cv.errors.rf
[1] 91.13164
randomnum = 2
for (i in 1:randomnum){
set.seed(i)
for (k in kf[1]:kf[2]){
folds.boost <- sample(1:k,nrow(Peabodydta),replace =T)
mat <- matrix(NA,k,1)
for (j in 1:k){
rst_gbm <- gbm(Peabody ~ . ,
data = Peabodydta[folds.boost != j,],
distribution = "gaussian",
n.trees = 1000, cv.folds = 2,
interaction.depth = 10,
n.minobsinnode = 0.1, shrinkage = 0.001)
M_hat <- which.min(rst_gbm$cv.error) ## ???
pred <- predict(rst_gbm,Peabodydta[folds.boost == j,],n.trees = M_hat)
mat[j,1] = mean((Peabodydta$Peabody[folds.boost== j]-pred)^2,
na.rm = T)
}
cv.errors.boost[k-2,i] <- mean(mat[j,1],na.rm = T)
}
}
mean.cv.errors.boost <- mean(cv.errors.boost,na.rm = T) mean.cv.errors.boost
[1] 59.0104
set.seed(105);RF.RS <- matrix(NA,1000)
for (i in 1:1000){
idc_train <- sample(c(T, F), nrow(Peabodydta), replace = T, prob = c(.7, .3))
idc_test <- !idc_train
rst_rf <- randomForest(Peabody ~.,data = Peabodydta[idc_train,],
mtry = 1,ntree = 1000,nodesize = 1, importance = F)
pred <- predict(rst_rf,Peabodydta[idc_test,])
RF.RS [i,1] = 1 - (mean((Peabodydta[idc_test,]$Peabody-pred)^2)/var(Peabodydta[idc_test,]$Peabody))
}
print(RSQ <- quantile(RF.RS,probs = c(0.025,0.975),na.rm = T))
2.5% 97.5% -1.8582223 0.8704307
# variable I
k <- 10 ; nvmax= 5 ; seed <- 50
mean.cv.errorsP <- matrix(NA,seed,nvmax,dimnames =list(NULL , paste (1:5) ))
for (m in 1:seed){
set.seed(m)
foldsP <- sample(1:k,nrow(Peabodydta),replace =T)
cv.errorsP <- matrix (NA ,k ,nvmax)
for(j in 1:k){
best.fit <- regsubsets(Peabody~. ,data=Peabodydta[foldsP != j,],
method ="forward",nvmax= nvmax)
for(i in 1:nvmax) {
pred <- predict(best.fit ,Peabodydta[foldsP == j,] , id=i)
cv.errorsP [j,i]=mean( (Peabodydta$Peabody[foldsP == j]-pred)^2 ,
na.rm = T)
}
}
mean.cv.errorsP[m,] <- apply(cv.errorsP ,2, mean,na.rm =T)
}
sort(apply(mean.cv.errorsP,2, mean,na.rm =T))[1:5]
5 1 2 4 3 456.5688 498.1490 501.6620 507.0396 526.2155
randomnum = 100 reg.bestP <- regsubsets(Peabody~. ,data=Peabodydta, nvmax =1) coef(reg.bestP,1)
(Intercept) Ch4_var 107.42105 -8.10443
cv.errors.lm<- matrix (NA ,kf[2] - kf[1] +1 ,randomnum)
for (i in 1:randomnum){
set.seed(i)
for (k in kf[1]:kf[2]){
folds.lm <- sample(1:k,nrow(Peabodydta),replace =T)
mat <- matrix(NA,k,1)
mat2 <- matrix(NA,k,1)
for (j in 1:k){
best.fit <- lm(Peabody~Ch4_var ,data=Peabodydta[folds.lm != j,])
pred <- predict(best.fit ,Peabodydta[folds.lm == j,])
mat[j,1] = mean((Peabodydta$Peabody[folds.lm == j]-pred)^2,na.rm = T)
}
cv.errors.lm[k-2,i] <- mean(mat[j,1],na.rm = T)
}
}
mean.cv.errors.lm <- mean(cv.errors.lm,na.rm = T) mean.cv.errors.lm
[1] 101.8346
fit1 <- summary(lm.beta(lm(Peabody~.,data=Peabodydta))) fit1$r.squared
[1] 0.593395
fit1$adj.r.squared
[1] 0.4370085
LM.RS <- matrix(NA,1000)
for (i in 1:1000){
idc_train <- sample(c(T, F), nrow(Peabodydta), replace = T, prob = c(.7, .3))
idc_test <- !idc_train
rst_rf <- lm(Peabody ~.,data = Peabodydta[idc_train,])
pred <- predict(rst_rf,Peabodydta[idc_test,])
LM.RS [i,1] = 1 - (mean((Peabodydta[idc_test,]$Peabody-pred)^2)/var(Peabodydta[idc_test,]$Peabody))
}
print(LMRSQ <- quantile(LM.RS,probs = c(0.025,0.975),na.rm = T))
2.5% 97.5% -23.9457253 0.7565058