library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
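mlbench.friedman1 simulates the Friedman (1991) benchmark: ten independent uniform predictors, of which only the first five enter the response.

## y = 10*sin(pi*x1*x2) + 20*(x3 - 0.5)^2 + 10*x4 + 5*x5 + N(0, sd^2)
## V6-V10 are pure noise, so a sound importance measure should rank them near zero.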
library(AppliedPredictiveModeling)
library(randomForest)
library(caret)
set.seed(290875)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1[order(-rfImp1$Overall), , drop = FALSE]
## Overall
## V1 8.58724062
## V4 7.88939319
## V2 6.52329383
## V5 2.23383955
## V3 0.80900957
## V6 0.18348535
## V7 0.05305724
## V10 0.04062263
## V9 -0.04990517
## V8 -0.05687335
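The later fits (model4, model8, model10) use a predictor duplicated1 that is never created in this excerpt. Following the exercise, it is V1 plus a little noise; the noise scale of 0.1 is an assumption taken from the book's solution code:

## add a predictor highly correlated with V1 (noise scale 0.1 assumed)
set.seed(200)
simulated$duplicated1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicated1, simulated$V1)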
library(party)
set.seed(290875)
model3 <- cforest(y ~ ., data = simulated[, 1:11],
                  control = cforest_unbiased(ntree = 50))
rfImp3 <- varImp(model3)
rfImp3[order(-rfImp3$Overall), , drop = FALSE]
## Overall
## V1 7.876370569
## V4 7.663030163
## V2 6.957624604
## V5 2.142503187
## V3 0.147828002
## V7 0.027086136
## V8 -0.008739985
## V9 -0.014319658
## V10 -0.028399237
## V6 -0.054397847
set.seed(290875)
model4 <- cforest(y ~ ., data = simulated,
                  control = cforest_unbiased(ntree = 50))
rfImp4 <- varImp(model4)
rfImp4[order(-rfImp4$Overall), , drop = FALSE]
## Overall
## V4 7.82169148
## V2 6.09618909
## duplicated1 4.56383879
## V1 4.55477176
## V5 1.94271163
## V3 0.04272389
## V8 0.03834108
## V9 0.02950669
## V7 0.00824838
## V6 -0.04574537
## V10 -0.04802574
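party also offers a conditional permutation importance that adjusts for correlated predictors such as V1 and duplicated1; a quick (and slower) check would look like:

## conditional importance shares credit between correlated predictors
set.seed(290875)
party::varimp(model4, conditional = TRUE)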
library(gbm)
set.seed(290875)
model5 <- gbm(y ~ ., data = simulated[, 1:11], distribution = "gaussian")
rfImp5 <- varImp(model5, numTrees = 50)
rfImp5[order(-rfImp5$Overall), , drop = FALSE]
## Overall
## V4 4104.4032
## V1 4063.8211
## V2 2837.7922
## V5 1214.4972
## V3 518.8255
## V6 0.0000
## V7 0.0000
## V8 0.0000
## V9 0.0000
## V10 0.0000
set.seed(290875)
model6 <- gbm(y ~ ., data = simulated, distribution = "gaussian")
rfImp6 <- varImp(model6, numTrees = 50)
rfImp6[order(-rfImp6$Overall), , drop = FALSE]
## Overall
## V4 4232.5788
## V2 2748.8758
## V1 2629.7275
## duplicated1 1574.3343
## V5 1207.0833
## V3 511.0473
## V6 0.0000
## V7 0.0000
## V8 0.0000
## V9 0.0000
## V10 0.0000
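The same relative-influence numbers are available directly from gbm without caret, e.g.:

## relative influence computed from the first 50 trees, skipping the bar plot
summary(model6, n.trees = 50, plotit = FALSE)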
library(Cubist)
set.seed(290875)
model7 <- cubist(simulated[,1:10], simulated[,11])
rfImp7 <- varImp(model7)  # numTrees does not apply to Cubist models
rfImp7[order(-rfImp7$Overall), , drop = FALSE]
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
set.seed(290875)
model8 <- cubist(simulated[,c(1:10, 12)], simulated[,11])
rfImp8 <- varImp(model8)  # numTrees does not apply to Cubist models
rfImp8[order(-rfImp8$Overall), , drop = FALSE]
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicated1 0
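Cubist's importance is derived from how often each predictor appears in rule conditions and in the committee models; the raw usage table behind varImp (usage is a documented component of cubist objects) can be inspected with:

## per-predictor usage: % in rule conditions vs. % in linear models
model8$usage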
library(ipred)
set.seed(290875)
model9 <- ipredbagg(simulated[,11], simulated[,1:10])
rfImp9 <- varImp(model9)
rfImp9[order(-rfImp9$Overall), , drop = FALSE]
## Overall
## V4 2.8364141
## V5 2.4308660
## V1 2.1101728
## V2 2.0065794
## V3 1.1900782
## V7 1.0167097
## V10 0.9523974
## V6 0.8543279
## V9 0.8313089
## V8 0.5636161
set.seed(290875)
model10 <- ipredbagg(simulated[,11], simulated[,c(1:10, 12)])
rfImp10 <- varImp(model10)
rfImp10[order(-rfImp10$Overall), , drop = FALSE]
## Overall
## V4 2.6920516
## V5 2.3375201
## V2 2.0569515
## V1 1.7814206
## duplicated1 1.7317937
## V3 0.9495417
## V7 0.9226783
## V10 0.8373111
## V6 0.8058508
## V9 0.6922958
## V8 0.4719232
library(rpart)
set.seed(66)
X1 <- rep(0:1, each = 100)
Y <- X1 + rnorm(200, mean = 0, sd = 5)
set.seed(76)
X2 <- rnorm(200, mean = 0, sd = 3)
df <- data.frame(Y = Y, X1 = X1, X2 = X2)
fit <- rpart(Y ~ X1 + X2, data = df, control = rpart.control(maxdepth = 1))
firstSplit <- data.frame(Predictor = rownames(fit$splits)[1])
## correlations between the predictors and the response for the last simulated data set
cor(X1, Y)
## [1] 0.1063086
cor(X2, Y)
## [1] 0.05820655
## variable importance for the last simulated data set
varImp(fit)
## Overall
## X1 0.01130152
## X2 0.01073563
## simulate 1000 times to record which predictor is chosen for the first split
freqPred <- data.frame()
for (i in 1:1000) {
  set.seed(65 + i)
  X1 <- rep(0:1, each = 100)
  Y <- X1 + rnorm(200, mean = 0, sd = 5)
  set.seed(75 + i)
  X2 <- rnorm(200, mean = 0, sd = 3)
  df <- data.frame(Y = Y, X1 = X1, X2 = X2)
  fit <- rpart(Y ~ X1 + X2, data = df, control = rpart.control(maxdepth = 1))
  firstSplit <- data.frame(Predictor = rownames(fit$splits)[1])
  freqPred <- rbind(freqPred, firstSplit)
}
## frequency of the first-split predictor across the 1000 simulations
table(freqPred)
## freqPred
## X1 X2
## 275 676
Although X1 is the only informative predictor, the noise variable X2 wins the first split far more often, illustrating the selection bias of CART-style trees toward predictors with more distinct values and hence more candidate split points.

The bagging fraction (bag.fraction) sets the proportion of the training set that is randomly sampled to fit each tree in the boosting sequence; this subsampling adds randomness that generally improves accuracy. A larger bagging fraction also concentrates the importance scores on fewer predictors, as the resampling summary below and the importance plots that follow illustrate.
##
## Call:
## resamples.default(x = list(gbmTune11 = gbmTune11, gbmTune99 = gbmTune99))
##
## Models: gbmTune11, gbmTune99
## Number of resamples: 10
## Performance metrics: MAE, RMSE, Rsquared
## Time estimates for: everything, final model fit
##
## Call:
## summary.resamples(object = resamps)
##
## Models: gbmTune11, gbmTune99
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## gbmTune11 0.3938461 0.4376460 0.4842888 0.4758602 0.5110284 0.5408262 0
## gbmTune99 0.5200724 0.5566837 0.5996649 0.6020552 0.6483551 0.6866974 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## gbmTune11 0.5554513 0.6033647 0.6407151 0.6449073 0.6809191 0.7428969 0
## gbmTune99 0.6934028 0.7368130 0.8220993 0.8207102 0.8930624 0.9728628 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## gbmTune11 0.8581825 0.884978 0.9122083 0.9004813 0.9147748 0.9273709 0
## gbmTune99 0.7463732 0.811773 0.8508886 0.8436585 0.8727189 0.9244406 0
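The runs below use the solubility data and a trainControl object that are not defined in this excerpt. A minimal setup consistent with the ten resamples shown above (the 10-fold CV choice is an assumption) would be:

data(solubility)  # from AppliedPredictiveModeling: solTrainXtrans, solTrainY
## assumed resampling control; 10-fold CV matches the 10 resamples shown above
gbmctrl <- trainControl(method = "cv", number = 10)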
## change interaction.depth to 15
gbmGrid <- expand.grid(interaction.depth = 15,
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)
set.seed(100)
gbmTune <- train(solTrainXtrans, solTrainY,
                 method = "gbm",
                 tuneGrid = gbmGrid,
                 trControl = gbmctrl,
                 verbose = FALSE)
gbmGrid1 <- expand.grid(interaction.depth = gbmTune$bestTune$interaction.depth,
                        n.trees = gbmTune$bestTune$n.trees,
                        shrinkage = 0.1,
                        n.minobsinnode = 10)
gbmGrid9 <- expand.grid(interaction.depth = gbmTune$bestTune$interaction.depth,
                        n.trees = gbmTune$bestTune$n.trees,
                        shrinkage = 0.9,
                        n.minobsinnode = 10)
set.seed(100)
gbmTune11 <- train(solTrainXtrans, solTrainY,
                   method = "gbm",
                   tuneGrid = gbmGrid1,
                   trControl = gbmctrl,
                   bag.fraction = 0.1,
                   verbose = FALSE)
gbmImp11 <- varImp(gbmTune11, scale = FALSE)
set.seed(100)
gbmTune99 <- train(solTrainXtrans, solTrainY,
                   method = "gbm",
                   tuneGrid = gbmGrid9,
                   trControl = gbmctrl,
                   bag.fraction = 0.9,
                   verbose = FALSE)
gbmImp99 <- varImp(gbmTune99, scale = FALSE)
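The resampling comparison printed before this code block was produced by pooling these two fits, as the Call lines in that output show:

resamps <- resamples(list(gbmTune11 = gbmTune11, gbmTune99 = gbmTune99))
resamps
summary(resamps)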
plot11 <- plot(gbmImp11, top=25, scales = list(y = list(cex = .95)))
plot99 <- plot(gbmImp99, top=25, scales = list(y = list(cex = .95)))
print(plot11, split=c(1,1,2,1), more=TRUE)
print(plot99, split=c(2,1,2,1))
data(ChemicalManufacturingProcess)
X <- subset(ChemicalManufacturingProcess,select= -Yield)
Y <- subset(ChemicalManufacturingProcess,select="Yield")
set.seed(517)
trainingRows <- createDataPartition(Y$Yield,
                                    p = 0.7,
                                    list = FALSE)
x_train <- X[trainingRows,]
y_train <- Y[trainingRows,]
x_test <- X[-trainingRows,]
y_test <- Y[-trainingRows,]
## pre-process x_train and apply the same transformation to x_train and x_test
pp <- preProcess(x_train,method=c("BoxCox","knnImpute", "corr", "nzv"))
ppx_train <- predict(pp,x_train)
ppx_test <- predict(pp,x_test)
## set up trainControl
set.seed(517)
ctrl <- trainControl(method = "boot", number = 25)
set.seed(614)
rpartGrid <- expand.grid(maxdepth= seq(1,10,by=1))
rpartFit <- train(x = ppx_train, y = y_train,
                  method = "rpart2",
                  metric = "Rsquared",
                  tuneGrid = rpartGrid,
                  trControl = ctrl)
rpartTest <- predict(rpartFit, ppx_test)
rpartResults <- postResample(pred = rpartTest, obs = y_test)
rpartResults$model <- 'rpart'
## Warning in rpartResults$model <- "rpart": Coercing LHS to a list
set.seed(614)
rfGrid <- expand.grid(mtry=seq(2,38,by=3))
rfFit <- train(x = ppx_train, y = y_train,
               method = "rf",
               tuneGrid = rfGrid,
               metric = "Rsquared",
               importance = TRUE,
               trControl = ctrl)
rfTest <- predict(rfFit, ppx_test)
rfResults <- postResample(pred = rfTest, obs = y_test)
rfResults$model <- 'rf'
## Warning in rfResults$model <- "rf": Coercing LHS to a list
set.seed(614)
gbmGrid <- expand.grid(interaction.depth = seq(1, 6, by = 1),
                       n.trees = c(25, 50, 100, 200),
                       shrinkage = c(0.01, 0.05, 0.1, 0.2),
                       n.minobsinnode = 10)
gbmFit <- train(x = ppx_train, y = y_train,
                method = "gbm",
                metric = "Rsquared",
                verbose = FALSE,
                tuneGrid = gbmGrid,
                trControl = ctrl)
gbmTest <- predict(gbmFit, ppx_test)
gbmResults <- postResample(pred = gbmTest, obs = y_test)
gbmResults$model <- 'GBM'
## Warning in gbmResults$model <- "GBM": Coercing LHS to a list
set.seed(614)
cubistGrid <- expand.grid(committees = c(1, 5, 10, 20, 50, 100),
                          neighbors = c(0, 1, 3, 5, 7))
cubistFit <- train(x = ppx_train, y = y_train,
                   method = "cubist",
                   verbose = FALSE,
                   metric = "Rsquared",
                   tuneGrid = cubistGrid,
                   trControl = ctrl)
cubistTest <- predict(cubistFit, ppx_test)
cubistResults <- postResample(pred = cubistTest, obs = y_test)
cubistResults$model = "Cubist"
## Warning in cubistResults$model = "Cubist": Coercing LHS to a list
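The Coercing LHS to a list warnings arise because postResample returns a named numeric vector, and assigning a character $model element forces a coercion. A warning-free sketch (asRow is a hypothetical helper, not part of caret):

## hypothetical helper: one-row data frame of test metrics plus a model label
asRow <- function(pred, obs, name) {
  cbind(as.data.frame(t(postResample(pred, obs))), model = name)
}
asRow(cubistTest, y_test, "Cubist")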
## compare resampling performance across the four models
set.seed(66)
resamps <- resamples(list(rpart = rpartFit,
                          rf = rfFit,
                          gbm = gbmFit,
                          cubist = cubistFit))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: rpart, rf, gbm, cubist
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.9054745 1.0760905 1.1782372 1.1625023 1.2347820 1.463845 0
## rf 0.7740582 0.8830424 0.9414969 0.9513566 0.9864606 1.159635 0
## gbm 0.7552799 0.8450022 0.9025720 0.9158401 0.9399823 1.133064 0
## cubist 0.6233482 0.7618054 0.8242190 0.8332911 0.9285006 1.039944 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 1.1780149 1.4101903 1.488830 1.497973 1.567249 1.901338 0
## rf 1.0418982 1.1410412 1.237875 1.238107 1.344908 1.475146 0
## gbm 0.9985069 1.1119110 1.194964 1.204948 1.310769 1.432115 0
## cubist 0.7768187 0.9970293 1.102398 1.098788 1.201172 1.408280 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.1619850 0.3148644 0.3554588 0.3742278 0.4284422 0.6511646 0
## rf 0.3654442 0.5056151 0.5554625 0.5674966 0.6177079 0.7530056 0
## gbm 0.4068387 0.5200218 0.5729414 0.5701034 0.6308198 0.7555806 0
## cubist 0.3723552 0.5719488 0.6237306 0.6363099 0.7062009 0.8344895 0
trellis.par.set(caretTheme())
dotplot(resamps, metric = "Rsquared")
Test_results <- rbind(as.data.frame(rpartResults),
                      as.data.frame(rfResults),
                      as.data.frame(gbmResults),
                      as.data.frame(cubistResults))
Test_results <- Test_results[order(Test_results[, 1]),]
Test_results
## RMSE Rsquared MAE model
## 4 0.9344845 0.7645525 0.6599864 Cubist
## 2 1.1288476 0.6810362 0.8608642 rf
## 3 1.1673448 0.6261712 0.9069216 GBM
## 1 1.4110512 0.4806575 1.0839033 rpart
cubistImp <- varImp(cubistFit, scale = FALSE)
plot(cubistImp, top = 10, scales = list(y = list(cex = 0.8)))
library(partykit)
plot(as.party(rpartFit$finalModel),gp=gpar(fontsize=11))
References:
https://github.com/topepo/APM_Exercises/blob/master/Ch_08.Rnw