Use the Boston data:
#install.packages("ISLR")
library(ISLR)
library(MASS)
data(Boston)
The randomForest package can be used for bagging and boosting:
# package to be used for random forests and bagging
#install.packages("randomForest")
library(randomForest)
set.seed(1)
train<-sample(1:dim(Boston)[1], floor(dim(Boston)[1]/2))
boston.test<-Boston[-train, "medv"]
# bagging --> m=p
# uses by default 500 trees
bag.boston<-randomForest(medv~., data=Boston, subset = train,
mtry=13, importance=TRUE)
# test error
yhat.bag<-predict(bag.boston, newdata=Boston[-train,])
plot(yhat.bag, boston.test)
abline(0,1)
mean((yhat.bag-boston.test)^2)
## [1] 23.4579
# decrease number of trees
# ntree = 25
bag.boston2<-randomForest(medv~., data=Boston, subset = train,
mtry=13, ntree=25, importance=TRUE)
yhat.bag2<-predict(bag.boston2, newdata=Boston[-train,])
mean((yhat.bag2-boston.test)^2)
## [1] 22.79211
# change number of predictors used in the random forest
# default is p/3
# mtry = 6
set.seed(1)
rf.boston<-randomForest(medv~., data=Boston, subset=train,
mtry=6, importance=TRUE)
yhat.rf<-predict(rf.boston, newdata=Boston[-train,])
mean((yhat.rf-boston.test)^2)
## [1] 19.62021
# variable importance
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 16.697017 1076.08786
## zn 3.625784 88.35342
## indus 4.968621 609.53356
## chas 1.061432 52.21793
## nox 13.518179 709.87339
## rm 32.343305 7857.65451
## age 13.272498 612.21424
## dis 9.032477 714.94674
## rad 2.878434 95.80598
## tax 9.118801 364.92479
## ptratio 8.467062 823.93341
## black 7.579482 275.62272
## lstat 27.129817 6027.63740
varImpPlot(rf.boston)
You will need a new package:
# BOOSTING
#install.packages("gbm")
library(gbm)
set.seed(1)
boost.boston<-gbm(medv~., data=Boston[train, ], distribution="gaussian",
n.trees=5000, interaction.depth = 4)
# relative influence plot
summary(boost.boston)
## var rel.inf
## rm rm 43.9919329
## lstat lstat 33.1216941
## crim crim 4.2604167
## dis dis 4.0111090
## nox nox 3.4353017
## black black 2.8267554
## age age 2.6113938
## ptratio ptratio 2.5403035
## tax tax 1.4565654
## indus indus 0.8008740
## rad rad 0.6546400
## zn zn 0.1446149
## chas chas 0.1443986
# relative dependence plots
# marginal effect of the selected var
par(mfrow=c(1,2))
plot(boost.boston, i="rm")
plot(boost.boston, i="lstat")
# test error of boost
# (lambda) shrinkage = 0.001
yhat.boost<-predict(boost.boston, newdata=Boston[-train,],
n.trees = 5000)
mean((yhat.boost-boston.test)^2)
## [1] 18.84709
# change the learning parameter
# (lambda) shrinkage = 0.1
boost.boston2<-gbm(medv~., data=Boston[train, ], distribution="gaussian",
n.trees=5000, interaction.depth = 4,
shrinkage=0.1, verbose = F)
yhat.boost2<-predict(boost.boston2, newdata=Boston[-train,],
n.trees = 5000)
mean((yhat.boost2-boston.test)^2)
## [1] 18.18255