library(MASS)
attach(Boston)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tree)
set.seed(1)
train<-sample(1:nrow(Boston), nrow(Boston)/2, replace=FALSE)
test<-(-train)
(bag.boston<-randomForest(medv~., data=Boston, subset=train, mtry=13, importance=TRUE))
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 11.56246
## % Var explained: 86.07
(tree.boston<-tree(medv~., data=Boston))
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 506 42720.0 22.53
## 2) rm < 6.941 430 17320.0 19.93
## 4) lstat < 14.4 255 6632.0 23.35
## 8) dis < 1.38485 5 390.7 45.58 *
## 9) dis > 1.38485 250 3721.0 22.91
## 18) rm < 6.543 195 1636.0 21.63 *
## 19) rm > 6.543 55 643.2 27.43 *
## 5) lstat > 14.4 175 3373.0 14.96
## 10) crim < 6.99237 101 1151.0 17.14 *
## 11) crim > 6.99237 74 1086.0 11.98 *
## 3) rm > 6.941 76 6059.0 37.24
## 6) rm < 7.437 46 1900.0 32.11
## 12) lstat < 11.455 41 844.2 33.50 *
## 13) lstat > 11.455 5 329.8 20.74 *
## 7) rm > 7.437 30 1099.0 45.10
## 14) ptratio < 17.9 25 340.7 46.82 *
## 15) ptratio > 17.9 5 312.7 36.48 *
- mtry=13 : all 13 predictors are considered at each split of the tree (bagging); see the OOB-error sketch below.
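The OOB mean squared error printed above is also stored per-tree in the fitted object; a minimal sketch of inspecting it, using the mse component and plot method that randomForest provides for regression fits:

tail(bag.boston$mse, 1)   # final OOB MSE, the "Mean of squared residuals" above
plot(bag.boston)          # OOB error as a function of the number of trees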
yhat.bag<-predict(bag.boston, newdata=Boston[-train,])
yhat.deci<-predict(tree.boston, newdata=Boston[-train,])
boston.test<-Boston[test,"medv"]
plot(yhat.bag, boston.test)
abline(0,1)

bag.MSE<-mean((yhat.bag-boston.test)^2)
tree.MSE<-mean((yhat.deci-boston.test)^2)
bag.MSE
## [1] 17.40305
tree.MSE
## [1] 14.11123
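- Caveat : tree.boston was grown on the full data set (all 506 observations, no subset=train), so tree.MSE is evaluated partly on the tree’s own training data and is optimistically low. A minimal sketch of the fair comparison, refitting the single tree on the training half only (output not shown; the numbers will differ from those above):

tree.train<-tree(medv~., data=Boston, subset=train)
yhat.tree<-predict(tree.train, newdata=Boston[-train,])
mean((yhat.tree-boston.test)^2)   # honest single-tree test MSE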
bag25.boston<-randomForest(medv~., data=Boston, subset=train, mtry=13, ntree=25)
yhat.bag25<-predict(bag25.boston, newdata=Boston[-train, ])
mean((yhat.bag25-boston.test)^2)
## [1] 18.07934
- The ‘ntree’ argument controls how many trees are grown (25 above versus the default 500); with mtry=13 this is still bagging, just with fewer trees.
- To grow a true random forest, we use a smaller ‘mtry’.
- By default, ‘randomForest’ uses mtry = p/3 variables at each split for regression and mtry = sqrt(p) for classification.
- Here we use mtry = 6, between the regression default floor(13/3) = 4 and the full p = 13; a sketch comparing several mtry values follows the fit below.
set.seed(1)
ranfor.boston<-randomForest(medv~., data=Boston, subset=train, mtry=6, importance=TRUE)
yhat.ranfor<-predict(ranfor.boston, newdata=Boston[-train,])
mean((yhat.ranfor-boston.test)^2)
## [1] 15.70374
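To see how the choice of mtry matters, a small sketch comparing the test MSE over a few values (mtry.grid and test.mse are illustrative names, not part of the original lab; output not shown):

set.seed(1)
mtry.grid<-c(4, 6, 13)   # regression default floor(p/3), the value used above, and full bagging
test.mse<-sapply(mtry.grid, function(m){
  fit<-randomForest(medv~., data=Boston, subset=train, mtry=m)
  mean((predict(fit, newdata=Boston[-train,])-boston.test)^2)
})
setNames(test.mse, mtry.grid)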
importance(ranfor.boston)
## %IncMSE IncNodePurity
## crim 14.7097975 1096.00475
## zn 2.8944704 46.44833
## indus 9.9276775 1187.59402
## chas 0.6570508 79.36860
## nox 14.1701021 993.85271
## rm 33.4621783 7687.23174
## age 10.0383578 483.39384
## dis 12.3564793 781.06893
## rad 3.4832610 89.89916
## tax 9.9176462 453.64507
## ptratio 11.5770813 797.94518
## black 5.6896218 229.60868
## lstat 29.9034437 6613.21033
varImpPlot(ranfor.boston)

- Use the ‘importance’ function to view the importance of each variable.
- %IncMSE : mean decrease in prediction accuracy on the out-of-bag samples when that variable’s values are permuted.
- IncNodePurity : total increase in node purity from splits on that variable, averaged over all trees.
- In regression trees, node impurity is measured by the training RSS; in classification trees, by the deviance. The permutation idea is sketched below.
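A hand-rolled illustration of the permutation idea behind %IncMSE, using the held-out test set instead of the out-of-bag samples that randomForest actually uses (so the number will not match the table above):

set.seed(1)
perm.test<-Boston[-train,]
perm.test$lstat<-sample(perm.test$lstat)   # scramble one predictor
mse.base<-mean((predict(ranfor.boston, Boston[-train,])-boston.test)^2)
mse.perm<-mean((predict(ranfor.boston, perm.test)-boston.test)^2)
100*(mse.perm-mse.base)/mse.base           # % increase in MSE when lstat is permuted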
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
set.seed(1)
boost.boston<-gbm(medv~., data=Boston[train, ], distribution="gaussian", n.trees=5000, interaction.depth=4)
summary(boost.boston)

## var rel.inf
## lstat lstat 41.75912842
## rm rm 41.74563760
## dis dis 3.83358434
## nox nox 2.96171654
## ptratio ptratio 2.84991557
## crim crim 2.40553037
## tax tax 1.60969837
## age age 1.13246368
## indus indus 0.77170407
## black black 0.62031932
## rad rad 0.18412934
## chas chas 0.08263796
## zn zn 0.04353442
- summary returns the relative influence of each variable and plots it; see the table-only variant below.
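If only the table is needed, summary.gbm can skip the barplot via its plotit argument (a sketch; rel.inf is an illustrative name):

rel.inf<-summary(boost.boston, plotit=FALSE)   # data frame of var, rel.inf
head(rel.inf, 3)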
par(mfrow=c(1,2))
plot(boost.boston, i="rm")
plot(boost.boston, i="lstat")

- These are partial dependence plots: the marginal effect of rm and lstat on medv after integrating out the other predictors; a by-hand version is sketched below.
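A minimal sketch of what these plots compute, assuming the boost.boston fit above: sweep rm over a grid, hold the other predictors at their observed values, and average the predictions (rm.grid and pd are illustrative names):

par(mfrow=c(1,1))                 # reset the 1x2 layout used above
rm.grid<-seq(min(Boston$rm), max(Boston$rm), length.out=50)
pd<-sapply(rm.grid, function(r){
  d<-Boston[train,]
  d$rm<-r
  mean(predict(boost.boston, newdata=d, n.trees=5000))
})
plot(rm.grid, pd, type="l", xlab="rm", ylab="partial dependence of medv")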
yhat.boost<-predict(boost.boston, newdata=Boston[test,], n.trees=5000)
mean((yhat.boost-boston.test)^2)
## [1] 18.45072
boost.boston<-gbm(medv~., data=Boston[train, ],distribution="gaussian", n.trees=5000, interaction.depth=4, shrinkage=0.2)
yhat.boost<-predict(boost.boston, newdata=Boston[test,], n.trees=5000)
mean((yhat.boost-boston.test)^2)
## [1] 16.47216
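- Refitting with shrinkage (learning rate) 0.2 instead of the default lowers the test MSE from 18.45 to 16.47.

To pick the number of trees less arbitrarily, gbm’s built-in cross-validation can be used (cv.folds and gbm.perf are part of the gbm package; a sketch, output not shown):

set.seed(1)
boost.cv<-gbm(medv~., data=Boston[train,], distribution="gaussian",
              n.trees=5000, interaction.depth=4, shrinkage=0.2, cv.folds=5)
best.iter<-gbm.perf(boost.cv, method="cv")   # CV-optimal number of trees
yhat.cv<-predict(boost.cv, newdata=Boston[test,], n.trees=best.iter)
mean((yhat.cv-boston.test)^2)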