We use the Boston housing dataset from the MASS package.
A random forest builds many bushy (deep) trees and reduces variance by averaging them: the more de-correlated trees we average, the smaller the variance of the ensemble.
cat("\014")
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
require(MASS)
## Loading required package: MASS
set.seed(11)
dim(Boston)
## [1] 506 14
train=sample(1:nrow(Boston),300) # 300 rows for training, the remaining 206 for testing
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
We use medv (median home value, in $1000s) as the response variable.
rf.Boston = randomForest(medv~.,data=Boston,subset=train) # fitting on training data
rf.Boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.8932
## % Var explained: 86.53
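The 4 variables tried at each split is randomForest's default mtry for regression, floor(p/3) with p = 13 predictors here; a quick check:
floor((ncol(Boston) - 1)/3) # 13 predictors -> default mtry of 4
## [1] 4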
The mean of squared residuals and % variance explained reported above are computed from the out-of-bag (OOB) samples.
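As a sanity check, randomForest stores the OOB predictions on the fitted object, so the reported figure can be reproduced by hand:
mean((Boston$medv[train] - rf.Boston$predicted)^2) # OOB MSE; should match the summary above
Next we vary mtry from 1 to 13 (using all 13 predictors at every split is just bagging) and record both the OOB and the test error: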
oob.err = double(13)
test.err = double(13)
for(mtry in 1:13){
  fit = randomForest(medv~.,data=Boston,subset=train,mtry=mtry,ntree=400)
  oob.err[mtry] = fit$mse[400] # OOB MSE after all 400 trees
  pred = predict(fit,Boston[-train,])
  test.err[mtry] = with(Boston[-train,],mean((medv-pred)^2)) # test-set MSE (square before averaging)
  cat(mtry," _ ")
}
## 1 _ 2 _ 3 _ 4 _ 5 _ 6 _ 7 _ 8 _ 9 _ 10 _ 11 _ 12 _ 13 _
matplot(1:mtry,cbind(test.err,oob.err),pch=19,col=c("red","blue"),type="b",ylab="Mean Squared Error",xlab="mtry")
legend("topright",legend=c("Test","OOB"),pch=19,col=c("red","blue")) # label order matches the cbind above
require(gbm)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
boost.Boston = gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=10000,shrinkage=0.01,interaction.depth=4) # slow learning rate, depth-4 interactions
summary(boost.Boston)
## var rel.inf
## lstat lstat 40.38785380
## rm rm 28.58605517
## dis dis 7.53430222
## nox nox 5.37502089
## crim crim 5.31436668
## age age 3.09625442
## black black 2.98131112
## ptratio ptratio 2.83025417
## tax tax 1.61257569
## indus indus 1.45168676
## rad rad 0.58268277
## zn zn 0.15764005
## chas chas 0.08999627
plot(boost.Boston,i="lstat")
plot(boost.Boston,i="rm")
Select the number of trees by evaluating test error over a grid of n.trees values (gbm's built-in cross-validation is sketched at the end):
n.trees = seq(from=100,to=10000,by=100)
predmat = predict(boost.Boston,newdata=Boston[-train,],n.trees=n.trees) # one column of predictions per grid value
dim(predmat)
## [1] 206 100
berr = with(Boston[-train,],apply((predmat-medv)^2,2,mean)) # test MSE for each value of n.trees (medv recycles down columns)
plot(n.trees,berr,ylab="MSE",xlab="# trees",main="Boosting test error")
abline(h=min(test.err),col="red") # best random forest test error, for reference
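The grid value minimizing the test error is n.trees[which.min(berr)]. If genuine cross-validation is preferred for choosing the number of trees, gbm supports it directly; a minimal sketch (the 5 folds are an arbitrary choice):
boost.cv = gbm(medv~.,data=Boston[train,],distribution="gaussian",
               n.trees=10000,shrinkage=0.01,interaction.depth=4,cv.folds=5)
best.iter = gbm.perf(boost.cv,method="cv") # CV-optimal number of trees (also plots the error curves)
best.iter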