# Boston Housing Data
library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506  14
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
library(rpart) #classification and regression trees
library(partykit) #treeplots
## Loading required package: grid
library(randomForest) #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) #gradient boosting
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(caret) #tune hyper-parameters
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(123) #fix the random seed for reproducibility
ind = sample(2, nrow(BostonHousing), replace=TRUE, prob=c(0.5, 0.5)) #assign each row to group 1 or 2 with equal probability
train = BostonHousing[ind==1,] #the training data set
test = BostonHousing[ind==2,] #the test data set
str(test) #confirm the split
## 'data.frame':    271 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02729 0.02985 0.17004 0.11747 ...
##  $ zn     : num  18 0 0 12.5 12.5 0 0 0 0 0 ...
##  $ indus  : num  2.31 7.07 2.18 7.87 7.87 8.14 8.14 8.14 8.14 8.14 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.458 0.524 0.524 0.538 0.538 0.538 0.538 0.538 ...
##  $ rm     : num  6.58 7.18 6.43 6 6.01 ...
##  $ age    : num  65.2 61.1 58.7 85.9 82.9 84.5 29.3 81.7 36.6 94.4 ...
##  $ dis    : num  4.09 4.97 6.06 6.59 6.23 ...
##  $ rad    : num  1 2 3 5 5 4 4 4 4 4 ...
##  $ tax    : num  296 242 222 311 311 307 307 307 307 307 ...
##  $ ptratio: num  15.3 17.8 18.7 15.2 15.2 21 21 21 21 21 ...
##  $ b      : num  397 393 394 387 397 ...
##  $ lstat  : num  4.98 4.03 5.21 17.1 13.27 ...
##  $ medv   : num  24 34.7 28.7 18.9 18.9 18.2 23.1 17.5 20.2 18.4 ...
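
# A sketch of an alternative split using caret (loaded above): createDataPartition()
# draws an exact 50/50 split and, for a numeric outcome, stratifies on quantile
# groups of medv. The names idx, train2, and test2 are ours, not the original's.
set.seed(123)
idx = createDataPartition(BostonHousing$medv, p=0.5, list=FALSE)
train2 = BostonHousing[idx,]
test2 = BostonHousing[-idx,]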
set.seed(123)
rf.bos = randomForest(medv~., data=train) #fit a regression forest with the default 500 trees
print(rf.bos)
## 
## Call:
##  randomForest(formula = medv ~ ., data = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 12.14877
##                     % Var explained: 83.35
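
# Why 4 variables are tried at each split: randomForest's regression default
# is mtry = floor(p/3), and this data set has p = 13 predictors.
floor((ncol(train) - 1) / 3) #13 predictors -> 4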
plot(rf.bos) #OOB MSE as a function of the number of trees

which.min(rf.bos$mse) #tree count with the lowest OOB MSE
## [1] 56
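
# rf.bos$mse stores the running OOB MSE after each successive tree, so
# which.min() above picks where that curve bottoms out. A sketch to read off
# the minimum itself (the exact value depends on the seed):
min(rf.bos$mse)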
set.seed(123)
rf.bos.2 = randomForest(medv~., data=train, ntree=56) #refit with the tree count that minimized OOB MSE
print(rf.bos.2)
## 
## Call:
##  randomForest(formula = medv ~ ., data = train, ntree = 56) 
##                Type of random forest: regression
##                      Number of trees: 56
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.29431
##                     % Var explained: 84.52
varImpPlot(rf.bos.2, main="Variable Importance Plot - housing price/median value prediction")

importance(rf.bos.2)
##         IncNodePurity
## crim        722.81418
## zn          129.90428
## indus       925.02509
## chas        251.29222
## nox        1282.80723
## rm         4864.97979
## age         746.77640
## dis        1069.01165
## rad          97.65133
## tax         483.52550
## ptratio    1224.88331
## b           399.50504
## lstat      4739.85812
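
# IncNodePurity is reported because the forest was fit with the default
# importance = FALSE; it tends to favor predictors offering many split points.
# A sketch of permutation importance (%IncMSE) instead, which requires
# importance = TRUE at fit time; rf.imp is our own name:
set.seed(123)
rf.imp = randomForest(medv~., data=train, ntree=56, importance=TRUE)
importance(rf.imp, type=1) #type = 1 -> %IncMSE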
rf.bos.test = predict(rf.bos.2, newdata=test) #predict on the held-out test set
rf.resid = rf.bos.test - test$medv #residuals: predicted minus observed medv
mean(rf.resid^2) #test-set MSE
## [1] 15.83453
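
# caret was loaded above to tune hyper-parameters but has not been used yet.
# A sketch of tuning mtry by 5-fold cross-validation; ctrl, grid, and rf.tuned
# are our own names, and the grid values are illustrative:
set.seed(123)
ctrl = trainControl(method="cv", number=5)
grid = expand.grid(mtry=c(2, 4, 6, 8, 13))
rf.tuned = train(medv~., data=train, method="rf",
                 trControl=ctrl, tuneGrid=grid, ntree=500)
rf.tuned$bestTune #mtry value with the lowest cross-validated RMSE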