# Boston Housing Data
library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506 14
head(BostonHousing)
## crim zn indus chas nox rm age dis rad tax ptratio b
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
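medv (the median home value in $1000s) is the response we will predict. As a quick sanity check of its range before modeling, for example:

summary(BostonHousing$medv)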
library(rpart) #classification and regression trees
library(partykit) #treeplots
## Loading required package: grid
library(randomForest) #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) #gradient boosting
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(caret) #tune hyper-parameters
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(123) #set the seed for reproducibility
ind = sample(2, nrow(BostonHousing), replace=TRUE, prob=c(0.5, 0.5)) #assign each row to group 1 or 2 with equal probability
train = BostonHousing[ind==1,] #the training data set
test = BostonHousing[ind==2,] #the test data set
str(test) #confirm the split worked
## 'data.frame': 271 obs. of 14 variables:
## $ crim : num 0.00632 0.02729 0.02985 0.17004 0.11747 ...
## $ zn : num 18 0 0 12.5 12.5 0 0 0 0 0 ...
## $ indus : num 2.31 7.07 2.18 7.87 7.87 8.14 8.14 8.14 8.14 8.14 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.458 0.524 0.524 0.538 0.538 0.538 0.538 0.538 ...
## $ rm : num 6.58 7.18 6.43 6 6.01 ...
## $ age : num 65.2 61.1 58.7 85.9 82.9 84.5 29.3 81.7 36.6 94.4 ...
## $ dis : num 4.09 4.97 6.06 6.59 6.23 ...
## $ rad : num 1 2 3 5 5 4 4 4 4 4 ...
## $ tax : num 296 242 222 311 311 307 307 307 307 307 ...
## $ ptratio: num 15.3 17.8 18.7 15.2 15.2 21 21 21 21 21 ...
## $ b : num 397 393 394 387 397 ...
## $ lstat : num 4.98 4.03 5.21 17.1 13.27 ...
## $ medv : num 24 34.7 28.7 18.9 18.9 18.2 23.1 17.5 20.2 18.4 ...
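As an aside, caret (loaded above) also offers a stratified split via createDataPartition(), which balances the response across the two sets; a minimal sketch, where part.idx, train.strat, and test.strat are illustrative names rather than part of the analysis:

set.seed(123)
part.idx = createDataPartition(BostonHousing$medv, p=0.5, list=FALSE) #stratify on the response
train.strat = BostonHousing[part.idx, ] #stratified training set
test.strat = BostonHousing[-part.idx, ] #stratified test set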
set.seed(123)
rf.bos = randomForest(medv~., data=train) #defaults: ntree=500, mtry=floor(13/3)=4 for regression
print(rf.bos)
##
## Call:
## randomForest(formula = medv ~ ., data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 12.14877
## % Var explained: 83.35
plot(rf.bos)
(Figure: OOB mean squared error versus number of trees for rf.bos.)
which.min(rf.bos$mse) #number of trees with the lowest OOB MSE
## [1] 56
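rf.bos$mse stores the OOB MSE after each successive tree is added, so indexing it at the minimum shows the error achieved at that tree count:

rf.bos$mse[which.min(rf.bos$mse)] #OOB MSE at the best tree count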
set.seed(123)
rf.bos.2 = randomForest(medv~., data=train, ntree=56) #refit with the OOB-optimal number of trees
print(rf.bos.2)
##
## Call:
## randomForest(formula = medv ~ ., data = train, ntree = 56)
## Type of random forest: regression
## Number of trees: 56
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.29431
## % Var explained: 84.52
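caret was loaded above to tune hyper-parameters; a minimal sketch of cross-validated mtry selection for this model (the grid values and the names ctrl and rf.tuned are illustrative, not part of the original analysis):

set.seed(123)
ctrl = trainControl(method="cv", number=5) #5-fold cross-validation
rf.tuned = train(medv~., data=train, method="rf", ntree=56,
                 trControl=ctrl, tuneGrid=expand.grid(mtry=2:6))
rf.tuned$bestTune #mtry value with the lowest cross-validated RMSE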
varImpPlot(rf.bos.2, main="Variable Importance Plot - housing price/median value prediction")
(Figure: variable importance plot for rf.bos.2; rm and lstat dominate.)
importance(rf.bos.2)
## IncNodePurity
## crim 722.81418
## zn 129.90428
## indus 925.02509
## chas 251.29222
## nox 1282.80723
## rm 4864.97979
## age 746.77640
## dis 1069.01165
## rad 97.65133
## tax 483.52550
## ptratio 1224.88331
## b 399.50504
## lstat 4739.85812
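IncNodePurity is the only measure shown because the model was fit with the default importance=FALSE; refitting with importance=TRUE also yields the permutation-based %IncMSE, which is often preferred for regression. A sketch, where rf.bos.perm is an illustrative name:

set.seed(123)
rf.bos.perm = randomForest(medv~., data=train, ntree=56, importance=TRUE)
importance(rf.bos.perm, type=1) #%IncMSE, the permutation importance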
rf.bos.test = predict(rf.bos.2, newdata=test)
rf.resid = rf.bos.test - test$medv #residuals: predicted minus observed
mean(rf.resid^2) #test set MSE
## [1] 15.83453
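For an error on the same scale as medv (thousands of dollars), take the square root of the test MSE:

sqrt(mean(rf.resid^2)) #test set RMSE, in the same units as medv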