# Boston Housing Data
library(mlbench)
data(BostonHousing)
dim(BostonHousing)
## [1] 506  14
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
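The data frame contains 506 census tracts and 14 variables. The response is medv, the median value of owner-occupied homes in $1000s; chas is a factor indicating whether a tract bounds the Charles River, and the remaining columns serve as predictors.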
library(rpart) #classification and regression trees
library(partykit) #treeplots
## Loading required package: grid
library(randomForest) #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) #gradient boosting
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(caret) #tune hyper-parameters
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(123) #fix the random seed so the split is reproducible
ind = sample(2, nrow(BostonHousing), replace=TRUE, prob=c(0.7, 0.3))
train = BostonHousing[ind==1,] #the training data set
test = BostonHousing[ind==2,] #the test data set
str(test) #confirm the split worked
## 'data.frame':    143 obs. of  14 variables:
##  $ crim   : num  0.0273 0.0324 0.069 0.1446 0.2249 ...
##  $ zn     : num  0 0 0 12.5 12.5 0 0 0 0 0 ...
##  $ indus  : num  7.07 2.18 2.18 7.87 7.87 8.14 8.14 8.14 8.14 8.14 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.469 0.458 0.458 0.524 0.524 0.538 0.538 0.538 0.538 0.538 ...
##  $ rm     : num  6.42 7 7.15 6.17 6.38 ...
##  $ age    : num  78.9 45.8 54.2 96.1 94.3 56.5 69.5 98.1 100 85.7 ...
##  $ dis    : num  4.97 6.06 6.06 5.95 6.35 ...
##  $ rad    : num  2 3 3 5 5 4 4 4 4 4 ...
##  $ tax    : num  242 222 222 311 311 307 307 307 307 307 ...
##  $ ptratio: num  17.8 18.7 18.7 15.2 15.2 21 21 21 21 21 ...
##  $ b      : num  397 395 397 397 393 ...
##  $ lstat  : num  9.14 2.94 5.33 19.15 20.45 ...
##  $ medv   : num  21.6 33.4 36.2 27.1 15 19.9 18.2 13.6 14.5 13.9 ...
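Note that sample() with prob=c(0.7, 0.3) yields an approximate 70/30 split (here 363 training and 143 test rows), not an exact one. Since caret is already loaded, an exact split stratified on the response could be drawn instead; a minimal sketch for comparison (not used for the results below):

idx = createDataPartition(BostonHousing$medv, p = 0.7, list = FALSE) #exact 70% sample
train.alt = BostonHousing[idx, ] #alternative training set
test.alt = BostonHousing[-idx, ] #alternative test set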
tree.bos = rpart(medv~., data=train) #regression tree of medv on all predictors
print(tree.bos$cptable) #cross-validation results for each candidate subtree
##           CP nsplit rel error    xerror       xstd
## 1 0.48447752      0 1.0000000 1.0038649 0.09653179
## 2 0.15413728      1 0.5155225 0.6433070 0.07168069
## 3 0.07010783      2 0.3613852 0.4485012 0.05879828
## 4 0.03549787      3 0.2912774 0.3884635 0.05328858
## 5 0.02513162      4 0.2557795 0.3384210 0.05115412
## 6 0.01230944      5 0.2306479 0.3035548 0.04926311
## 7 0.01105293      6 0.2183384 0.2919353 0.04724678
## 8 0.01000000      7 0.2072855 0.2853874 0.04697146
plotcp(tree.bos) #plot cross-validated error against tree size

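The table and plot also support the one-standard-error rule, which favors the smallest tree whose cross-validated error (xerror) falls within one standard error (xstd) of the minimum; a short sketch of that selection, for comparison:

cpt = tree.bos$cptable
best = which.min(cpt[, "xerror"]) #row with the lowest cross-validated error
cp.1se = cpt[which(cpt[, "xerror"] <= cpt[best, "xerror"] + cpt[best, "xstd"])[1], "CP"]

Here the 1-SE rule would pick the 5-split tree (CP of about 0.0123) rather than the 7-split minimum-xerror tree; the code below keeps the minimum-xerror choice.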
cp = tree.bos$cptable[which.min(tree.bos$cptable[, "xerror"]), "CP"] #CP of the subtree with the lowest cross-validated error
prune.tree.bos = prune(tree.bos, cp = cp)
plot(as.party(tree.bos)) #plot the full, unpruned tree

plot(as.party(prune.tree.bos)) #plot the pruned tree

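Before scoring the test set, the fitted tree's variable.importance component (stored on every rpart fit with splits) shows which predictors drive the tree:

round(tree.bos$variable.importance, 2) #importance score for each predictor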
party.bos.test = predict(prune.tree.bos, newdata=test)
rpart.resid = party.bos.test - test$medv #calculate residuals
mean(rpart.resid^2) #calculate the test MSE
## [1] 20.57714
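Since medv is measured in $1000s, taking the square root of the MSE puts the error back on the original scale (an RMSE of roughly 4.5, i.e. about $4,500):

sqrt(mean(rpart.resid^2)) #test RMSE, in the same units as medv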