ISLR: Chapter 8 Decision Trees

Sameer Mathur

Boosting

About the Dataset

Here we use the Boston dataset, which is a built-in dataset in the MASS package.

# load the packages used in this section
library(tree)
library(MASS)
# Boston becomes available by name once MASS is loaded; it is always
# referenced explicitly (Boston[...], data = Boston), so attach() is not
# needed and would only risk masking other objects on the search path
# dimensions of the dataset (rows, columns)
dim(Boston)

[1] 506  14

# list the variable (column) names of the data frame
names(Boston)

 [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
 [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

# compact overview: type and first few values of every column
utils::str(Boston)

'data.frame':   506 obs. of  14 variables:
 $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
 $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
 $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
 $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
 $ rm     : num  6.58 6.42 7.18 7 7.15 ...
 $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
 $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
 $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
 $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
 $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
 $ black  : num  397 397 393 395 397 ...
 $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
 $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

Creating the Training Set

# fix the RNG seed first so the random train/test split is reproducible
set.seed(1)
# draw half of the row indices (without replacement) for the training set;
# seq_len() is safe for any row count and %/% keeps the sample size whole
train <- sample(seq_len(nrow(Boston)), nrow(Boston) %/% 2)
# printing the length of the training set
length(train)

[1] 253

Fitting the Model Using the gbm Package

library(gbm)
# reset the RNG right before fitting
set.seed(1)
# boosted regression trees: medv modelled on all other columns, fitted on
# the training rows only, 5000 trees with interaction depth 4
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)
# relative influence of each predictor (prints the table and draws the plot)
summary(boost.boston)

plot of chunk unnamed-chunk-4

            var    rel.inf
lstat     lstat 41.8834533
rm           rm 21.5803777
dis         dis 10.8908422
crim       crim  6.2776541
black     black  4.8532530
age         age  4.0709990
nox         nox  3.7580345
ptratio ptratio  2.3496203
tax         tax  1.7941645
indus     indus  0.9525328
rad         rad  0.8948436
zn           zn  0.3880341
chas       chas  0.3061908

# arrange following plots on the current device in 1 row and 2 columns
par(mfrow = c(1, 2))
# marginal effect of rm on the fitted response; the argument name i.var is
# spelled out instead of relying on partial matching of `i`
plot(boost.boston, i.var = "rm")

plot of chunk unnamed-chunk-5

# marginal effect of lstat on the fitted response; the argument name i.var is
# spelled out instead of relying on partial matching of `i`
plot(boost.boston, i.var = "lstat")

plot of chunk unnamed-chunk-6

# evaluate the boosted model on the rows that were held out of training
# (the seed is reset here, so later random steps start from this RNG state)
set.seed(1)
# observed response on the test rows
boston.test <- Boston[-train, "medv"]
# predictions from the full 5000-tree ensemble
yhat.boost <- predict(
  boost.boston,
  newdata = Boston[-train, ],
  n.trees = 5000
)
# test-set mean squared error
mean((yhat.boost - boston.test)^2)

[1] 12.7053

# refit with an explicit shrinkage of 0.2 (the first fit used the gbm
# default) and with progress output switched off
# seed explicitly so this fit does not depend on RNG state left by earlier code
set.seed(1)
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.2,
  verbose = FALSE # spelled out: F is an ordinary variable and can be reassigned
)
# predictions on the held-out rows using all 5000 trees
yhat.boost <- predict(
  boost.boston,
  newdata = Boston[-train, ],
  n.trees = 5000
)
# test-set mean squared error
mean((yhat.boost - boston.test)^2)

[1] 15.06824