Sameer Mathur
Boosting
Here we use the Boston dataset, which is built into the MASS package.
# Load the packages used in this walkthrough.
# NOTE(review): no function from `tree` is called in the visible code - confirm it is still needed.
library(tree)
library(MASS)
# Put the columns of the Boston data frame on the search path.
# NOTE(review): every visible use below refers to Boston explicitly, so this
# attach() looks unnecessary - confirm before removing.
attach(Boston)
# Dimensions of the dataset (rows, columns)
dim(Boston)
[1] 506 14
# List the variable names of the Boston data frame
names(Boston)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# Compact overview of the data frame: each column's type and its first few values
str(Boston)
'data.frame': 506 obs. of 14 variables:
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : int 0 0 0 0 0 0 0 0 0 0 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : num 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ black : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Hold out half of the rows for testing.
# Seed the RNG *before* sampling so the same train/test split (and therefore
# the same downstream fit and error) is drawn on every run.
set.seed(1)
# seq_len() is safe for an empty data frame; %/% keeps the sample size an
# integer even when the row count is odd.
train <- sample(seq_len(nrow(Boston)), nrow(Boston) %/% 2)
# Number of rows assigned to the training set
length(train)
[1] 253
# Boosted regression trees come from the gbm package.
library(gbm)

# Seed the RNG right before fitting so the boosted model is repeatable.
set.seed(1)

# Regress medv on every other column, using only the training rows:
# "gaussian" distribution, 5000 trees, interaction depth 4.
boost.boston <- gbm(
  formula = medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4
)

# Relative influence of each predictor in the fitted model.
summary(boost.boston)
var rel.inf
lstat lstat 41.8834533
rm rm 21.5803777
dis dis 10.8908422
crim crim 6.2776541
black black 4.8532530
age age 4.0709990
nox nox 3.7580345
ptratio ptratio 2.3496203
tax tax 1.7941645
indus indus 0.9525328
rad rad 0.8948436
zn zn 0.3880341
chas chas 0.3061908
# Lay the next two plots out side by side (1 row, 2 columns).
par(mfrow = c(1, 2))
# Plot the fitted model against rm and against lstat. The argument is spelled
# out as `i.var` instead of relying on partial matching of `i`.
plot(boost.boston, i.var = "rm")
plot(boost.boston, i.var = "lstat")

# Predict medv for the held-out rows and score the predictions.
# NOTE(review): this re-seed does not appear to be needed by predict(); it is
# kept so the RNG state seen by any later code is unchanged.
set.seed(1)
# Observed response on the test rows (everything not in `train`)
boston.test <- Boston[-train, "medv"]
yhat.boost <- predict(
  boost.boston,
  newdata = Boston[-train, ],
  n.trees = 5000
)
# Test-set mean squared error
mean((yhat.boost - boston.test)^2)
[1] 12.7053
# Refit the boosted model with an explicit shrinkage of 0.2 and progress
# output switched off; all other settings match the first fit.
# Seed explicitly here instead of depending on a set.seed() call several
# statements earlier, so this fit stays repeatable if the code above changes.
set.seed(1)
boost.boston <- gbm(
  medv ~ .,
  data = Boston[train, ],
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.2,
  verbose = FALSE  # FALSE, not F: F is an ordinary variable that can be reassigned
)
# Predict medv for the held-out rows with the refitted model
yhat.boost <- predict(
  boost.boston,
  newdata = Boston[-train, ],
  n.trees = 5000
)
# Test-set mean squared error for the refitted model
mean((yhat.boost - boston.test)^2)
[1] 15.06824