Sameer Mathur
Bagging and Random Forests
Here we use the Boston dataset, which is built into the MASS package.
# loading the required packages
library(tree)
library(randomForest)
library(MASS)
# attaching the inbuilt dataset so its columns are on the search path
# NOTE(review): the visible model calls all pass data=Boston, so this attach()
# looks unnecessary - confirm nothing else relies on bare column names.
attach(Boston)
# dimensions of the dataset (rows, columns)
dim(Boston)
[1] 506 14
# column names of the dataset (13 predictors plus the response medv)
colnames(Boston)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# structure of the dataset: type and first few values of each column
str(Boston)
'data.frame': 506 obs. of 14 variables:
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : int 0 0 0 0 0 0 0 0 0 0 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : num 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ black : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Fix the RNG seed BEFORE drawing the split; otherwise the train/test
# partition (and every error estimate computed from it) changes on each run.
# NOTE: the outputs recorded in this document came from an unseeded split,
# so re-running will print different numbers than those shown.
set.seed(1)
# taking half of the observations in the training set
# (seq_len() avoids the 1:0 trap of 1:n; %/% keeps the sample size an integer)
train <- sample(seq_len(nrow(Boston)), nrow(Boston) %/% 2)
# printing the length of the training set
length(train)
[1] 253
# Bagging is a random forest that considers every predictor at each split:
# mtry = 13 is all columns of Boston except the response medv.
# Seed first so the random draws made during the fit are repeatable.
set.seed(1)
bag.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 13,
  importance = TRUE
)
# display the fitted model summary
bag.boston
Call:
randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 13
Mean of squared residuals: 12.3362
% Var explained: 85.17
# bagged predictions for the held-out rows (those not in train)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
# actual medv values for the same held-out rows
# (no set.seed() here: the indexing, plotting and averaging below draw no
# random numbers, so re-seeding at this point had no effect)
boston.test <- Boston[-train, "medv"]
# predicted vs. actual medv; points on the line y = x are exact predictions
plot(yhat.bag, boston.test)
abline(0, 1)
# test-set mean squared error of the bagged model
mean((yhat.bag - boston.test)^2)
[1] 17.0252
# Random forest: same formula, data and training rows, but only 6 candidate
# predictors are tried at each split.
# Seed first so the random draws made during the fit are repeatable.
set.seed(1)
rf.boston <- randomForest(
  medv ~ .,
  data = Boston,
  subset = train,
  mtry = 6,
  importance = TRUE
)
# held-out predictions and their mean squared error
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
[1] 15.56485
# per-predictor importance table (columns %IncMSE and IncNodePurity)
importance(rf.boston)
%IncMSE IncNodePurity
crim 13.182151 1306.56936
zn 1.699846 51.06358
indus 11.729247 1455.57393
chas 0.821280 39.08210
nox 12.476105 801.21508
rm 28.885745 5851.92481
age 11.297420 368.57067
dis 13.286979 905.79988
rad 4.816117 103.48120
tax 9.917540 405.63586
ptratio 10.939010 1044.35915
black 7.768680 389.45790
lstat 33.671829 8073.92692
# plot the variable-importance measures of the random forest
varImpPlot(rf.boston)