Sameer Mathur
Fitting Regression Trees
Here we use the Boston dataset, which is a built-in dataset in the MASS package.
# load the required packages
library(tree)
library(MASS)
# attaching the inbuilt dataset (puts the columns of Boston on the search path)
# NOTE(review): in the visible part of this script every later call passes
# Boston explicitly, so nothing shown here relies on the attached columns --
# confirm against the rest of the file before dropping attach().
attach(Boston)
# dimensions of the dataset: number of rows, then number of columns
dim(Boston)
[1] 506 14
# list the variable (column) names of the data frame
names(Boston)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# compact overview of the data frame: type and leading values of each column
utils::str(object = Boston)
'data.frame': 506 obs. of 14 variables:
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : int 0 0 0 0 0 0 0 0 0 0 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : num 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ black : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Split the data: draw half of the row indices at random (without
# replacement) to form the training set; rows not drawn are used as the
# test set when the fitted tree is evaluated.
# Fix the RNG seed first so the same split is drawn on every run.
set.seed(1)
# seq_len() is the safe form of 1:n, and %/% keeps the sample size an
# integer even when the number of rows is odd.
train <- sample(seq_len(nrow(Boston)), nrow(Boston) %/% 2)
# number of observations in the training set
length(train)
[1] 253
# grow a regression tree for medv on all other columns, using only the
# training rows
tree.boston <- tree(formula = medv ~ ., data = Boston, subset = train)
# summarise the fit: variables used, number of terminal nodes, deviance
summary(object = tree.boston)
Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "rm" "lstat" "crim"
Number of terminal nodes: 7
Residual mean deviance: 11.41 = 2808 / 246
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-10.3300 -2.0500 -0.1837 0.0000 1.9430 17.1500
# draw the fitted tree, then add the text labels to the plot
plot(x = tree.boston)
text(x = tree.boston, pretty = 0)
# run cross-validation on the fitted tree to guide pruning
cv.boston <- cv.tree(object = tree.boston)
# cross-validated deviance against tree size (points joined by lines)
plot(x = cv.boston$size, y = cv.boston$dev, type = "b")
# cut the tree back to the best subtree with 5 terminal nodes
prune.boston <- prune.tree(tree = tree.boston, best = 5)
# draw the pruned tree and add its text labels
plot(x = prune.boston)
text(x = prune.boston, pretty = 0)
# evaluate the unpruned tree on the rows that were not used for training
# observed medv values of the test rows
boston.test <- Boston[-train, "medv"]
# predicted medv values for the same rows
yhat <- predict(object = tree.boston, newdata = Boston[-train, ])
# predicted against observed, with the y = x reference line
plot(x = yhat, y = boston.test)
abline(a = 0, b = 1)
# test-set mean squared error
mean((yhat - boston.test)^2)
[1] 29.13286