# Plot the three splitting criteria as functions of the class proportion p-hat_mk
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
# Note: p * log(p) is NaN at p = 0 and p = 1, so matplot simply drops those two endpoints
entropy <- - (p * log(p) + (1 - p) * log(1 - p))
matplot(p, cbind(class.error, gini.index, entropy), col = c("red", "green", "blue"), pch = 16,
        main = "Classification Tree Measures", xlab = "p-hat_mk values", ylab = "Splitting Criterion")
# Use the same colours and plotting symbol in the legend as in the plot itself
legend("bottom", pch = 16, title = "Measures", col = c("red", "green", "blue"),
       legend = c("Classification Error", "Gini Index", "Entropy"), box.lty = 1)
library(ISLR)
library(tree)
attach(Carseats)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
Carseats.train <- Carseats[train, ]
Carseats.test <- Carseats[-train, ]
tree.carseats <- tree(Sales ~ ., data = Carseats.train)
summary(tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
The variables used in constructing the tree are ShelveLoc, Price, Age, Advertising, CompPrice, and US.
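As a side check, the same list can be pulled straight from the fitted object. This is a minimal sketch, not part of the original output; it relies on the documented frame$var column of a tree object, in which terminal nodes are labelled "<leaf>".
setdiff(as.character(tree.carseats$frame$var), "<leaf>")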
plot(tree.carseats)
text(tree.carseats, pretty = 0)
pred <- predict(tree.carseats, newdata = Carseats.test)
mean((pred - Carseats.test$Sales)^2)
## [1] 4.922039
The test MSE is about 4.922.
set.seed(12345)
cv <- cv.tree(tree.carseats)
plot(cv$size, cv$dev, type = "b")
tree.min <- which.min(cv$dev)
tree.min
## [1] 1
prune.carseats <- prune.tree(tree.carseats, best = 5)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
pred2 <- predict(prune.carseats, newdata = Carseats.test)
mean((pred2 - Carseats.test$Sales)^2)
## [1] 5.186482
The test MSE for the pruned tree (5 terminal nodes) is 5.186, which is higher than the 4.922 obtained with the unpruned tree, so pruning to 5 nodes does not improve test performance.
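For comparison, one could also prune at the size that actually minimizes the cross-validated deviance rather than the fixed best = 5. The lines below are a sketch under that assumption (cv$size and cv$dev come from the cv.tree() call above) and are not part of the original analysis.
best.size <- cv$size[which.min(cv$dev)]   # tree size with the lowest CV deviance
prune.cv <- prune.tree(tree.carseats, best = best.size)
mean((predict(prune.cv, newdata = Carseats.test) - Carseats.test$Sales)^2)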
library(randomForest)
bag.carseats <- randomForest(Sales ~ ., data = Carseats.train, mtry = 10, ntree = 500, importance = TRUE)
yhat.bag <- predict(bag.carseats, newdata = Carseats.test)
mean((yhat.bag - Carseats.test$Sales)^2)
## [1] 2.645498
Bagging produces a test MSE of 2.645, which is substantially lower than the test MSE of the unpruned regression tree (4.922) obtained earlier.
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 26.3124066 168.055452
## Income 4.8473665 93.243504
## Advertising 13.1440973 106.151221
## Population -0.7208588 57.048912
## Price 54.4206377 490.245169
## ShelveLoc 46.0200384 376.195941
## Age 16.0552510 157.258005
## Education 0.1673251 44.397453
## Urban -0.5789925 9.348182
## US 3.5045450 16.437524
Price and ShelveLoc are by far the most important variables.
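The same importance measures can also be shown graphically with randomForest's built-in plotting function; a one-line sketch, not included in the original output:
varImpPlot(bag.carseats, main = "Variable Importance (Bagging)")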
set.seed(12345)
rf.carseats <- randomForest(Sales ~ ., data = Carseats.train, mtry = 3, ntree = 500, importance = TRUE)
rf.carseats
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats.train, mtry = 3, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 3.295601
## % Var explained: 58.09
yhat.rf <- predict(rf.carseats, newdata = Carseats.test)
mean((yhat.rf - Carseats.test$Sales)^2)
## [1] 3.055149
The test MSE is 3.055, somewhat higher than the 2.645 obtained with bagging.
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 15.1109575 156.04402
## Income 3.2514115 125.75625
## Advertising 9.2901175 107.99331
## Population -1.2978251 98.39866
## Price 38.7208398 390.11971
## ShelveLoc 33.2658320 290.54948
## Age 14.2573824 176.19310
## Education 1.3006035 75.27627
## Urban -0.7511157 17.13160
## US 5.5020566 35.29788
With the random forest, the model explains 58.09% of the variance of Sales on the training data. Price is again the most important variable, followed by ShelveLoc.
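To see how the choice of mtry affects test performance, the short sketch below (a hypothetical helper loop, not part of the original write-up) refits the random forest over a few values of mtry and records the resulting test MSEs.
mtry.values <- c(2, 3, 5, 7, 10)   # mtry = 10 corresponds to bagging
test.mse <- sapply(mtry.values, function(m) {
  fit <- randomForest(Sales ~ ., data = Carseats.train, mtry = m, ntree = 500)
  mean((predict(fit, newdata = Carseats.test) - Carseats.test$Sales)^2)
})
data.frame(mtry = mtry.values, test.mse = test.mse)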
set.seed(12345)
attach(OJ)
train <- sample(1:nrow(OJ), 800)
OJ.train <- OJ[train, ]
OJ.test <- OJ[-train, ]
tree.oj <- tree(Purchase ~ ., data = OJ.train, method = "class")
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train, method = "class")
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7724 = 612.5 / 793
## Misclassification error rate: 0.17 = 136 / 800
The tree has 7 terminal nodes and a training error rate of 0.17; only LoyalCH and PriceDiff are used in its construction.
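As a cross-check on the reported training error rate, a short sketch (not part of the original output):
train.pred <- predict(tree.oj, OJ.train, type = "class")
mean(train.pred != OJ.train$Purchase)   # should match the 0.17 reported above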
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1059.000 CH ( 0.62500 0.37500 )
## 2) LoyalCH < 0.5036 349 425.200 MM ( 0.29799 0.70201 )
## 4) LoyalCH < 0.276142 170 138.400 MM ( 0.14118 0.85882 )
## 8) LoyalCH < 0.0356415 55 9.996 MM ( 0.01818 0.98182 ) *
## 9) LoyalCH > 0.0356415 115 115.100 MM ( 0.20000 0.80000 ) *
## 5) LoyalCH > 0.276142 179 246.100 MM ( 0.44693 0.55307 )
## 10) PriceDiff < 0.065 76 80.790 MM ( 0.22368 0.77632 ) *
## 11) PriceDiff > 0.065 103 137.600 CH ( 0.61165 0.38835 ) *
## 3) LoyalCH > 0.5036 451 334.500 CH ( 0.87805 0.12195 )
## 6) LoyalCH < 0.705699 138 164.300 CH ( 0.71739 0.28261 )
## 12) PriceDiff < 0.265 89 120.100 CH ( 0.59551 0.40449 ) *
## 13) PriceDiff > 0.265 49 22.570 CH ( 0.93878 0.06122 ) *
## 7) LoyalCH > 0.705699 313 126.300 CH ( 0.94888 0.05112 ) *
I’ll pick node 8. The split criterion is LoyalCH < 0.0356415; the branch contains 55 observations, has a deviance of 9.996, and its overall prediction is MM. Only about 1.8% of the observations in this branch (1 of 55) take the value CH, while the remaining 98.2% take the value MM.
plot(tree.oj)
text(tree.oj, pretty = 0)
tree.pred <- predict(tree.oj, OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##
## tree.pred CH MM
## CH 145 47
## MM 8 70
(47 + 8) / 270
## [1] 0.2037037
The test error rate is about 20.4%: 55 of the 270 test observations are misclassified.
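The same rate can be computed from the confusion matrix without hand-typed counts; a minimal sketch (not original output) reusing the table above:
conf.mat <- table(tree.pred, OJ.test$Purchase)
1 - sum(diag(conf.mat)) / sum(conf.mat)   # overall test misclassification rate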
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
cv.oj
## $size
## [1] 7 4 2 1
##
## $dev
## [1] 160 160 169 300
##
## $k
## [1] -Inf 0.0 11.5 141.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree size", ylab = "Cross-Validation Error Rate")
Which tree size corresponds to the lowest cross-validated classification error rate? Sizes 7 and 4 both achieve the minimum cross-validated error (160 misclassifications); we use size 7.
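The selected size can also be extracted programmatically; a one-line sketch (which.min returns the first minimum, so this tie is broken in favour of the larger tree):
cv.oj$size[which.min(cv.oj$dev)]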
Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
prune.oj <- prune.misclass(tree.oj, best = 7)
plot(prune.oj)
text(prune.oj, pretty = 0)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train, method = "class")
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7724 = 612.5 / 793
## Misclassification error rate: 0.17 = 136 / 800
Because cross-validation selected the full 7-node tree, pruning with best = 7 leaves the tree unchanged: the pruned tree also has 7 terminal nodes and a training error rate of 0.17.
prune.pred <- predict(prune.oj, OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
##
## prune.pred CH MM
## CH 145 47
## MM 8 70
(47+8)/270
## [1] 0.2037037
Because the pruned tree is identical to the unpruned tree, the confusion matrix is the same and the test error rate remains about 0.204.
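For a final side-by-side comparison, a short sketch (not part of the original output) computing both test error rates directly:
c(unpruned = mean(tree.pred != OJ.test$Purchase),
  pruned = mean(prune.pred != OJ.test$Purchase))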