p = seq(0, 1, 0.01)
gini.index = 2 * p * (1 - p)
class.error = 1 - pmax(p, 1 - p)
cross.entropy = - (p * log(p) + (1 - p) * log(1 - p))  # NaN at p = 0 and p = 1
matplot(p, cbind(gini.index, class.error, cross.entropy), type = "l", lty = 1,
        col = c("orange", "gray", "blue"), ylab = "Impurity measure")
legend("topright", legend = c("Gini index", "Classification error", "Cross-entropy"),
       col = c("orange", "gray", "blue"), lty = 1)
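As a quick numerical check on the plot (a minimal sketch reusing the vectors defined above), all three impurity measures are maximized at p = 0.5, which is why the curves are symmetric about that point:
# Value of p maximizing each measure (the NaN endpoints of cross.entropy are ignored by which.max)
p[which.max(gini.index)]
p[which.max(class.error)]
p[which.max(cross.entropy)]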
library(ISLR)
set.seed(1)
train = sample(1:nrow(Carseats), nrow(Carseats) / 2)
Carseats.train = Carseats[train, ]
Carseats.test = Carseats[-train, ]
library(tree)
tree.carseats = tree(Sales ~ ., data = Carseats.train)
summary(tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats.train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Advertising" "CompPrice"
## [6] "US"
## Number of terminal nodes: 18
## Residual mean deviance: 2.167 = 394.3 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.88200 -0.88200 -0.08712 0.00000 0.89590 4.09900
plot(tree.carseats)
text(tree.carseats, pretty = 0)
yhat = predict(tree.carseats, newdata = Carseats.test)
mean((yhat - Carseats.test$Sales)^2)
## [1] 4.922039
The test MSE for the unpruned tree is about 4.92.
cv.carseats = cv.tree(tree.carseats)
plot(cv.carseats$size, cv.carseats$dev, type = "b")
tree.min = which.min(cv.carseats$dev)
points(cv.carseats$size[tree.min], cv.carseats$dev[tree.min], col = "red", cex = 2, pch = 20)
In this case, a tree of size 8 is selected by cross-validation. We now prune the tree to obtain the 8-node tree.
prune.carseats = prune.tree(tree.carseats, best = 8)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
yhat = predict(prune.carseats, newdata = Carseats.test)
mean((yhat - Carseats.test$Sales)^2)
## [1] 5.113254
Pruning the tree increases the test MSE to about 5.11.
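To see how the test MSE depends on how aggressively we prune, a short sketch can loop over candidate tree sizes (sizes not in the cost-complexity sequence fall back to the nearest available subtree, with a warning):
# Sketch: test MSE for pruned trees of 2 to 18 terminal nodes
sizes = 2:18
prune.mse = sapply(sizes, function(b) {
  pruned = prune.tree(tree.carseats, best = b)
  mean((predict(pruned, newdata = Carseats.test) - Carseats.test$Sales)^2)
})
plot(sizes, prune.mse, type = "b", xlab = "Terminal nodes", ylab = "Test MSE")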
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
bag.carseats = randomForest(Sales ~ ., data = Carseats.train, mtry = 10, ntree = 500, importance = TRUE)
yhat.bag = predict(bag.carseats, newdata = Carseats.test)
mean((yhat.bag - Carseats.test$Sales)^2)
## [1] 2.657296
Bagging reduces the test MSE to about 2.66.
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 23.07909904 171.185734
## Income 2.82081527 94.079825
## Advertising 11.43295625 99.098941
## Population -3.92119532 59.818905
## Price 54.24314632 505.887016
## ShelveLoc 46.26912996 361.962753
## Age 14.24992212 159.740422
## Education -0.07662320 46.738585
## Urban 0.08530119 8.453749
## US 4.34349223 15.157608
We see that “Price” and “ShelveLoc” are the two most important variables.
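The same importance measures can also be displayed graphically with varImpPlot() from the randomForest package:
varImpPlot(bag.carseats)  # dot charts of %IncMSE and IncNodePurity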
rf.carseats = randomForest(Sales ~ ., data = Carseats.train, mtry = 3, ntree = 500, importance = TRUE)
yhat.rf = predict(rf.carseats, newdata = Carseats.test)
mean((yhat.rf - Carseats.test$Sales)^2)
## [1] 3.049406
With mtry = 3, the random forest gives a test MSE of about 3.05, a bit higher than bagging on this split.
importance(rf.carseats)
## %IncMSE IncNodePurity
## CompPrice 12.9489323 158.48521
## Income 2.2754686 129.59400
## Advertising 8.9977589 111.94374
## Population -2.2513981 102.84599
## Price 33.4226950 391.60804
## ShelveLoc 34.0233545 290.56502
## Age 12.2185108 171.83302
## Education 0.2592124 71.65413
## Urban 1.1382113 14.76798
## US 4.1925335 33.75554
In this table too, we see that “Price” and “ShelveLoc” are the most important variables.
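To see how the choice of mtry affects the test error, a small sketch can refit the random forest for each candidate value (this refits the model ten times, so it takes a little while to run):
# Sketch: test MSE as a function of the number of variables tried at each split
mtry.values = 1:10
rf.mse = sapply(mtry.values, function(m) {
  fit = randomForest(Sales ~ ., data = Carseats.train, mtry = m, ntree = 500)
  mean((predict(fit, newdata = Carseats.test) - Carseats.test$Sales)^2)
})
plot(mtry.values, rf.mse, type = "b", xlab = "mtry", ylab = "Test MSE")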
set.seed(1)
train = sample(1:nrow(OJ), 800)
OJ.train = OJ[train, ]
OJ.test = OJ[-train, ]
tree.oj = tree(Purchase ~ ., data = OJ.train)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
The fitted tree has a training error rate of 0.1588 and 9 terminal nodes.
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
We pick the node labelled 8. The split criterion is LoyalCH < 0.0356, the branch contains 59 observations with a deviance of 10.14, and the overall prediction for the branch is MM. Fewer than 2% of the observations in this branch take the value CH, and the remaining 98% take the value MM.
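These figures can also be read directly off the fitted object; as a sketch, the frame component of a tree object is indexed by node number, so the row for node 8 can be pulled out with:
# Row names of tree.oj$frame are the node numbers shown in the printout above
tree.oj$frame["8", c("var", "n", "dev", "yval")]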
plot(tree.oj)
text(tree.oj, pretty = 0)
tree.pred <- predict(tree.oj, OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##
## tree.pred CH MM
## CH 160 38
## MM 8 64
1-(160+64)/(160+38+8+64)
## [1] 0.1703704
The test error rate is about 17%.
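The same error rate can be computed more directly by comparing the predicted and observed classes:
mean(tree.pred != OJ.test$Purchase)  # (38 + 8) / 270 = 0.1703704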
cv.oj = cv.tree(tree.oj, FUN = prune.misclass)
cv.oj
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 150 150 149 158 172 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
Which tree size corresponds to the lowest cross-validated classification error rate? The 7-node tree has the lowest cross-validated classification error rate (a deviance of 149).
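The optimal size can also be pulled out of the cv.tree() output programmatically:
cv.oj$size[which.min(cv.oj$dev)]  # returns 7 for the deviances above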
Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
prune.oj = prune.misclass(tree.oj, best = 7)
plot(prune.oj)
text(prune.oj, pretty = 0)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
summary(prune.oj)
##
## Classification tree:
## snip.tree(tree = tree.oj, nodes = c(4L, 10L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "PctDiscMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7748 = 614.4 / 793
## Misclassification error rate: 0.1625 = 130 / 800
The training misclassification error rate is slightly higher for the pruned tree (0.1625 vs 0.1588).
prune.pred = predict(prune.oj, OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
##
## prune.pred CH MM
## CH 160 36
## MM 8 66
1-(160+66)/(160+36+8+66)
## [1] 0.162963
The pruned tree has a test error rate of about 16.3%, slightly lower than the 17.0% of the unpruned tree.
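A compact way to compare the two trees on the test set is to compute both error rates side by side (a sketch using the predictions already computed above):
c(unpruned = mean(tree.pred != OJ.test$Purchase),
  pruned = mean(prune.pred != OJ.test$Purchase))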