# Gini index, classification error, and cross-entropy for a two-class node,
# as a function of the proportion p of observations in the first class
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
cross.entropy <- -(p * log(p) + (1 - p) * log(1 - p))  # NaN at p = 0 and p = 1; matplot drops these points
matplot(p, cbind(gini.index, class.error, cross.entropy), type = "l", lty = 1,
        col = c("red", "green", "blue"), xlab = "p", ylab = "Impurity")
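# A legend makes the three curves easier to tell apart (a minimal addition; the colours and
# line type are assumed to match the matplot call above).
legend("topright", legend = c("Gini index", "Classification error", "Cross-entropy"),
       col = c("red", "green", "blue"), lty = 1)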

#Load the packages used in this exercise.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
attach(Carseats)
#(A.)
set.seed(0)
n <- nrow(Carseats)
p <- ncol(Carseats) - 1
train <- sample(1:n, n/2)
test <- (1:n)[-train]
#(B.)
rtree.carseats <- tree(Sales ~ ., data = Carseats[train, ])
summary(rtree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats[train, ])
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Income" "CompPrice" "US"
## Number of terminal nodes: 17
## Residual mean deviance: 2.441 = 446.7 / 183
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.90700 -0.99560 -0.03944 0.00000 1.01200 3.77600
plot(rtree.carseats)
text(rtree.carseats, pretty = 0)

print(rtree.carseats)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 1655.00 7.574
##   2) ShelveLoc: Bad,Medium 155 925.80 6.799
##     4) Price < 124.5 96 549.10 7.579
##       8) Age < 52 41 191.30 8.754
##         16) ShelveLoc: Bad 16 81.45 7.392
##           32) Income < 65.5 9 32.43 6.137 *
##           33) Income > 65.5 7 16.61 9.006 *
##         17) ShelveLoc: Medium 25 61.21 9.626
##           34) Price < 106 13 23.45 10.490 *
##           35) Price > 106 12 17.49 8.688 *
##       9) Age > 52 55 259.00 6.703
##         18) Price < 92 9 25.91 9.280 *
##         19) Price > 92 46 161.70 6.199
##           38) ShelveLoc: Bad 14 48.40 4.768 *
##           39) ShelveLoc: Medium 32 72.01 6.826
##             78) CompPrice < 114.5 11 14.05 5.765 *
##             79) CompPrice > 114.5 21 39.09 7.381 *
##     5) Price > 124.5 59 223.10 5.529
##       10) CompPrice < 134.5 30 96.56 4.726
##         20) Age < 67.5 24 55.89 5.219
##           40) Price < 132.5 12 22.55 6.105 *
##           41) Price > 132.5 12 14.50 4.333 *
##         21) Age > 67.5 6 11.52 2.755 *
##       11) CompPrice > 134.5 29 87.20 6.360
##         22) Age < 44.5 14 32.48 7.262 *
##         23) Age > 44.5 15 32.67 5.517 *
##   3) ShelveLoc: Good 45 315.10 10.240
##     6) Price < 136.5 36 152.40 11.100
##       12) Price < 109.5 18 46.28 12.170 *
##       13) Price > 109.5 18 64.47 10.020
##         26) US: No 6 11.47 8.282 *
##         27) US: Yes 12 25.79 10.890 *
##     7) Price > 136.5 9 32.00 6.834 *
y.hat <- predict(rtree.carseats, newdata = Carseats[test, ])
test.MSE <- mean((y.hat - Carseats[test, ]$Sales)^2)
print(test.MSE)
## [1] 4.276852
#Test MSE is about 4.28.
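# Sales is recorded in thousands of units, so the square root of the test MSE (about 2.07)
# gives a rough sense of the typical prediction error on the original scale.
sqrt(test.MSE)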
#(C.)
cv.carseats <- cv.tree(rtree.carseats)
names(cv.carseats)
## [1] "size" "dev" "k" "method"
print(cv.carseats)
## $size
## [1] 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 1074.122 1097.694 1097.694 1107.972 1108.590 1080.886 1092.554 1123.180
## [9] 1151.382 1149.899 1149.899 1134.674 1217.885 1262.475 1316.610 1277.419
## [17] 1676.291
##
## $k
## [1] -Inf 18.83282 18.87206 20.27236 22.04537 27.21361 29.14616
## [8] 32.41127 39.33786 41.23946 41.60250 48.67832 71.43696 98.75811
## [15] 130.73089 153.56960 413.77594
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.carseats$size, cv.carseats$dev, type = "b")
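# For reference, the CV-optimal size can be read off programmatically; on this run the smallest
# deviance (1074.1) belongs to the full 17-node tree, so the best = 6 used below trades a little
# CV deviance for a much simpler tree (a minimal sketch).
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]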

prune.carseats <- prune.tree(rtree.carseats, best = 6)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

y.hat <- predict(prune.carseats, newdata = Carseats[test, ])
prune.MSE <- mean((y.hat - Carseats[test, ]$Sales)^2)
print(prune.MSE)
## [1] 4.736334
#Test MSE is about 4.74, so on this split pruning to 6 terminal nodes does not improve on the unpruned tree (4.28).
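# To see how the amount of pruning affects the test error more generally, one could sweep over a
# range of subtree sizes (a sketch; the 2:17 range and object names are illustrative).
prune.sizes <- 2:17
prune.mses <- sapply(prune.sizes, function(s) {
  pt <- prune.tree(rtree.carseats, best = s)
  mean((predict(pt, newdata = Carseats[test, ]) - Carseats[test, ]$Sales)^2)
})
plot(prune.sizes, prune.mses, type = "b", xlab = "Tree size", ylab = "Test MSE")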
#(D.)
carseats.bag <- randomForest(Sales ~ ., data = Carseats, mtry = p, ntree = 500, importance = TRUE, subset = train)
y.hat <- predict(carseats.bag, newdata = Carseats[test, ])
mse.bag <- mean((Carseats[test, ]$Sales - y.hat)^2)
print(mse.bag)
## [1] 2.597362
#Test MSE is about 2.60, a large improvement over the single regression tree (4.28).
plot(carseats.bag)
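# The curve above traces the out-of-bag MSE as trees are added; its final value can also be read
# directly from the fitted object (the mse component holds one OOB estimate per tree).
tail(carseats.bag$mse, 1)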

ibag <- importance(carseats.bag)
print(ibag[order(ibag[, 1]), ])
## %IncMSE IncNodePurity
## Population -2.660192 49.23199
## Education 2.396716 44.92580
## US 2.711138 13.52099
## Urban 3.303058 11.36380
## Income 3.331959 79.14007
## Advertising 16.217643 116.83583
## Age 17.283679 167.54657
## CompPrice 22.815491 171.93564
## Price 52.792602 495.11024
## ShelveLoc 54.280908 464.99885
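# The same rankings can be displayed graphically with randomForest's varImpPlot.
varImpPlot(carseats.bag)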
#(E.)
carseats.rf <- randomForest(Sales ~ ., data = Carseats, ntree = 500, mtry = p/3, importance = TRUE, subset = train)
y.hat <- predict(carseats.rf, newdata = Carseats[test, ])
mse.rf <- mean((Carseats[test, ]$Sales - y.hat)^2)
print(mse.rf)
## [1] 2.988168
#Test MSE is about 2.99, somewhat higher than the 2.60 obtained with bagging on this split; the mtry sweep sketched after the importance table shows how the error varies with m.
plot(carseats.rf)

irf <- importance(carseats.rf)
print(irf[order(irf[, 1]), ])
## %IncMSE IncNodePurity
## Population -0.9029990 97.56670
## Urban 0.1259916 19.06768
## Income 2.1277547 115.04105
## Education 3.1350209 76.48365
## US 6.6536420 30.78831
## CompPrice 13.1921517 166.43324
## Advertising 13.5428052 121.69371
## Age 14.3583472 205.55877
## ShelveLoc 34.0735184 330.22667
## Price 36.8150937 387.12126
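# To describe the effect of m more systematically, one could refit the forest for each value of
# mtry and record the test MSE (a sketch; results will vary with the random seed).
mtry.grid <- 1:p
mtry.mse <- sapply(mtry.grid, function(m) {
  rf <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m, ntree = 500)
  mean((predict(rf, newdata = Carseats[test, ]) - Carseats[test, ]$Sales)^2)
})
plot(mtry.grid, mtry.mse, type = "b", xlab = "mtry", ylab = "Test MSE")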
library(ISLR)
library(tree)
library(randomForest)
#(A.)
set.seed(1)
train <- sample(1:nrow(OJ), 800)
OJ.train <- OJ[train,]
OJ.test <- OJ[-train,]
#(B.)
tree.oj <- tree(Purchase ~ ., data = OJ.train)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
#The fitted tree uses five of the predictors, has 9 terminal nodes, and has a training error rate of 0.1588.
#(C.)
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
##   2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
##     4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
##       8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
##       9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
##     5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
##       10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
##         20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
##   3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
##     6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
##       12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
##         24) PctDiscMM < 0.196197 55 73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196197 17 12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
##     7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
#Consider the terminal node labelled 8 (terminal because of the asterisk): it contains the 59 training observations with LoyalCH below 0.0356, has a deviance of 10.14, and predicts MM; about 98.3% of the observations in this node are indeed MM purchases.
#(D.)
plot(tree.oj)
text(tree.oj, pretty = 0)

#The most important indicator of “Purchase” appears to be “LoyalCH”: the top splits of the tree all separate customers by the intensity of their brand loyalty to CH.
#(E.)
tree.pred <- predict(tree.oj, OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##
## tree.pred CH MM
## CH 160 38
## MM 8 64
mean(tree.pred!=OJ.test$Purchase)
## [1] 0.1703704
#Test error rate is about 17.04%.
#(F.)
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
cv.oj
## $size
## [1] 9 8 7 4 2 1
##
## $dev
## [1] 150 150 149 158 172 315
##
## $k
## [1] -Inf 0.000000 3.000000 4.333333 10.500000 151.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
#(G.)
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree size", ylab = "CV misclassification errors")

#(H.)
#The 7-node tree has the lowest cross-validated classification error (dev = 149), so it is the natural size to prune to.
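# The same conclusion can be obtained programmatically: the size with the fewest CV misclassifications.
cv.oj$size[which.min(cv.oj$dev)]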
#(I.)
prune.oj <- prune.misclass(tree.oj, best = 7)
plot(prune.oj)
text(prune.oj, pretty = 0)

#(J.)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
summary(prune.oj)
##
## Classification tree:
## snip.tree(tree = tree.oj, nodes = c(4L, 10L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff" "PctDiscMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7748 = 614.4 / 793
## Misclassification error rate: 0.1625 = 130 / 800
#The training misclassification error rate is slightly higher for the pruned tree (0.1625 vs 0.1588 for the unpruned tree).
#(K.)
prune.pred <- predict(prune.oj, OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
##
## prune.pred CH MM
## CH 160 36
## MM 8 66
mean(prune.pred!=OJ.test$Purchase)
## [1] 0.162963
#Test error rate is about 16.30%, slightly lower than the 17.04% of the unpruned tree.
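# Direct comparison of the two test error rates, reusing the predictions computed above.
c(unpruned = mean(tree.pred != OJ.test$Purchase), pruned = mean(prune.pred != OJ.test$Purchase))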