# Gini index, classification error, and cross-entropy for a two-class setting,
# as functions of the proportion p of observations in the first class.
p <- seq(0, 1, 0.01)
gini.index <- 2 * p * (1 - p)
class.error <- 1 - pmax(p, 1 - p)
cross.entropy <- -(p * log(p) + (1 - p) * log(1 - p))
matplot(p, cbind(gini.index, class.error, cross.entropy),
        type = "l", lty = 1, col = c("red", "green", "blue"),
        xlab = "p", ylab = "Impurity measure")
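# A small optional sketch: label the curves and check where they peak.
# (Cross-entropy is NaN at p = 0 and p = 1 because of log(0), so those endpoints
# are simply skipped in the plot.)
legend("topright", legend = c("Gini index", "Classification error", "Cross-entropy"),
       col = c("red", "green", "blue"), lty = 1)
p[which.max(gini.index)]   # Gini index peaks at p = 0.5
p[which.max(class.error)]  # classification error also peaks at p = 0.5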

# Setup: load the packages used below.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
attach(Carseats)

#(A.)
set.seed(0)

n <- nrow(Carseats)
p <- ncol(Carseats) - 1  

train <- sample(1:n, n/2)
test <- (1:n)[-train]

#(B.)
rtree.carseats <- tree(Sales ~ ., data = Carseats[train, ])
summary(rtree.carseats)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats[train, ])
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price"     "Age"       "Income"    "CompPrice" "US"       
## Number of terminal nodes:  17 
## Residual mean deviance:  2.441 = 446.7 / 183 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.90700 -0.99560 -0.03944  0.00000  1.01200  3.77600
plot(rtree.carseats)
text(rtree.carseats, pretty = 0)

print(rtree.carseats)
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 1655.00  7.574  
##    2) ShelveLoc: Bad,Medium 155  925.80  6.799  
##      4) Price < 124.5 96  549.10  7.579  
##        8) Age < 52 41  191.30  8.754  
##         16) ShelveLoc: Bad 16   81.45  7.392  
##           32) Income < 65.5 9   32.43  6.137 *
##           33) Income > 65.5 7   16.61  9.006 *
##         17) ShelveLoc: Medium 25   61.21  9.626  
##           34) Price < 106 13   23.45 10.490 *
##           35) Price > 106 12   17.49  8.688 *
##        9) Age > 52 55  259.00  6.703  
##         18) Price < 92 9   25.91  9.280 *
##         19) Price > 92 46  161.70  6.199  
##           38) ShelveLoc: Bad 14   48.40  4.768 *
##           39) ShelveLoc: Medium 32   72.01  6.826  
##             78) CompPrice < 114.5 11   14.05  5.765 *
##             79) CompPrice > 114.5 21   39.09  7.381 *
##      5) Price > 124.5 59  223.10  5.529  
##       10) CompPrice < 134.5 30   96.56  4.726  
##         20) Age < 67.5 24   55.89  5.219  
##           40) Price < 132.5 12   22.55  6.105 *
##           41) Price > 132.5 12   14.50  4.333 *
##         21) Age > 67.5 6   11.52  2.755 *
##       11) CompPrice > 134.5 29   87.20  6.360  
##         22) Age < 44.5 14   32.48  7.262 *
##         23) Age > 44.5 15   32.67  5.517 *
##    3) ShelveLoc: Good 45  315.10 10.240  
##      6) Price < 136.5 36  152.40 11.100  
##       12) Price < 109.5 18   46.28 12.170 *
##       13) Price > 109.5 18   64.47 10.020  
##         26) US: No 6   11.47  8.282 *
##         27) US: Yes 12   25.79 10.890 *
##      7) Price > 136.5 9   32.00  6.834 *
y.hat <- predict(rtree.carseats, newdata = Carseats[test, ])
test.MSE <- mean((y.hat - Carseats[test, ]$Sales)^2)
print(test.MSE)
## [1] 4.276852
#Test MSE is about 4.28.
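
# For context (a hedged sketch): the test MSE of a naive baseline that always
# predicts the training-set mean of Sales.
baseline.MSE <- mean((mean(Carseats[train, ]$Sales) - Carseats[test, ]$Sales)^2)
print(baseline.MSE)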

#(C.)
cv.carseats <- cv.tree(rtree.carseats)
names(cv.carseats)
## [1] "size"   "dev"    "k"      "method"
print(cv.carseats)
## $size
##  [1] 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1
## 
## $dev
##  [1] 1074.122 1097.694 1097.694 1107.972 1108.590 1080.886 1092.554 1123.180
##  [9] 1151.382 1149.899 1149.899 1134.674 1217.885 1262.475 1316.610 1277.419
## [17] 1676.291
## 
## $k
##  [1]      -Inf  18.83282  18.87206  20.27236  22.04537  27.21361  29.14616
##  [8]  32.41127  39.33786  41.23946  41.60250  48.67832  71.43696  98.75811
## [15] 130.73089 153.56960 413.77594
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.carseats$size, cv.carseats$dev, type = "b")

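# A hedged sketch: the size minimizing the cross-validated deviance can be read
# off programmatically (here it is the full tree, size 17; best = 6 below is a
# deliberately smaller, more interpretable tree).
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]
print(best.size)
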
prune.carseats <- prune.tree(rtree.carseats, best = 6)

plot(prune.carseats)
text(prune.carseats, pretty = 0)

y.hat <- predict(prune.carseats, newdata = Carseats[test, ])
prune.MSE <- mean((y.hat - Carseats[test, ]$Sales)^2)
print(prune.MSE)
## [1] 4.736334
#Test MSE of the pruned tree is about 4.74, slightly higher than the 4.28 of the unpruned tree, so pruning does not improve test MSE here.

#(D.)
carseats.bag <- randomForest(Sales ~ ., data = Carseats, mtry = p, ntree = 500, importance = TRUE, subset = train)
y.hat <- predict(carseats.bag, newdata = Carseats[test, ])
mse.bag <- mean((Carseats[test, ]$Sales - y.hat)^2)
print(mse.bag)
## [1] 2.597362
#Bagging reduces the test MSE to about 2.60, well below the single tree's 4.28.

plot(carseats.bag)

ibag <- importance(carseats.bag)
print(ibag[order(ibag[, 1]), ])
##               %IncMSE IncNodePurity
## Population  -2.660192      49.23199
## Education    2.396716      44.92580
## US           2.711138      13.52099
## Urban        3.303058      11.36380
## Income       3.331959      79.14007
## Advertising 16.217643     116.83583
## Age         17.283679     167.54657
## CompPrice   22.815491     171.93564
## Price       52.792602     495.11024
## ShelveLoc   54.280908     464.99885
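
# The same two importance measures can also be displayed graphically with
# randomForest's varImpPlot (optional).
varImpPlot(carseats.bag)
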
#(E.)
carseats.rf <- randomForest(Sales ~ ., data = Carseats, ntree = 500, mtry = p/3, importance = TRUE, subset = train)
y.hat <- predict(carseats.rf, newdata = Carseats[test, ])
mse.rf <- mean((Carseats[test, ]$Sales - y.hat)^2)
print(mse.rf)
## [1] 2.988168
#The random forest's test MSE is about 2.99, a bit higher than bagging's 2.60 on this split.

plot(carseats.rf)

irf <- importance(carseats.rf)
print(irf[order(irf[, 1]), ])
##                %IncMSE IncNodePurity
## Population  -0.9029990      97.56670
## Urban        0.1259916      19.06768
## Income       2.1277547     115.04105
## Education    3.1350209      76.48365
## US           6.6536420      30.78831
## CompPrice   13.1921517     166.43324
## Advertising 13.5428052     121.69371
## Age         14.3583472     205.55877
## ShelveLoc   34.0735184     330.22667
## Price       36.8150937     387.12126
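
# A hedged sketch of how the number of variables tried at each split (mtry, i.e. m)
# affects the test error: refit the forest for each m from 1 to p on the same
# training split and record the test MSE.
mse.by.mtry <- sapply(1:p, function(m) {
  fit <- randomForest(Sales ~ ., data = Carseats, subset = train, mtry = m, ntree = 500)
  mean((Carseats[test, ]$Sales - predict(fit, newdata = Carseats[test, ]))^2)
})
plot(1:p, mse.by.mtry, type = "b", xlab = "mtry", ylab = "Test MSE")
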
library(ISLR)
library(tree)
library(randomForest)

#(A.)
set.seed(1)
train <- sample(1:nrow(OJ), 800)
OJ.train <- OJ[train,]
OJ.test <- OJ[-train,]

#(B.)
tree.oj <- tree(Purchase ~ ., data = OJ.train)
summary(tree.oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
#The fitted tree has 9 terminal nodes and a training error rate of 0.1588.

#(C.)
tree.oj
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196197 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196197 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *
#Node 8 is a terminal node (marked with an asterisk): the split LoyalCH < 0.0356 leaves 59 observations with a deviance of 10.14, and the predicted class is MM, with about 98.3% of the observations in this node buying MM.

#(D.)
plot(tree.oj)
text(tree.oj, pretty = 0)

#The most important predictor of “Purchase” appears to be “LoyalCH”: the very first split separates customers by their brand loyalty to CH.

#(E.)
tree.pred <- predict(tree.oj, OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##          
## tree.pred  CH  MM
##        CH 160  38
##        MM   8  64
mean(tree.pred!=OJ.test$Purchase)
## [1] 0.1703704
#Test error rate is about 17.04%.

#(F.)
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
cv.oj
## $size
## [1] 9 8 7 4 2 1
## 
## $dev
## [1] 150 150 149 158 172 315
## 
## $k
## [1]       -Inf   0.000000   3.000000   4.333333  10.500000 151.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
#(G.)
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree size", ylab = "CV misclassifications")

#(H.)
#The tree with 7 terminal nodes gives the lowest cross-validated classification error (149 misclassified observations).
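
# A hedged sketch: the same choice can be made programmatically.
cv.oj$size[which.min(cv.oj$dev)]  # returns 7, given the $dev values printed above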

#(I.)
prune.oj <- prune.misclass(tree.oj, best = 7)
plot(prune.oj)
text(prune.oj, pretty = 0)

#(J.)
summary(tree.oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
summary(prune.oj)
## 
## Classification tree:
## snip.tree(tree = tree.oj, nodes = c(4L, 10L))
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "ListPriceDiff" "PctDiscMM"    
## Number of terminal nodes:  7 
## Residual mean deviance:  0.7748 = 614.4 / 793 
## Misclassification error rate: 0.1625 = 130 / 800
#The misclassification error rate is slightly higher for the pruned tree (0.1588 vs 0.1625).

#(K.)
prune.pred <- predict(prune.oj, OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
##           
## prune.pred  CH  MM
##         CH 160  36
##         MM   8  66
mean(prune.pred!=OJ.test$Purchase)
## [1] 0.162963
#Test error rate of the pruned tree is about 16.30%, slightly lower than the 17.04% for the unpruned tree.
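
# A direct side-by-side view (both prediction vectors were computed above):
c(unpruned = mean(tree.pred != OJ.test$Purchase),
  pruned   = mean(prune.pred != OJ.test$Purchase))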