knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Chapter 8 Questions

3.

pm1 <- seq(0, 1, by = 0.01)
gini <- 2 * pm1 * (1 - pm1)
class_error <- 1 - pmax(pm1, 1 - pm1)
entropy <- ifelse(pm1 == 0 | pm1 == 1, 0,
                  -pm1 * log2(pm1) - (1 - pm1) * log2(1 - pm1))

plot(pm1, gini, type = "l", lwd = 2, col = "blue", ylim = c(0, 1),
     ylab = "Impurity Measure", xlab = expression(p[m1]),
     main = "Impurity Measures vs. Class Probability")
lines(pm1, class_error, col = "red", lwd = 2, lty = 2)
lines(pm1, entropy, col = "darkgreen", lwd = 2, lty = 3)

legend("top", legend = c("Gini", "Classification Error", "Entropy"),
       col = c("blue", "red", "darkgreen"), lty = 1:3, lwd = 2)

8.

library(ISLR2)
library(tree)
library(randomForest)
library(BART)
data(Carseats)

a.

set.seed(1)

train_index <- sample(1:nrow(Carseats), nrow(Carseats)/2)

train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]

b.

tree_model <- tree(Sales ~ ., data = train)
summary(tree_model)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
plot(tree_model)

preds <- predict(tree_model, newdata = test)
mse <- mean((preds - test$Sales)^2)
mse
## [1] 4.922039

c.

set.seed(2)
cv_result <- cv.tree(tree_model)

plot(cv_result$size, cv_result$dev, type = "b",
     xlab = "Tree Size (Number of Terminal Nodes)",
     ylab = "Deviance (CV Error)",
     main = "Cross-Validation Results")

best_size <- cv_result$size[which.min(cv_result$dev)]
best_size
## [1] 18
pruned_tree <- prune.tree(tree_model, best = best_size)
plot(pruned_tree)

pruned_pred <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pruned_pred - test$Sales)^2)
mse_pruned
## [1] 4.922039

d.

set.seed(1)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)

bag_pred <- predict(bag_model, newdata = test)
mse_bag <- mean((bag_pred - test$Sales)^2)
mse_bag
## [1] 2.605253
importance(bag_model)
##                %IncMSE IncNodePurity
## CompPrice   24.8888481    170.182937
## Income       4.7121131     91.264880
## Advertising 12.7692401     97.164338
## Population  -1.8074075     58.244596
## Price       56.3326252    502.903407
## ShelveLoc   48.8886689    380.032715
## Age         17.7275460    157.846774
## Education    0.5962186     44.598731
## Urban        0.1728373      9.822082
## US           4.2172102     18.073863

e. If a small number of variables is used, each tree is more different from the others, which can help the model avoid over fitting and make better general predictions. If m is too small, the trees may miss important variables and make worse predictions. If a large m is used, the trees become more similar, which can reduce the benefit of using a forest of many trees. In this data set, choosing a middle value for m gave the best test performance and it allowed the model to use useful information without over fitting the training data.

set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 4, importance = TRUE)

rf_pred <- predict(rf_model, newdata = test)
mse_rf <- mean((rf_pred - test$Sales)^2)
mse_rf
## [1] 2.787584
importance(rf_model)
##                %IncMSE IncNodePurity
## CompPrice   15.7891655     160.57944
## Income       4.1275509     121.12953
## Advertising  9.6425758     111.54581
## Population  -1.3596645      85.92575
## Price       43.4055391     423.06225
## ShelveLoc   37.8850232     311.97119
## Age         13.8924424     174.18229
## Education    0.1960888      62.77782
## Urban        0.1393816      12.92952
## US           6.3532441      30.42255

f. Using BART to predict how many car seats a store might sell, it performed very well, with a low test error, meaning its guesses were usually close to the real numbers. BART was especially good at avoiding mistakes that come from over fitting. Overall, BART gave accurate predictions and helped highlight which store features most affect sales.

set.seed(1)

x_train <- train[, -which(names(train) == "Sales")]
y_train <- train$Sales
x_test <- test[, -which(names(test) == "Sales")]
y_test <- test$Sales

bart_model <- gbart(x.train = x_train, y.train = y_train, x.test = x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.273474,3,0.23074,7.57815
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 7s
## trcnt,tecnt: 1000,1000
bart_preds <- bart_model$yhat.test.mean
mse_bart <- mean((bart_preds - y_test)^2)
mse_bart
## [1] 1.450842

9.

data(OJ)

a.

set.seed(1)
train_index <- sample(1:nrow(OJ), 800)

train <- OJ[train_index, ]
test <- OJ[-train_index, ]

b.

tree_model <- tree(Purchase ~ ., data = train)
summary(tree_model)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800

c. The tree works by asking yes-or-no questions and then splitting the data based on those answers. Each final decision point in the tree is called a terminal node, and the tree being modeled has 9 terminal nodes. After testing the tree on the same data it was trained on, it got about 85% of the answers right, which means it had a training error rate of about 15%.

plot(tree_model)

d. Evaluating node 8, the model looked at how loyal a customer is to Tropicana(CH). If their loyalty score was less than 0.0356, they were put into this group. There were 59 people in this group, and the tree predicted that most of them (about 98%) would not buy Tropicana and instead chose Minute Maid. This shows that customers who weren’t very loyal to Tropicana are still more likely to choose Minute Maid. The tree uses these kinds of patterns to help stores predict what customers are likely to buy.

tree_model
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *

e.

test_preds <- predict(tree_model, newdata = test, type = "class")

conf_matrix <- table(Predicted = test_preds, Actual = test$Purchase)
print(conf_matrix)
##          Actual
## Predicted  CH  MM
##        CH 160  38
##        MM   8  64
test_error_rate <- mean(test_preds != test$Purchase)
print(test_error_rate)
## [1] 0.1703704

f.

set.seed(1)
cv_result <- cv.tree(tree_model, FUN = prune.misclass)
print(cv_result)
## $size
## [1] 9 8 7 4 2 1
## 
## $dev
## [1] 145 145 146 146 167 315
## 
## $k
## [1]       -Inf   0.000000   3.000000   4.333333  10.500000 151.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
best_size <- cv_result$size[which.min(cv_result$dev)]
best_size
## [1] 9

g.

set.seed(1)
cv_result <- cv.tree(tree_model, FUN = prune.misclass)

plot(cv_result$size, cv_result$dev, type = "b", pch = 19,
     xlab = "Tree Size (Number of Terminal Nodes)",
     ylab = "CV Classification Error",
     main = "Cross-Validated Error vs. Tree Size")

h. Tree size 9

i.

pruned_tree <- prune.misclass(tree_model, best = best_size)

plot(pruned_tree)
text(pruned_tree, pretty = 0)
title("Pruned Classification Tree")

j and k. They are all the same since the pruned tree and the unpruned tree are the same.

train_pred_unpruned <- predict(tree_model, newdata = train, type = "class")
train_error_unpruned <- mean(train_pred_unpruned != train$Purchase)

train_pred_pruned <- predict(pruned_tree, newdata = train, type = "class")
train_error_pruned <- mean(train_pred_pruned != train$Purchase)

test_pred_unpruned <- predict(tree_model, newdata = test, type = "class")
test_error_unpruned <- mean(test_pred_unpruned != test$Purchase)

test_pred_pruned <- predict(pruned_tree, newdata = test, type = "class")
test_error_pruned <- mean(test_pred_pruned != test$Purchase)

data.frame(
  Model = c("Unpruned Tree", "Pruned Tree"),
  Train_Error = c(train_error_unpruned, train_error_pruned),
  Test_Error = c(test_error_unpruned, test_error_pruned)
)
##           Model Train_Error Test_Error
## 1 Unpruned Tree     0.15875  0.1703704
## 2   Pruned Tree     0.15875  0.1703704