3.

p <- seq(0, 1, length = 100)
gini  <- 2 * p * (1 - p)
error <- pmin(p, 1 - p)  
entropy <- -(
  ifelse(p == 0, 0, p * log2(p)) +
  ifelse(p == 1, 0, (1 - p) * log2(1 - p))
)
plot(p, gini,
     type = "l", lwd = 2,
     ylim = c(0, 1),
     xlab = expression(hat(p)[m1]),
     ylab = "Impurity / Entropy"
)
lines(p, error,  lwd = 2, lty = 2)
lines(p, entropy, lwd = 2, lty = 3)
legend("top",
       legend = c("Gini index", "Classification error", "Entropy"),
       lty    = c(1, 2, 3),
       lwd    = 2
)
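All three measures vanish at p̂_m1 = 0 and 1 and peak at p̂_m1 = 0.5, where the Gini index and classification error reach 0.5 and the entropy reaches 1; the Gini index and entropy are smooth, while the classification error is piecewise linear. As a quick numerical check on the vectors defined above (a small sketch, not required by the exercise):

# Maximum of each impurity measure over the grid and where it occurs
# (the grid has no point exactly at 0.5, so the maxima land just next to it)
data.frame(
  measure   = c("Gini index", "Classification error", "Entropy"),
  max_value = c(max(gini), max(error), max(entropy)),
  p_at_max  = c(p[which.max(gini)], p[which.max(error)], p[which.max(entropy)])
)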

8.

a.

library(ISLR)
library(tree)
library(randomForest)
install.packages("BayesTree")
## Installing package into 'C:/Users/austr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'BayesTree' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'BayesTree'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\austr\AppData\Local\R\win-library\4.4\00LOCK\BayesTree\libs\x64\BayesTree.dll
## to
## C:\Users\austr\AppData\Local\R\win-library\4.4\BayesTree\libs\x64\BayesTree.dll:
## Permission denied
## Warning: restored 'BayesTree'
## 
## The downloaded binary packages are in
##  C:\Users\austr\AppData\Local\Temp\RtmpycxC1W\downloaded_packages
library(BayesTree)
set.seed(42)

train_idx <- sample(seq_len(nrow(Carseats)), size = 0.7*nrow(Carseats))
Car_train <- Carseats[train_idx, ]
Car_test  <- Carseats[-train_idx, ]
tree.carseats <- tree(Sales ~ ., data = Car_train)

b.

install.packages("rpart.plot")   
## Installing package into 'C:/Users/austr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'rpart.plot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\austr\AppData\Local\Temp\RtmpycxC1W\downloaded_packages
library(rpart)
library(rpart.plot)
rt <- rpart(Sales ~ ., data = Car_train, method = "anova")


rpart.plot(rt,
           type    = 2,      
           extra   = 101,    
           tweak   = 1.2,    
           shadow.col = "gray90",
           fallen.leaves = TRUE,
           main    = "Regression Tree for Carseats (rpart.plot)")

pred.tree <- predict(tree.carseats, newdata = Car_test)
mse.tree  <- mean((Car_test$Sales - pred.tree)^2)
mse.tree
## [1] 4.214264

The test MSE I obtain with the unpruned tree from part (a) is 4.214264. (The rpart fit above is used only to produce a cleaner plot; the prediction uses tree.carseats.)

c.

cv.carseats <- cv.tree(tree.carseats, FUN = prune.tree)
plot(cv.carseats$size, cv.carseats$dev, type = "b",
     xlab = "Tree size (# terminal nodes)",
     ylab = "CV deviance")

best.size    <- cv.carseats$size[which.min(cv.carseats$dev)]
pruned.tree  <- prune.tree(tree.carseats, best = best.size)
plot(pruned.tree); text(pruned.tree, pretty = 0)

pred.pruned <- predict(pruned.tree, newdata = Car_test)
mse.pruned  <- mean((Car_test$Sales - pred.pruned)^2)
mse.pruned
## [1] 4.170757

Pruning gives a modest improvement: the test MSE drops from 4.214264 to 4.170757.
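For reference, a small comparison table of the two single-tree fits (a convenience sketch built from the objects above):

# Test MSE of the unpruned tree vs. the CV-pruned tree
data.frame(
  Model   = c("Unpruned tree", paste0("Pruned tree (", best.size, " leaves)")),
  TestMSE = c(mse.tree, mse.pruned)
)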

d.

set.seed(42)
bag.carseats <- randomForest(
  Sales ~ .,
  data = Car_train,
  mtry = ncol(Car_train) - 1,   # use all 10 predictors at each split, i.e. bagging
  ntree = 500,
  importance = TRUE
)
pred.bag <- predict(bag.carseats, newdata = Car_test)
mse.bag  <- mean((Car_test$Sales - pred.bag)^2)
mse.bag
## [1] 2.008767
importance(bag.carseats)
##                 %IncMSE IncNodePurity
## CompPrice   36.17125995    284.639884
## Income       9.17309008    122.646819
## Advertising 21.48508768    174.232464
## Population   0.07498017     83.290458
## Price       61.55615122    648.631596
## ShelveLoc   68.93170258    585.299090
## Age         18.06108045    197.675735
## Education    4.10601657     58.406892
## Urban        2.52710047      9.704960
## US           1.77709308      8.463419
varImpPlot(bag.carseats)

Bagging lowers the test MSE to 2.008767, less than half the error of the pruned single tree. The importance measures show that ShelveLoc and Price are by far the most influential predictors.
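As a check that does not touch the test set, the out-of-bag (OOB) error stored in the fitted randomForest object can be inspected (a minimal sketch using bag.carseats from above; for regression the object's mse component holds the OOB MSE after each tree):

# Final OOB estimate of the MSE and its evolution as trees are added
tail(bag.carseats$mse, 1)
plot(bag.carseats$mse, type = "l",
     xlab = "Number of trees", ylab = "OOB MSE",
     main = "Bagging: OOB error vs. number of trees")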

e.

set.seed(42)

rf.carseats <- randomForest(
  Sales ~ .,
  data = Car_train,
  mtry = floor((ncol(Car_train) - 1) / 3),   # roughly p/3 predictors per split (regression default)
  ntree = 500,
  importance = TRUE
)
pred.rf <- predict(rf.carseats, newdata = Car_test)
mse.rf  <- mean((Car_test$Sales - pred.rf)^2)
mse.rf
## [1] 2.642094

With mtry = 3 (roughly p/3), the random forest's test MSE is 2.642094, somewhat higher than bagging on this split.

importance(rf.carseats)
##                %IncMSE IncNodePurity
## CompPrice   17.6317246     216.58010
## Income       5.3918611     171.93417
## Advertising 15.4372996     208.01838
## Population   1.0167576     147.55649
## Price       40.9793223     507.22018
## ShelveLoc   46.1777066     461.41692
## Age         12.0906006     235.55478
## Education    2.8932704     102.44743
## Urban        0.3027265      19.85041
## US           3.5234743      30.83360
varImpPlot(rf.carseats)

mtry.values <- c(2, 4, 6, 8, 10)
mse.by.mtry  <- sapply(mtry.values, function(m) {
  rf <- randomForest(Sales ~ ., data = Car_train, mtry = m, ntree = 200)
  mean((Car_test$Sales - predict(rf, Car_test))^2)
})
data.frame(mtry = mtry.values, TestMSE = mse.by.mtry)
##   mtry  TestMSE
## 1    2 3.298154
## 2    4 2.414009
## 3    6 2.097258
## 4    8 2.019671
## 5   10 1.916187

Raising m lowers the test error here: with a small m the trees are more decorrelated, but each tree is more biased because strong predictors such as Price and ShelveLoc are often excluded from the candidate set at a split. For this data set the bias reduction from a larger m outweighs the lost decorrelation, so the test MSE keeps decreasing all the way up to m = 10, i.e. bagging.
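Because the sweep above picks m with the test set, it is mildly optimistic. A sketch of an alternative that tunes mtry using only out-of-bag error, via randomForest's tuneRF() helper (the argument values below are illustrative choices, not settings used elsewhere in this write-up):

set.seed(42)
# Search over mtry by doubling/halving from mtryStart, keeping a step only if
# it improves the OOB error by at least `improve` (relative)
oob.tune <- tuneRF(
  x          = Car_train[, setdiff(names(Car_train), "Sales")],
  y          = Car_train$Sales,
  mtryStart  = 3,
  ntreeTry   = 200,
  stepFactor = 2,
  improve    = 0.01,
  trace      = FALSE
)
oob.tune   # matrix of mtry values and their OOB errors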

f.

set.seed(42)
bart.carseats <- bart(
  x.train = model.matrix(Sales ~ . - 1, data = Car_train),
  y.train = Car_train$Sales,
  x.test  = model.matrix(Sales ~ . - 1, data = Car_test),
  ntree   = 50
)
## 
## 
## Running BART with numeric y
## 
## number of trees: 50
## Prior:
##  k: 2.000000
##  degrees of freedom in sigma prior: 3
##  quantile in sigma prior: 0.900000
##  power and base for tree prior: 2.000000 0.950000
##  use quantiles for rule cut points: 0
## data:
##  number of training observations: 280
##  number of test observations: 120
##  number of explanatory variables: 12
## 
## 
## Cutoff rules c in x<=c vs x>c
## Number of cutoffs: (var: number of possible c):
## (1: 100) (2: 100) (3: 100) (4: 100) (5: 100) 
## (6: 100) (7: 100) (8: 100) (9: 100) (10: 100) 
## (11: 100) (12: 100) 
## 
## 
## Running mcmc loop:
## iteration: 100 (of 1100)
## iteration: 200 (of 1100)
## iteration: 300 (of 1100)
## iteration: 400 (of 1100)
## iteration: 500 (of 1100)
## iteration: 600 (of 1100)
## iteration: 700 (of 1100)
## iteration: 800 (of 1100)
## iteration: 900 (of 1100)
## iteration: 1000 (of 1100)
## iteration: 1100 (of 1100)
## time for loop: 2
## 
## Tree sizes, last iteration:
## 2 2 3 2 2 2 2 2 2 4 3 2 1 3 3 3 3 2 2 2 
## 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 
## 3 2 2 3 2 2 2 2 2 2 Variable Usage, last iteration (var:count):
## (1: 10) (2: 2) (3: 7) (4: 4) (5: 11) 
## (6: 7) (7: 6) (8: 2) (9: 3) (10: 2) 
## (11: 1) (12: 5) 
## DONE BART 11-2-2014
pred.bart <- bart.carseats$yhat.test.mean
mse.bart  <- mean((Car_test$Sales - pred.bart)^2)
mse.bart
## [1] 1.249302

Based on this output, most of the 50 trees stay very small (size 2 or 3 in the last iteration), which is expected for BART, and the variable-usage counts show that a handful of predictors are used far more often than the rest. The resulting test MSE of 1.249302 is the lowest of all the methods tried here.
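To wrap up question 8, the table below collects the test-set MSEs computed above on the same 70/30 split (a convenience sketch using the mse.* objects already defined; exact values depend on the seed):

# Test-set MSE of each method, Carseats data
data.frame(
  Method  = c("Single tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  TestMSE = c(mse.tree, mse.pruned, mse.bag, mse.rf, mse.bart)
)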

9.

a.

library(ISLR2)   
## 
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(tree)
set.seed(123)

n   <- nrow(OJ)
idx <- sample(n, 800)
OJ.train <- OJ[idx, ]
OJ.test  <- OJ[-idx, ]
oj.tree <- tree(Purchase ~ ., data = OJ.train)

b.

train.pred <- predict(oj.tree, OJ.train, type = "class")
train.err  <- mean(train.pred != OJ.train$Purchase)
cat("Training error rate =", round(train.err, 4), "\n")
## Training error rate = 0.165

oj.sum <- summary(oj.tree)
oj.sum
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7625 = 603.9 / 792 
## Misclassification error rate: 0.165 = 132 / 800

Only two predictors are actually used in the splits, LoyalCH and PriceDiff, and the tree has 8 terminal nodes. The residual mean deviance is 0.7625, and the training misclassification error rate is 0.165 (16.5%), matching the rate computed directly above.

c.

oj.tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1071.00 CH ( 0.60875 0.39125 )  
##    2) LoyalCH < 0.5036 350  415.10 MM ( 0.28000 0.72000 )  
##      4) LoyalCH < 0.276142 170  131.00 MM ( 0.12941 0.87059 )  
##        8) LoyalCH < 0.0356415 56   10.03 MM ( 0.01786 0.98214 ) *
##        9) LoyalCH > 0.0356415 114  108.90 MM ( 0.18421 0.81579 ) *
##      5) LoyalCH > 0.276142 180  245.20 MM ( 0.42222 0.57778 )  
##       10) PriceDiff < 0.05 74   74.61 MM ( 0.20270 0.79730 ) *
##       11) PriceDiff > 0.05 106  144.50 CH ( 0.57547 0.42453 ) *
##    3) LoyalCH > 0.5036 450  357.10 CH ( 0.86444 0.13556 )  
##      6) PriceDiff < -0.39 27   32.82 MM ( 0.29630 0.70370 ) *
##      7) PriceDiff > -0.39 423  273.70 CH ( 0.90071 0.09929 )  
##       14) LoyalCH < 0.705326 130  135.50 CH ( 0.78462 0.21538 )  
##         28) PriceDiff < 0.145 43   58.47 CH ( 0.58140 0.41860 ) *
##         29) PriceDiff > 0.145 87   62.07 CH ( 0.88506 0.11494 ) *
##       15) LoyalCH > 0.705326 293  112.50 CH ( 0.95222 0.04778 ) *

I will pick terminal node 8. It contains 56 observations, has a deviance of 10.03, and predicts Minute Maid (MM); only 1.786% of the customers in this node actually bought CH, so the within-node error rate is about 1.8%. These are customers with extremely low CH loyalty (LoyalCH < 0.036), and they almost always buy MM.
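The same numbers can be read off the fitted object directly: the tree's frame has one row per node, with node numbers as row names (a minimal sketch):

# Frame row for node 8: split variable ("<leaf>" marks a terminal node),
# number of observations, deviance, predicted class, and class probabilities
oj.tree$frame["8", ]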

d.

plot(oj.tree)
text(oj.tree, pretty = 0, cex = 0.7)
title("Classification Tree for OJ (training set)")

We can see that CH loyalty (LoyalCH) is the dominant predictor: the first splits on both sides of the tree use it. Price difference only enters once a moderate level of brand loyalty is established, and a sufficiently large price penalty (PriceDiff < -0.39) can flip even fairly loyal customers to MM. At the other extreme, customers with very weak loyalty (LoyalCH < 0.28) are predicted to buy MM regardless of price; between 0.28 and 0.50 the prediction switches to CH only when CH is priced favourably.

e.

test.pred <- predict(oj.tree, OJ.test, type = "class")
cm <- table(OJ.test$Purchase, test.pred)
cm
##     test.pred
##       CH  MM
##   CH 150  16
##   MM  34  70
test.err <- mean(test.pred != OJ.test$Purchase)
cat("Test error rate =", round(test.err, 4), "\n")
## Test error rate = 0.1852

The test error rate is 0.1852, or 18.52%, slightly higher than the training error of 16.5%, as expected.
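Splitting the confusion matrix out by class (a short sketch using cm from above; rows are the true labels, columns the predictions):

# Per-class error rates alongside the overall test error
err.CH <- cm["CH", "MM"] / sum(cm["CH", ])   # true CH classified as MM
err.MM <- cm["MM", "CH"] / sum(cm["MM", ])   # true MM classified as CH
c(CH.error = err.CH, MM.error = err.MM, overall = test.err)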

f.

set.seed(123)
cv.oj <- cv.tree(oj.tree, FUN = prune.tree, method = "misclass")
cv.oj
## $size
## [1] 8 5 3 2 1
## 
## $dev
## [1] 139 139 157 167 313
## 
## $k
## [1] -Inf    0    8   11  154
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
best_idx  <- which.min(cv.oj$dev)                 # index of the lowest CV error
opt_size  <- cv.oj$size[best_idx]                 # corresponding tree size
full_size <- sum(oj.tree$frame$var == "<leaf>")   # number of leaves in the unpruned tree

cat("Full tree size:", full_size, "\n",
    "CV-optimal size:", opt_size, "\n")
## Full tree size: 8 
##  CV-optimal size: 8

g.

plot(cv.oj$size, cv.oj$dev, type = "b",
     xlab = "Tree size (no. terminal nodes)",
     ylab = "CV misclass. error",
     main = "CV for OJ Tree")

h.

opt_size <- cv.oj$size[which.min(cv.oj$dev)]
cat("Optimal tree size =", opt_size, "\n")
## Optimal tree size = 8

Sizes 8 and 5 both attain the minimum cross-validated error (dev = 139); which.min() returns the first of these, so the reported optimal size is 8 terminal nodes, i.e. the full tree. Since a 5-leaf tree achieves the same CV error, I prune to 5 terminal nodes in part (i).
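A small sketch that instead prefers the most parsimonious size among the tied values (the object name best.size.oj is mine):

# Smallest tree size whose CV error equals the minimum
best.size.oj <- min(cv.oj$size[cv.oj$dev == min(cv.oj$dev)])
best.size.oj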

i.

pruned.oj <- prune.tree(oj.tree, best = 5, method = "misclass")
plot(pruned.oj)
text(pruned.oj, pretty = 0, cex = 0.7)
title("Pruned OJ Tree")

j.

train.pred1 <- predict(oj.tree,    OJ.train, type = "class")
err1        <- mean(train.pred1 != OJ.train$Purchase)
train.pred2 <- predict(pruned.oj,  OJ.train, type = "class")
err2        <- mean(train.pred2 != OJ.train$Purchase)

cat("Unpruned training error:", round(err1,4), "\n",
    "Pruned   training error:", round(err2,4), "\n")
## Unpruned training error: 0.165 
##  Pruned   training error: 0.165

They are identical: the splits removed by pruning separate leaves that predict the same class (the cost-complexity sequence shows k = 0 for the step down to 5 leaves), so no training observation's predicted label changes and the training misclassification rate stays at 0.165.

k.

test.pred1 <- predict(oj.tree,   OJ.test, type = "class")
test.err1  <- mean(test.pred1 != OJ.test$Purchase)
test.pred2 <- predict(pruned.oj, OJ.test, type = "class")
test.err2  <- mean(test.pred2 != OJ.test$Purchase)
cat("Unpruned test error:", round(test.err1,4), "\n",
    "Pruned   test error:", round(test.err2,4), "\n")
## Unpruned test error: 0.1852 
##  Pruned   test error: 0.1852

Again the two values are equal. The leaves merged by pruning all carry the same predicted class, so no test observation's predicted label changes either, and the test error rate stays at 0.1852.
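As a final check (a quick sketch using the prediction vectors above), we can count how many training and test predictions actually differ between the unpruned and pruned trees:

# Number of observations whose predicted class changes after pruning
c(train.changed = sum(train.pred1 != train.pred2),
  test.changed  = sum(test.pred1  != test.pred2))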