STAT 4230 Homework #3

Problem #1

(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations, setting seed to 1 and using createDataPartition with Purchase as the response.

library(ISLR2)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

data("OJ")

set.seed(1)

train_index <- createDataPartition(OJ$Purchase, p = 800 / nrow(OJ), list = FALSE)

train <- OJ[train_index, ]
test  <- OJ[-train_index, ]

dim(train)

## [1] 801  18

dim(test)

## [1] 269  18

(b) Set seed to 2025. Fit a basic tree to the training data, with all other variables as predictors. Plot the tree and annotate it. What is the training error rate? How many terminal nodes does the tree have?

library(ISLR2)
library(tree)
library(caret)

data("OJ")

set.seed(2025)
train_index <- createDataPartition(OJ$Purchase, p = 800 / nrow(OJ), list = FALSE)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]

tree.oj <- tree(Purchase ~ ., data = train)

plot(tree.oj)
text(tree.oj, pretty = 0)

tree.pred <- predict(tree.oj, newdata = train, type = "class")
train_err <- mean(tree.pred != train$Purchase)
train_err

## [1] 0.1622971

summary(tree.oj)$size

## [1] 8

(c) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?

tree.pred.test <- predict(tree.oj, newdata = test, type = "class")
confusion <- table(Predicted = tree.pred.test, Actual = test$Purchase)
confusion

##          Actual
## Predicted  CH  MM
##        CH 138  20
##        MM  26  85

test_err <- mean(tree.pred.test != test$Purchase)
test_err

## [1] 0.1710037

(d) Set seed to 2025. Apply the cv.tree() function from the tree package to the training set in order to determine the optimal tree size based on mis-classification rates. Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.

set.seed(2025)
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
plot(cv.oj$size, cv.oj$dev, type = "b",
     xlab = "Tree Size", ylab = "CV Misclassification Error")

(e) Produce a pruned tree corresponding to the optimal tree size from (d) and annotate it.

min_err <- min(cv.oj$dev)
target <- min_err * 1.10
opt_size <- min(cv.oj$size[cv.oj$dev <= target])

prune.oj <- prune.misclass(tree.oj, best = opt_size)

plot(prune.oj)
text(prune.oj, pretty = 0)

(f) Compare the training error rates between the pruned and unpruned trees. Which is higher? Compare the test error rates between the pruned and unpruned trees. Which is higher?

pred_train_unpruned <- predict(tree.oj, train, type="class")
pred_train_pruned   <- predict(prune.oj, train, type="class")

train_err_unpruned <- mean(pred_train_unpruned != train$Purchase)
train_err_pruned   <- mean(pred_train_pruned   != train$Purchase)

pred_test_unpruned <- predict(tree.oj, test, type="class")
pred_test_pruned   <- predict(prune.oj, test, type="class")

test_err_unpruned <- mean(pred_test_unpruned != test$Purchase)
test_err_pruned   <- mean(pred_test_pruned   != test$Purchase)

c(train_err_unpruned, train_err_pruned, test_err_unpruned, test_err_pruned)

## [1] 0.1735331 0.1935081 0.1710037 0.1821561

Problem #2

(a) Check and ensure appropriate factorization of the outcome variable Own. Split the data set into a training set and a test set. Set seed to 123 and create an 80/20 data partition using createDataPartition.

library(ISLR2)
library(caret)

data("Credit")

Credit$Own <- as.factor(Credit$Own)

set.seed(123)
train_idx <- createDataPartition(Credit$Own, p = 0.8, list = FALSE)

train <- Credit[train_idx, ]
test  <- Credit[-train_idx, ]

(b) Set seed to 2025. Use the bagging approach with the training data in order to predict Own using all other variables, with 500 trees. Plot the variable importance and list the 3 most important variables based on node purity. Print the test confusion ma- trix and report the test mis-classification rate. Also calculate the OOB (Out of bag) misclassification rate.

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

set.seed(2025)

bag.credit <- randomForest(Own ~ ., data = train, mtry = ncol(train) - 1, ntree = 500, importance = TRUE)

varImpPlot(bag.credit, type = 2)

imp <- importance(bag.credit, type = 2)
head(sort(imp[,1], decreasing = TRUE), 3)

##   Income      Age    Limit 
## 31.97469 31.12184 22.13308

pred_test <- predict(bag.credit, test)
tbl <- table(Predicted = pred_test, Actual = test$Own)
tbl

##          Actual
## Predicted No Yes
##       No  16  18
##       Yes 22  23

mean(pred_test != test$Own)

## [1] 0.5063291

bag.credit$err.rate[500, "OOB"]

##       OOB 
## 0.4641745

(c) Set seed to 2025. Improve the bagging model by trying out 50, 100, 200, 500 trees, keeping all else unchanged. And pick the best performing model based on the OOB error (mis-classification) rate. Also, calculate the test error for this final bagging model.

library(randomForest)

set.seed(2025)
trees <- c(50, 100, 200, 500)

models <- lapply(trees, function(t) {
  randomForest(Own ~ ., data = train, mtry = ncol(train)-1, ntree = t)
})

oob_errors <- sapply(models, function(m) m$err.rate[nrow(m$err.rate), "OOB"])

best_model <- models[[which.min(oob_errors)]]

pred_test <- predict(best_model, test)
mean(pred_test != test$Own)

## [1] 0.5189873

(d) Set seed to 2025. Use random forests to analyze this training data. Use 500 trees and start with an m equal to the rounded value of sqrt(p). Plot the variable importance and list the 3 most important variables based on node purity. Print the test confusion matrix and report the test mis-classification rate. Also calculate the OOB (Out of bag) misclassification rate.

library(randomForest)

set.seed(2025)

p <- ncol(train) - 1
mtry_val <- round(sqrt(p))

rf.credit <- randomForest(Own ~ ., data = train, mtry = mtry_val, ntree = 500, importance = TRUE)

varImpPlot(rf.credit, type = 2)

imp <- importance(rf.credit, type = 2)
head(sort(imp[,1], decreasing = TRUE), 3)

##   Income      Age    Limit 
## 26.91730 26.03310 25.23826

pred_test <- predict(rf.credit, test)
table(Predicted = pred_test, Actual = test$Own)

##          Actual
## Predicted No Yes
##       No  16  18
##       Yes 22  23

mean(pred_test != test$Own)

## [1] 0.5063291

rf.credit$err.rate[500, "OOB"]

##       OOB 
## 0.4890966

(e) Set seed to 2025. Improve the RF model by trying out mtry values from 2 to 9, with 500 trees each. And pick the best performing model based on the OOB error (mis-classification) rate. Also, calculate the test error for this final RF model.

library(randomForest)

set.seed(2025)
m_vals <- 2:9

rf_models <- lapply(m_vals, function(m) {
  randomForest(Own ~ ., data = train, mtry = m, ntree = 500)
})

oob_errs <- sapply(rf_models, function(m) m$err.rate[500, "OOB"])

best_rf <- rf_models[[which.min(oob_errs)]]

pred_test <- predict(best_rf, test)
mean(pred_test != test$Own)

## [1] 0.5316456

Problem #3

**(a) Remove any rows with missing values, and then log-transform the salaries. Report the final dimensions and a summary of the transformed target variable.

library(ISLR2)
library(gbm)

## Loaded gbm 2.2.2

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

data("Hitters")
Hitters <- na.omit(Hitters)

Hitters$Salary <- log(Hitters$Salary)

dim(Hitters)

## [1] 263  20

summary(Hitters$Salary)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.212   5.247   6.052   5.927   6.620   7.808

**(b) Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.

train <- Hitters[1:200, ]
test  <- Hitters[201:nrow(Hitters), ]

(c) Set seed to 2025. Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ from 0.001 to 0.251 (in increments of 0.05). Use a tree depth of 3. Print the test MSE for the optimal λ. Produce a plot with different λ values on the x-axis and the corresponding test set MSE on the y-axis. State your conclusions.

library(gbm)

set.seed(2025)
lambdas <- seq(0.001, 0.251, by = 0.05)

mse_vals <- sapply(lambdas, function(lam) {
  m <- gbm(Salary ~ ., data = train,
           distribution = "gaussian",
           n.trees = 1000,
           shrinkage = lam,
           interaction.depth = 3)
  pred <- predict(m, newdata = test, n.trees = 1000)
  mean((test$Salary - pred)^2)
})

plot(lambdas, mse_vals, type = "b",
     xlab = "Shrinkage (λ)", ylab = "Test MSE")

min(mse_vals)

## [1] 0.2711232

lambdas[which.min(mse_vals)]

## [1] 0.051

(d) Set seed to 2025. Perform boosting on the training set with 1,000 trees for a shrinkage parameter λ of 0.1. Evaluate tree depth values of 2, 4, 6, 8. Print the test MSE for the optimal depth. Produce a plot with different depth values on the x-axis and the corresponding test set MSE on the y-axis. State your conclusions.

library(gbm)

set.seed(2025)
depths <- c(2, 4, 6, 8)

mse_vals <- sapply(depths, function(d) {
  m <- gbm(Salary ~ ., data = train,
           distribution = "gaussian",
           n.trees = 1000,
           shrinkage = 0.1,
           interaction.depth = d)
  pred <- predict(m, newdata = test, n.trees = 1000)
  mean((test$Salary - pred)^2)
})

plot(depths, mse_vals, type = "b",
     xlab = "Tree Depth", ylab = "Test MSE")

min(mse_vals)

## [1] 0.2809147

depths[which.min(mse_vals)]

## [1] 2

(e) et seed to 2025. Simultaneously explore λ and tree depth in combination across the grid values in (c) and (d) to find the optimal combination. Set seed to 2025 again and find the test MSE for this final optimal model; compare it to the models in (c) and (d). Which variables appear to be the most important predictors? Plot and discuss. Hint: Use a nested for loop, i.e., one parameter varies wihtin another.

library(gbm)

set.seed(2025)

lambdas <- seq(0.001, 0.251, by = 0.05)
depths  <- c(2, 4, 6, 8)

results <- data.frame()

for (lam in lambdas) {
  for (d in depths) {
    m <- gbm(Salary ~ ., data = train,
             distribution = "gaussian",
             n.trees = 1000,
             shrinkage = lam,
             interaction.depth = d)
    pred <- predict(m, newdata = test, n.trees = 1000)
    mse  <- mean((test$Salary - pred)^2)
    
    results <- rbind(results, data.frame(lambda = lam, depth = d, mse = mse))
  }
}

best <- results[which.min(results$mse), ]

final_model <- gbm(Salary ~ ., data = train,
                   distribution = "gaussian",
                   n.trees = 1000,
                   shrinkage = best$lambda,
                   interaction.depth = best$depth)

best$mse

## [1] 0.2592961

summary(final_model)

##                 var    rel.inf
## CWalks       CWalks 22.6318984
## CAtBat       CAtBat 15.3972880
## Walks         Walks  6.6273417
## PutOuts     PutOuts  6.1040691
## CRBI           CRBI  6.0360319
## CHits         CHits  5.4806510
## CRuns         CRuns  4.5063919
## Years         Years  4.4325760
## RBI             RBI  4.4033618
## CHmRun       CHmRun  3.9093431
## Assists     Assists  3.8578443
## AtBat         AtBat  3.8412164
## Hits           Hits  3.6487515
## Runs           Runs  3.2573339
## HmRun         HmRun  2.7658495
## Errors       Errors  1.9365964
## NewLeague NewLeague  0.4850165
## League       League  0.3691300
## Division   Division  0.3093086

STAT 4230 Homework #3

Ty Gelman

11/10/2025

STAT 4230 Homework #3

Problem #1

Problem #2

Problem #3