(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations, setting seed to 1 and using createDataPartition with Purchase as the response.
library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data("OJ")
set.seed(1)
train_index <- createDataPartition(OJ$Purchase, p = 800 / nrow(OJ), list = FALSE)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
dim(train)
## [1] 801 18
dim(test)
## [1] 269 18
(b) Set seed to 2025. Fit a basic tree to the training data, with all other variables as predictors. Plot the tree and annotate it. What is the training error rate? How many terminal nodes does the tree have?
library(ISLR2)
library(tree)
library(caret)
data("OJ")
set.seed(2025)
train_index <- createDataPartition(OJ$Purchase, p = 800 / nrow(OJ), list = FALSE)
train <- OJ[train_index, ]
test <- OJ[-train_index, ]
tree.oj <- tree(Purchase ~ ., data = train)
plot(tree.oj)
text(tree.oj, pretty = 0)
tree.pred <- predict(tree.oj, newdata = train, type = "class")
train_err <- mean(tree.pred != train$Purchase)
train_err
## [1] 0.1622971
summary(tree.oj)$size
## [1] 8
(c) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
tree.pred.test <- predict(tree.oj, newdata = test, type = "class")
confusion <- table(Predicted = tree.pred.test, Actual = test$Purchase)
confusion
## Actual
## Predicted CH MM
## CH 138 20
## MM 26 85
test_err <- mean(tree.pred.test != test$Purchase)
test_err
## [1] 0.1710037
(d) Set seed to 2025. Apply the cv.tree() function from the tree package to the training set in order to determine the optimal tree size based on mis-classification rates. Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.
set.seed(2025)
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
plot(cv.oj$size, cv.oj$dev, type = "b",
xlab = "Tree Size", ylab = "CV Misclassification Error")
(e) Produce a pruned tree corresponding to the optimal tree size from (d) and annotate it.
min_err <- min(cv.oj$dev)
target <- min_err * 1.10
opt_size <- min(cv.oj$size[cv.oj$dev <= target])
prune.oj <- prune.misclass(tree.oj, best = opt_size)
plot(prune.oj)
text(prune.oj, pretty = 0)
(f) Compare the training error rates between the pruned and unpruned trees. Which is higher? Compare the test error rates between the pruned and unpruned trees. Which is higher?
pred_train_unpruned <- predict(tree.oj, train, type="class")
pred_train_pruned <- predict(prune.oj, train, type="class")
train_err_unpruned <- mean(pred_train_unpruned != train$Purchase)
train_err_pruned <- mean(pred_train_pruned != train$Purchase)
pred_test_unpruned <- predict(tree.oj, test, type="class")
pred_test_pruned <- predict(prune.oj, test, type="class")
test_err_unpruned <- mean(pred_test_unpruned != test$Purchase)
test_err_pruned <- mean(pred_test_pruned != test$Purchase)
c(train_err_unpruned, train_err_pruned, test_err_unpruned, test_err_pruned)
## [1] 0.1735331 0.1935081 0.1710037 0.1821561
(a) Check and ensure appropriate factorization of the outcome variable Own. Split the data set into a training set and a test set. Set seed to 123 and create an 80/20 data partition using createDataPartition.
library(ISLR2)
library(caret)
data("Credit")
Credit$Own <- as.factor(Credit$Own)
set.seed(123)
train_idx <- createDataPartition(Credit$Own, p = 0.8, list = FALSE)
train <- Credit[train_idx, ]
test <- Credit[-train_idx, ]
(b) Set seed to 2025. Use the bagging approach with the training data in order to predict Own using all other variables, with 500 trees. Plot the variable importance and list the 3 most important variables based on node purity. Print the test confusion ma- trix and report the test mis-classification rate. Also calculate the OOB (Out of bag) misclassification rate.
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(2025)
bag.credit <- randomForest(Own ~ ., data = train, mtry = ncol(train) - 1, ntree = 500, importance = TRUE)
varImpPlot(bag.credit, type = 2)
imp <- importance(bag.credit, type = 2)
head(sort(imp[,1], decreasing = TRUE), 3)
## Income Age Limit
## 31.97469 31.12184 22.13308
pred_test <- predict(bag.credit, test)
tbl <- table(Predicted = pred_test, Actual = test$Own)
tbl
## Actual
## Predicted No Yes
## No 16 18
## Yes 22 23
mean(pred_test != test$Own)
## [1] 0.5063291
bag.credit$err.rate[500, "OOB"]
## OOB
## 0.4641745
(c) Set seed to 2025. Improve the bagging model by trying out 50, 100, 200, 500 trees, keeping all else unchanged. And pick the best performing model based on the OOB error (mis-classification) rate. Also, calculate the test error for this final bagging model.
library(randomForest)
set.seed(2025)
trees <- c(50, 100, 200, 500)
models <- lapply(trees, function(t) {
randomForest(Own ~ ., data = train, mtry = ncol(train)-1, ntree = t)
})
oob_errors <- sapply(models, function(m) m$err.rate[nrow(m$err.rate), "OOB"])
best_model <- models[[which.min(oob_errors)]]
pred_test <- predict(best_model, test)
mean(pred_test != test$Own)
## [1] 0.5189873
(d) Set seed to 2025. Use random forests to analyze this training data. Use 500 trees and start with an m equal to the rounded value of sqrt(p). Plot the variable importance and list the 3 most important variables based on node purity. Print the test confusion matrix and report the test mis-classification rate. Also calculate the OOB (Out of bag) misclassification rate.
library(randomForest)
set.seed(2025)
p <- ncol(train) - 1
mtry_val <- round(sqrt(p))
rf.credit <- randomForest(Own ~ ., data = train, mtry = mtry_val, ntree = 500, importance = TRUE)
varImpPlot(rf.credit, type = 2)
imp <- importance(rf.credit, type = 2)
head(sort(imp[,1], decreasing = TRUE), 3)
## Income Age Limit
## 26.91730 26.03310 25.23826
pred_test <- predict(rf.credit, test)
table(Predicted = pred_test, Actual = test$Own)
## Actual
## Predicted No Yes
## No 16 18
## Yes 22 23
mean(pred_test != test$Own)
## [1] 0.5063291
rf.credit$err.rate[500, "OOB"]
## OOB
## 0.4890966
(e) Set seed to 2025. Improve the RF model by trying out mtry values from 2 to 9, with 500 trees each. And pick the best performing model based on the OOB error (mis-classification) rate. Also, calculate the test error for this final RF model.
library(randomForest)
set.seed(2025)
m_vals <- 2:9
rf_models <- lapply(m_vals, function(m) {
randomForest(Own ~ ., data = train, mtry = m, ntree = 500)
})
oob_errs <- sapply(rf_models, function(m) m$err.rate[500, "OOB"])
best_rf <- rf_models[[which.min(oob_errs)]]
pred_test <- predict(best_rf, test)
mean(pred_test != test$Own)
## [1] 0.5316456
**(a) Remove any rows with missing values, and then log-transform the salaries. Report the final dimensions and a summary of the transformed target variable.
library(ISLR2)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
data("Hitters")
Hitters <- na.omit(Hitters)
Hitters$Salary <- log(Hitters$Salary)
dim(Hitters)
## [1] 263 20
summary(Hitters$Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.212 5.247 6.052 5.927 6.620 7.808
**(b) Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.
train <- Hitters[1:200, ]
test <- Hitters[201:nrow(Hitters), ]
(c) Set seed to 2025. Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ from 0.001 to 0.251 (in increments of 0.05). Use a tree depth of 3. Print the test MSE for the optimal λ. Produce a plot with different λ values on the x-axis and the corresponding test set MSE on the y-axis. State your conclusions.
library(gbm)
set.seed(2025)
lambdas <- seq(0.001, 0.251, by = 0.05)
mse_vals <- sapply(lambdas, function(lam) {
m <- gbm(Salary ~ ., data = train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = lam,
interaction.depth = 3)
pred <- predict(m, newdata = test, n.trees = 1000)
mean((test$Salary - pred)^2)
})
plot(lambdas, mse_vals, type = "b",
xlab = "Shrinkage (λ)", ylab = "Test MSE")
min(mse_vals)
## [1] 0.2711232
lambdas[which.min(mse_vals)]
## [1] 0.051
(d) Set seed to 2025. Perform boosting on the training set with 1,000 trees for a shrinkage parameter λ of 0.1. Evaluate tree depth values of 2, 4, 6, 8. Print the test MSE for the optimal depth. Produce a plot with different depth values on the x-axis and the corresponding test set MSE on the y-axis. State your conclusions.
library(gbm)
set.seed(2025)
depths <- c(2, 4, 6, 8)
mse_vals <- sapply(depths, function(d) {
m <- gbm(Salary ~ ., data = train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = 0.1,
interaction.depth = d)
pred <- predict(m, newdata = test, n.trees = 1000)
mean((test$Salary - pred)^2)
})
plot(depths, mse_vals, type = "b",
xlab = "Tree Depth", ylab = "Test MSE")
min(mse_vals)
## [1] 0.2809147
depths[which.min(mse_vals)]
## [1] 2
(e) et seed to 2025. Simultaneously explore λ and tree depth in combination across the grid values in (c) and (d) to find the optimal combination. Set seed to 2025 again and find the test MSE for this final optimal model; compare it to the models in (c) and (d). Which variables appear to be the most important predictors? Plot and discuss. Hint: Use a nested for loop, i.e., one parameter varies wihtin another.
library(gbm)
set.seed(2025)
lambdas <- seq(0.001, 0.251, by = 0.05)
depths <- c(2, 4, 6, 8)
results <- data.frame()
for (lam in lambdas) {
for (d in depths) {
m <- gbm(Salary ~ ., data = train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = lam,
interaction.depth = d)
pred <- predict(m, newdata = test, n.trees = 1000)
mse <- mean((test$Salary - pred)^2)
results <- rbind(results, data.frame(lambda = lam, depth = d, mse = mse))
}
}
best <- results[which.min(results$mse), ]
final_model <- gbm(Salary ~ ., data = train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = best$lambda,
interaction.depth = best$depth)
best$mse
## [1] 0.2592961
summary(final_model)
## var rel.inf
## CWalks CWalks 22.6318984
## CAtBat CAtBat 15.3972880
## Walks Walks 6.6273417
## PutOuts PutOuts 6.1040691
## CRBI CRBI 6.0360319
## CHits CHits 5.4806510
## CRuns CRuns 4.5063919
## Years Years 4.4325760
## RBI RBI 4.4033618
## CHmRun CHmRun 3.9093431
## Assists Assists 3.8578443
## AtBat AtBat 3.8412164
## Hits Hits 3.6487515
## Runs Runs 3.2573339
## HmRun HmRun 2.7658495
## Errors Errors 1.9365964
## NewLeague NewLeague 0.4850165
## League League 0.3691300
## Division Division 0.3093086