# Create a sequence of class 1 probabilities from 0 to 1
p <- seq(0, 1, length.out = 200)

# Gini index: 2 * p * (1 - p)
gini <- 2 * p * (1 - p)

# Entropy: -p*log2(p) - (1-p)*log2(1-p)
entropy <- -p * log2(p) - (1 - p) * log2(1 - p)
entropy[is.nan(entropy)] <- 0  # Replace NaNs from log(0)

# Classification error: 1 - max(p, 1 - p)
classification_error <- 1 - pmax(p, 1 - p)
# Plot
plot(p, gini, type = "l", col = "blue", lwd = 2, ylim = c(0, 1),
xlab = expression(hat(p)[m1]), ylab = "Impurity Measure",
main = "Gini, Entropy, and Classification Error")
lines(p, entropy, col = "red", lwd = 2)
lines(p, classification_error, col = "green4", lwd = 2)
legend("top", legend = c("Gini Index", "Entropy", "Classification Error"),
col = c("blue", "red", "green4"), lwd = 2)
Homework 7
Assignment 7
Problem 3
Problem 8
Part A
# Load necessary libraries
library(ISLR2)
library(tree)
# Set random seed for reproducibility
set.seed(1)
# Create training index: a 70/30 split (this could be changed to 50/50 or 80/20)
train_index <- sample(1:nrow(Carseats), size = 0.7 * nrow(Carseats))

# Split the data
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
Part B
# Fit regression tree to training data
tree.carseats <- tree(Sales ~ ., data = train)
# Plot the tree
plot(tree.carseats)
text(tree.carseats, pretty = 0, cex = 0.7) # Smaller labels
title("Regression Tree for Carseats Sales")
# Summary of the tree
summary(tree.carseats)
Regression tree:
tree(formula = Sales ~ ., data = train)
Variables actually used in tree construction:
[1] "ShelveLoc" "Price" "Age" "Income" "CompPrice"
[6] "Advertising"
Number of terminal nodes: 18
Residual mean deviance: 2.409 = 631.1 / 262
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.77800 -0.96100 -0.08865 0.00000 1.01800 4.14100
# Predict on the test set
preds <- predict(tree.carseats, newdata = test)

# Compute test MSE
mse <- mean((preds - test$Sales)^2)
print(mse)
[1] 4.208383
Part C
# Perform cross-validation on the full tree
cv.carseats <- cv.tree(tree.carseats)
# View CV results
plot(cv.carseats$size, cv.carseats$dev, type = "b",
xlab = "Tree Size", ylab = "Deviance (CV Error)",
main = "Cross-Validation for Tree Complexity")
# Get the size that gives the minimum deviance
optimal_size <- cv.carseats$size[which.min(cv.carseats$dev)]
optimal_size
[1] 18
# Prune the tree
pruned.tree <- prune.tree(tree.carseats, best = optimal_size)
# Plot the pruned tree
plot(pruned.tree)
text(pruned.tree, pretty = 0, cex = 0.8)
# Predict using the original full tree
pred.full <- predict(tree.carseats, newdata = test)

# Predict using the pruned tree
pred.pruned <- predict(pruned.tree, newdata = test)

# Calculate MSEs
mse.full <- mean((pred.full - test$Sales)^2)
mse.pruned <- mean((pred.pruned - test$Sales)^2)
mse.full
[1] 4.208383
mse.pruned
[1] 4.208383
I used cross-validation to find the best tree size, and the minimum CV deviance occurred at 18 terminal nodes, which is the full tree. Pruning with best = 18 therefore leaves the tree unchanged, so the pruned and unpruned trees give the identical test MSE of about 4.21. In this case pruning did not improve test performance; if cross-validation had favored a smaller tree, pruning would have simplified the model, and a similar or lower test MSE would have indicated that the removed splits were not contributing to generalization.
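Although cross-validation did not call for pruning here, a short sketch (not part of the original analysis; sizes and prune_mse are illustrative names) shows how the test MSE behaves as the tree is forced down to smaller sizes:

# Sketch: test MSE as the fitted tree is pruned to progressively smaller sizes.
# prune.tree() returns the best subtree with (approximately) the requested number of leaves.
sizes <- 2:18
prune_mse <- sapply(sizes, function(k) {
  pt <- prune.tree(tree.carseats, best = k)
  mean((predict(pt, newdata = test) - test$Sales)^2)
})
data.frame(size = sizes, Test_MSE = round(prune_mse, 3))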
Part D
# Load library
library(randomForest)
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
# Make sure 'Sales' is treated as a numeric variable (it should be by default)
# Bagging is done by setting mtry equal to the number of predictors
set.seed(1) # For reproducibility
# Fit bagging model on training data
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
# Predict on test data
bag_predictions <- predict(bag_model, newdata = test)

# Compute test MSE
bag_mse <- mean((bag_predictions - test$Sales)^2)
print(bag_mse)
[1] 2.573252
# Variable importance
importance(bag_model)
%IncMSE IncNodePurity
CompPrice 35.324343 230.55353
Income 7.923790 118.79213
Advertising 19.736816 155.03099
Population -3.479681 63.11975
Price 70.613500 673.11982
ShelveLoc 68.147204 637.69500
Age 20.964215 228.90345
Education 4.705263 63.13124
Urban -2.098091 11.66651
US 1.389570 11.15329
varImpPlot(bag_model)
Part E
library(randomForest)
set.seed(1) # for reproducibility
# Fit random forest model on training data
rf_model <- randomForest(Sales ~ ., data = train, mtry = floor((ncol(train) - 1)/3), importance = TRUE)
# Predict on test set
rf_predictions <- predict(rf_model, newdata = test)

# Compute test MSE
rf_mse <- mean((rf_predictions - test$Sales)^2)
print(rf_mse)
[1] 2.916397
importance(rf_model)
%IncMSE IncNodePurity
CompPrice 17.2100430 210.15573
Income 3.4014037 160.89217
Advertising 13.6746780 159.13767
Population -0.2475774 135.70825
Price 46.2036608 569.50565
ShelveLoc 43.3791520 482.15849
Age 17.6506048 266.80028
Education 0.1961045 104.19820
Urban -2.3521604 21.62521
US 4.8708135 33.45262
varImpPlot(rf_model)
# Try different mtry values
mtry_vals <- c(2, 4, 6, 8, 10)
mse_vals <- numeric(length(mtry_vals))

for (i in 1:length(mtry_vals)) {
  model <- randomForest(Sales ~ ., data = train, mtry = mtry_vals[i])
  preds <- predict(model, newdata = test)
  mse_vals[i] <- mean((preds - test$Sales)^2)
}
# Print results
data.frame(mtry = mtry_vals, Test_MSE = mse_vals)
mtry Test_MSE
1 2 3.327184
2 4 2.663770
3 6 2.529206
4 8 2.526125
5 10 2.572272
I tested different values for mtry to see how the number of variables considered at each split affects the test MSE in a random forest model. The results showed that mtry = 6 and mtry = 8 gave the lowest test MSE, around 2.53. Smaller values like 2 led to higher error, while larger values beyond 8 slightly increased the error again. This suggests that a moderate mtry provides the best balance between bias and variance in the model.
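An alternative, sketched below, is to compare mtry values on out-of-bag (OOB) error so the test set is only used once at the end. This is an illustrative sketch that reuses mtry_vals from above; for a regression forest, randomForest stores the running OOB MSE in the fitted object's mse component.

# Sketch: tune mtry on out-of-bag error instead of the test set
set.seed(1)
oob_mse <- sapply(mtry_vals, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m)
  tail(fit$mse, 1)  # OOB MSE after all trees have been grown
})
data.frame(mtry = mtry_vals, OOB_MSE = round(oob_mse, 3))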
Part F
library(ISLR2)
library(BayesTree)
# Step 1: Clean data and prepare predictors/outcome
data(Carseats)
Carseats$ShelveLoc <- as.numeric(Carseats$ShelveLoc)
Carseats$Urban <- ifelse(Carseats$Urban == "Yes", 1, 0)
Carseats$US <- ifelse(Carseats$US == "Yes", 1, 0)
# Step 2: Split into train/test sets
set.seed(123)
train_index <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))

x_train <- Carseats[train_index, !(names(Carseats) %in% c("Sales"))]
x_test <- Carseats[-train_index, !(names(Carseats) %in% c("Sales"))]

y_train <- Carseats[train_index, "Sales"]
y_test <- Carseats[-train_index, "Sales"]
# Step 3: Fit BART model
set.seed(123)
bart_model <- bart(x.train = x_train, y.train = y_train, x.test = x_test)
Running BART with numeric y
number of trees: 200
Prior:
k: 2.000000
degrees of freedom in sigma prior: 3
quantile in sigma prior: 0.900000
power and base for tree prior: 2.000000 0.950000
use quantiles for rule cut points: 0
data:
number of training observations: 280
number of test observations: 120
number of explanatory variables: 10
Cutoff rules c in x<=c vs x>c
Number of cutoffs: (var: number of possible c):
(1: 100) (2: 100) (3: 100) (4: 100) (5: 100)
(6: 100) (7: 100) (8: 100) (9: 100) (10: 100)
Running mcmc loop:
iteration: 100 (of 1100)
iteration: 200 (of 1100)
iteration: 300 (of 1100)
iteration: 400 (of 1100)
iteration: 500 (of 1100)
iteration: 600 (of 1100)
iteration: 700 (of 1100)
iteration: 800 (of 1100)
iteration: 900 (of 1100)
iteration: 1000 (of 1100)
iteration: 1100 (of 1100)
time for loop: 5
Tree sizes, last iteration:
4 2 2 2 2 3 2 2 2 4 3 2 5 2 2 3 3 3 2 2
2 1 2 2 3 2 3 2 4 2 2 2 2 2 4 2 2 2 2 2
3 2 2 2 1 2 2 2 2 2 2 2 4 2 2 2 3 4 2 2
3 3 4 2 2 2 2 2 2 3 2 2 2 2 2 2 3 2 3 2
3 2 4 2 2 4 4 3 2 2 2 2 3 3 2 2 3 2 2 2
2 2 2 2 2 4 2 2 3 2 2 2 4 1 2 2 3 2 3 2
2 2 1 4 3 2 2 2 2 3 2 1 2 2 2 2 3 2 3 3
2 3 3 2 2 2 3 3 2 2 1 2 1 2 2 2 3 2 3 2
2 3 2 2 5 3 4 2 2 3 3 3 2 2 2 2 2 2 2 2
2 2 3 2 2 2 2 2 2 3 2 3 3 1 3 5 2 2 2 1
Variable Usage, last iteration (var:count):
(1: 31) (2: 23) (3: 21) (4: 18) (5: 47)
(6: 35) (7: 19) (8: 22) (9: 22) (10: 33)
DONE BART 11-2-2014
# Step 4: Predictions and test MSE
pred_bart <- bart_model$yhat.test.mean
test_mse_bart <- mean((y_test - pred_bart)^2)
print(paste("BART Test MSE:", round(test_mse_bart, 4)))
[1] "BART Test MSE: 1.6268"
Using BART on the Carseats dataset, the model achieved a test MSE of about 1.63, the lowest of all the models fit here (single regression tree: 4.21, bagging: 2.57, random forest: 2.92). This suggests that BART provided the most accurate predictions of Sales in this analysis, though the BART fit was evaluated on a different random split (seed 123, with recoded categorical predictors), so the comparison is indicative rather than exact.
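For reference, the test MSEs computed above can be gathered into one table. This is a small sketch using the object names defined earlier in this document (mse, bag_mse, rf_mse, test_mse_bart); as noted, the BART figure comes from a different random split.

# Collect the test MSEs reported above into a single comparison table
model_comparison <- data.frame(
  Model = c("Regression tree", "Bagging", "Random forest", "BART"),
  Test_MSE = c(mse, bag_mse, rf_mse, test_mse_bart)
)
model_comparison[order(model_comparison$Test_MSE), ]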
Problem 9
Part A
# Load package and data
library(ISLR2)
data(OJ)
# Set seed for reproducibility
set.seed(123)
# Randomly sample 800 observations for training
train_index <- sample(1:nrow(OJ), 800)

# Split the data
train_data <- OJ[train_index, ]
test_data <- OJ[-train_index, ]
Part B
# Load package
library(tree)
# Fit the classification tree
oj_tree <- tree(Purchase ~ ., data = train_data)
# Display summary statistics of the tree
summary(oj_tree)
Classification tree:
tree(formula = Purchase ~ ., data = train_data)
Variables actually used in tree construction:
[1] "LoyalCH" "PriceDiff"
Number of terminal nodes: 8
Residual mean deviance: 0.7625 = 603.9 / 792
Misclassification error rate: 0.165 = 132 / 800
The tree has 8 terminal nodes, meaning it split the data into 8 final groups. The training error rate is 16.5%, indicating the proportion of misclassified observations in the training set.
Part C
# Print the tree structure
oj_tree
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 800 1071.00 CH ( 0.60875 0.39125 )
2) LoyalCH < 0.5036 350 415.10 MM ( 0.28000 0.72000 )
4) LoyalCH < 0.276142 170 131.00 MM ( 0.12941 0.87059 )
8) LoyalCH < 0.0356415 56 10.03 MM ( 0.01786 0.98214 ) *
9) LoyalCH > 0.0356415 114 108.90 MM ( 0.18421 0.81579 ) *
5) LoyalCH > 0.276142 180 245.20 MM ( 0.42222 0.57778 )
10) PriceDiff < 0.05 74 74.61 MM ( 0.20270 0.79730 ) *
11) PriceDiff > 0.05 106 144.50 CH ( 0.57547 0.42453 ) *
3) LoyalCH > 0.5036 450 357.10 CH ( 0.86444 0.13556 )
6) PriceDiff < -0.39 27 32.82 MM ( 0.29630 0.70370 ) *
7) PriceDiff > -0.39 423 273.70 CH ( 0.90071 0.09929 )
14) LoyalCH < 0.705326 130 135.50 CH ( 0.78462 0.21538 )
28) PriceDiff < 0.145 43 58.47 CH ( 0.58140 0.41860 ) *
29) PriceDiff > 0.145 87 62.07 CH ( 0.88506 0.11494 ) *
15) LoyalCH > 0.705326 293 112.50 CH ( 0.95222 0.04778 ) *
We fit a classification tree to predict Purchase (CH vs. MM) using all the other variables as predictors. From the printed output we can read off the following:

Root node: 800 observations
Predicted class: CH
Class distribution: ~61% CH, ~39% MM

A sample terminal node: 8) LoyalCH < 0.0356415 56 10.03 MM ( 0.01786 0.98214 ) *

This node contains 56 observations, reached via the split LoyalCH < 0.0356. The asterisk marks it as a terminal node. The tree predicts MM here, with class probabilities of roughly 1.8% CH and 98.2% MM.

Interpretation
In this region of the tree, where customers have very low loyalty to CH, the model confidently predicts they will choose MM.

The fitted classification tree has a total of 8 terminal nodes. The root node splits on LoyalCH, indicating that customer loyalty to CH is the most important predictor. One terminal node, for example, predicts MM with 98.2% certainty when LoyalCH is very low. This suggests that customers with low CH loyalty are much more likely to choose MM.
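As a check on the printed numbers, the deviance reported for a node by tree() is twice the negative log-likelihood of its class counts. A small worked sketch for node 8, assuming the node holds 1 CH and 55 MM purchases (which matches the reported proportions of 0.01786 and 0.98214):

# Node 8: 56 observations with class proportions (0.01786, 0.98214), i.e. ~1 CH and 55 MM
n_ch <- 1
n_mm <- 55
p_ch <- n_ch / (n_ch + n_mm)
p_mm <- n_mm / (n_ch + n_mm)
-2 * (n_ch * log(p_ch) + n_mm * log(p_mm))  # roughly 10.03, the printed node deviance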
Part D
# Plotting tree
plot(oj_tree)
text(oj_tree, pretty = 0)
The tree shows that customer loyalty to Citrus Hill (LoyalCH) is the strongest predictor of purchase. If loyalty is low, the model checks price differences to decide between CH and MM. When loyalty is high, customers almost always choose CH unless price strongly favors MM. Overall, loyalty drives the decisions, with price acting as a secondary factor.
Part E
# Predict on test data
tree_pred <- predict(oj_tree, newdata = test_data, type = "class")

# Confusion matrix
conf_matrix <- table(Predicted = tree_pred, Actual = test_data$Purchase)
print(conf_matrix)
Actual
Predicted CH MM
CH 150 34
MM 16 70
# Test error rate
test_error <- mean(tree_pred != test_data$Purchase)
print(test_error)
[1] 0.1851852
Test error rate: 18.5%
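As a sanity check, the same rate follows directly from the confusion matrix above, using the conf_matrix object already created:

# Off-diagonal (misclassified) counts over the number of test observations: (34 + 16) / 270
(conf_matrix["CH", "MM"] + conf_matrix["MM", "CH"]) / sum(conf_matrix)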
Part F
# Cross-validation for optimal tree size
set.seed(123) # For reproducibility
cv_oj <- cv.tree(oj_tree, FUN = prune.misclass)
# Plot cross-validation results
plot(cv_oj$size, cv_oj$dev, type = "b",
xlab = "Tree Size (Number of Terminal Nodes)",
ylab = "CV Misclassification Error",
main = "Cross-Validation Results")
cv_oj
$size
[1] 8 5 3 2 1
$dev
[1] 139 139 157 167 313
$k
[1] -Inf 0 8 11 154
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
optimal_size <- cv_oj$size[which.min(cv_oj$dev)]
print(optimal_size)
[1] 8
Part G
# Prune to 5 terminal nodes instead of 8
pruned_oj <- prune.misclass(oj_tree, best = 5)
# Plot the pruned tree
plot(pruned_oj)
text(pruned_oj, pretty = 0)
# Predict and evaluate
pruned_pred <- predict(pruned_oj, newdata = test_data, type = "class")
pruned_conf <- table(Predicted = pruned_pred, Actual = test_data$Purchase)
print(pruned_conf)
Actual
Predicted CH MM
CH 150 34
MM 16 70
pruned_error <- mean(pruned_pred != test_data$Purchase)
print(pruned_error)
[1] 0.1851852
Cross-validation identified both 8-node and 5-node trees as having the lowest error. We chose to prune the tree to 5 nodes for simplicity. The pruned tree achieved the same test error rate of 18.5% as the full tree, indicating that the extra complexity did not improve generalization. This supports using the smaller tree for better interpretability.
Part H
cv_oj$size
[1] 8 5 3 2 1
cv_oj$dev
[1] 139 139 157 167 313
Based on the output of cv.tree(), tree sizes 8 and 5 both resulted in the lowest cross-validated classification error of 139. Since the 5-node tree is simpler and equally effective, it would typically be chosen for better interpretability.
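One way to encode that preference for the simpler tree is to take the smallest size among those tying for the minimum CV error (a short sketch; best_size is an illustrative name):

# Smallest tree size whose CV error equals the minimum
best_size <- min(cv_oj$size[cv_oj$dev == min(cv_oj$dev)])
best_size  # 5 for this fit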
Part I
# Prune to 5 terminal nodes
pruned_oj <- prune.misclass(oj_tree, best = 5)
# Plot the pruned tree
plot(pruned_oj)
text(pruned_oj, pretty = 0)
title("Pruned Classification Tree (5 Terminal Nodes)")
Part J
1. Training error for unpruned tree
# Predict on training data using the full tree
train_pred_unpruned <- predict(oj_tree, newdata = train_data, type = "class")

# Training error (unpruned)
train_error_unpruned <- mean(train_pred_unpruned != train_data$Purchase)
print(train_error_unpruned)
[1] 0.165
2. Training error for pruned tree
# Predict on training data using the pruned tree
train_pred_pruned <- predict(pruned_oj, newdata = train_data, type = "class")

# Training error (pruned)
train_error_pruned <- mean(train_pred_pruned != train_data$Purchase)
print(train_error_pruned)
[1] 0.165
Surprisingly, both the pruned and unpruned trees have the same training error rate of 16.5%. This indicates that the branches removed during pruning did not impact the model’s ability to classify the training data. The pruned tree achieves the same accuracy while being simpler, making it preferable for interpretability and avoiding overfitting.
Part K
Unpruned tree (8 nodes) → Test error = 18.5%
Pruned tree (5 nodes) → Test error = 18.5%
The test error rate for both the pruned and unpruned trees is 18.5%. Since the test performance is the same, the pruned tree is preferred because it achieves the same accuracy with fewer terminal nodes. This suggests that the additional complexity in the full tree did not improve generalization and may have simply overfit the training data.