##Question.3)
# Define a sequence of p values from 0 to 1
p <- seq(0, 1, by = 0.01)
# Calculate the impurity measures
gini <- 2 * p * (1 - p)
entropy <- -p * log2(p) - (1 - p) * log2(1 - p)
entropy[is.nan(entropy)] <- 0 # handle log2(0)
class_error <- 1 - pmax(p, 1 - p)
# Create the plot
plot(p, gini, type = "l", col = "blue", lwd = 2,
ylab = "Impurity", xlab = expression(hat(p)[m1]),
main = "Impurity Measures vs. Class Probability",
ylim = c(0,1))
lines(p, entropy, col = "red", lwd = 2)
lines(p, class_error, col = "green", lwd = 2)
# Add a legend
legend("top", legend = c("Gini", "Entropy", "Classification Error"),
col = c("blue", "red", "green"), lwd = 2, bty = "n")
## All three measures reach their minimum (zero) when the class is pure and
## reach their maximum when the class distribution is perfectly mixed
## Entropy and Gini are smoother and more sensitive than classification error.
##Question.8)
##(a) Split the data set into a training set and a test set.
library(ISLR2)
set.seed(1)
train_indices <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_indices, ]
test <- Carseats[-train_indices, ]
##(b) Fit a regression tree to the training set. Plot the tree, and interpret
##the results. What test MSE do you obtain?
library(tree)
## Warning: package 'tree' was built under R version 4.4.3
# Fit the regression tree
reg.tree <- tree(Sales ~ ., data = train)
# Plot the tree
plot(reg.tree)
text(reg.tree, pretty = 0)
# Predict and compute MSE on the test set
pred <- predict(reg.tree, newdata = test)
mse_tree <- mean((pred - test$Sales)^2)
mse_tree
## [1] 4.922039
##The tree shows splits on variables such as ShelveLoc, Price, and
##Advertising. It captures non-linearities but may overfit.
##The Value of MSE will vary, typically between 4–5.
##(c) Use cross-validation in order to determine the optimal level of
##tree complexity. Does pruning the tree improve the test MSE?
set.seed(2)
cv_tree <- cv.tree(reg.tree)
plot(cv_tree$size, cv_tree$dev, type = "b")
# Prune the tree to optimal size
best_size <- cv_tree$size[which.min(cv_tree$dev)]
pruned_tree <- prune.tree(reg.tree, best = best_size)
# Predict and compute MSE
pred_pruned <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pred_pruned - test$Sales)^2)
mse_pruned
## [1] 4.922039
## Prunning typically improves generalization if the original tree was too deep
##(d) Use the bagging approach in order to analyze this data. What
##test MSE do you obtain? Use the importance() function to determine
##which variables are most important.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
bag_pred <- predict(bag_model, newdata = test)
mse_bagging <- mean((bag_pred - test$Sales)^2)
mse_bagging
## [1] 2.605253
# Variable importance
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 24.8888481 170.182937
## Income 4.7121131 91.264880
## Advertising 12.7692401 97.164338
## Population -1.8074075 58.244596
## Price 56.3326252 502.903407
## ShelveLoc 48.8886689 380.032715
## Age 17.7275460 157.846774
## Education 0.5962186 44.598731
## Urban 0.1728373 9.822082
## US 4.2172102 18.073863
varImpPlot(bag_model)
##Important variables include Price, ShelveLoc, Advertising
##(e) Use random forests to analyze this data. What test MSE do you
##obtain? Use the importance() function to determine which variables
##are most important. Describe the effect of m, the number of
##variables considered at each split, on the error rate obtained.
set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 4, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test)
mse_rf <- mean((rf_pred - test$Sales)^2)
mse_rf
## [1] 2.787584
# Importance
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 15.7891655 160.57944
## Income 4.1275509 121.12953
## Advertising 9.6425758 111.54581
## Population -1.3596645 85.92575
## Price 43.4055391 423.06225
## ShelveLoc 37.8850232 311.97119
## Age 13.8924424 174.18229
## Education 0.1960888 62.77782
## Urban 0.1393816 12.92952
## US 6.3532441 30.42255
varImpPlot(rf_model)
##Random Forest MSE should be close to bagging, possibly better.
##arying mtry (e.g., 2, 4, 6) can change the bias-variance trade-off.
##Smaller m increases diversity → lower correlation → better ensemble performance.
##(f) Now analyze the data using BART, and report your results.
# BART requires Java memory and is heavier, install if needed
# install.packages("bartMachine")
library(bartMachine)
## Warning: package 'bartMachine' was built under R version 4.4.3
## Loading required package: rJava
## Loading required package: bartMachineJARs
## Loading required package: missForest
## Warning: package 'missForest' was built under R version 4.4.3
## Welcome to bartMachine v1.3.4.1! You have 0.48GB memory available.
##
## If you run out of memory, restart R, and use e.g.
## 'options(java.parameters = "-Xmx5g")' for 5GB of RAM before you call
## 'library(bartMachine)'.
set_bart_machine_num_cores(2)
## bartMachine now using 2 cores.
# Convert data to BART-compatible format
train_bart <- train
test_bart <- test
# Fit BART model
bart_model <- bartMachine::bartMachine(X = train_bart[,-1], y = train_bart$Sales)
## bartMachine initializing with 50 trees...
## bartMachine vars checked...
## bartMachine java init...
## bartMachine factors created...
## bartMachine before preprocess...
## bartMachine after preprocess... 14 total features...
## bartMachine sigsq estimated...
## bartMachine training data finalized...
## Now building bartMachine for regression...Covariate importance prior ON.
## evaluating in sample data...done
bart_pred <- predict(bart_model, test_bart[,-1])
mse_bart <- mean((bart_pred - test_bart$Sales)^2)
mse_bart
## [1] 1.464106
##BART gives excellent performance with low test MSE and built-in uncertainty quantification.
##Question.9)
##(a) Create training (800 obs) and test sets
library(ISLR2)
library(tree)
set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
oj_train <- OJ[train_indices, ]
oj_test <- OJ[-train_indices, ]
##(b) Fit classification tree & view summary
oj_tree <- tree(Purchase ~ ., data = oj_train)
summary(oj_tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## [5] "PctDiscMM"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7432 = 587.8 / 791
## Misclassification error rate: 0.1588 = 127 / 800
##error on training data: 0.1588 = 127 / 800
##Number of terminal nodes (9).
##(c) Type in the name of the tree object in order to get a detailed
##text output. Pick one of the terminal nodes, and interpret the
##information displayed.
oj_tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1073.00 CH ( 0.60625 0.39375 )
## 2) LoyalCH < 0.5036 365 441.60 MM ( 0.29315 0.70685 )
## 4) LoyalCH < 0.280875 177 140.50 MM ( 0.13559 0.86441 )
## 8) LoyalCH < 0.0356415 59 10.14 MM ( 0.01695 0.98305 ) *
## 9) LoyalCH > 0.0356415 118 116.40 MM ( 0.19492 0.80508 ) *
## 5) LoyalCH > 0.280875 188 258.00 MM ( 0.44149 0.55851 )
## 10) PriceDiff < 0.05 79 84.79 MM ( 0.22785 0.77215 )
## 20) SpecialCH < 0.5 64 51.98 MM ( 0.14062 0.85938 ) *
## 21) SpecialCH > 0.5 15 20.19 CH ( 0.60000 0.40000 ) *
## 11) PriceDiff > 0.05 109 147.00 CH ( 0.59633 0.40367 ) *
## 3) LoyalCH > 0.5036 435 337.90 CH ( 0.86897 0.13103 )
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
## 12) ListPriceDiff < 0.235 72 99.81 MM ( 0.50000 0.50000 )
## 24) PctDiscMM < 0.196196 55 73.14 CH ( 0.61818 0.38182 ) *
## 25) PctDiscMM > 0.196196 17 12.32 MM ( 0.11765 0.88235 ) *
## 13) ListPriceDiff > 0.235 102 65.43 CH ( 0.90196 0.09804 ) *
## 7) LoyalCH > 0.764572 261 91.20 CH ( 0.95785 0.04215 ) *
## 6) LoyalCH < 0.764572 174 201.00 CH ( 0.73563 0.26437 )
##(d) Create a plot of the tree, and interpret the results.
plot(oj_tree)
text(oj_tree, pretty = 0)
##Splits happen on variables like LoyalCH, PriceDiff,
##indicating their predictive power.
##(e) Predict the response on the test data, and produce a confusion
##matrix comparing the test labels to the predicted test labels.
##What is the test error rate?
oj_pred <- predict(oj_tree, newdata = oj_test, type = "class")
confusion_matrix <- table(oj_pred, oj_test$Purchase)
confusion_matrix
##
## oj_pred CH MM
## CH 160 38
## MM 8 64
# Test error rate
test_error <- mean(oj_pred != oj_test$Purchase)
test_error
## [1] 0.1703704
##(f) Apply the cv.tree() function to the training set in order to
##determine the optimal tree size.
set.seed(2)
cv_oj<- cv.tree(oj_tree, FUN = prune.misclass)
##(g) Produce a plot with tree size on the x-axis and cross-validated
##classification error rate on the y-axis.
plot(cv_oj$size, cv_oj$dev, type = "b",
xlab = "Tree Size", ylab = "CV Misclassification Error")
##(h) Which tree size corresponds to the lowest cross-validated classification
##error rate?
optimal_size <- cv_oj$size[which.min(cv_oj$dev)]
optimal_size
## [1] 9
##Node with the lowest CV error: 9
##(i) Produce a pruned tree corresponding to the optimal tree size
##obtained using cross-validation. If cross-validation does not lead
##to selection of a pruned tree, then create a pruned tree with five
##terminal nodes.
pruned_oj <- prune.misclass(oj_tree, best = optimal_size)
# OR manually:
# pruned_oj <- prune.misclass(oj_tree, best = 5)
plot(pruned_oj)
text(pruned_oj, pretty = 0)
##(j) Compare the training error rates between the pruned and unpruned
##trees. Which is higher?
# Training error for unpruned
unpruned_train_pred <- predict(oj_tree, newdata = oj_train, type = "class")
train_error_unpruned <- mean(unpruned_train_pred != oj_train$Purchase)
# Training error for pruned
pruned_train_pred <- predict(pruned_oj, newdata = oj_train, type = "class")
train_error_pruned <- mean(pruned_train_pred != oj_train$Purchase)
train_error_unpruned
## [1] 0.15875
train_error_pruned
## [1] 0.15875
## train_error_unpruned: 0.15875
## train_error_pruned: 0.15875
##(k) Compare the test error rates between the pruned and unpruned
##trees. Which is higher?
# Test error for pruned tree
pruned_test_pred <- predict(pruned_oj, newdata = oj_test, type = "class")
test_error_pruned <- mean(pruned_test_pred != oj_test$Purchase)
test_error_pruned
## [1] 0.1703704
## the pruned tree does better on the test set due to better generalization.