## Question 3)

# Define a sequence of p values from 0 to 1
p <- seq(0, 1, by = 0.01)

# Calculate the impurity measures
gini <- 2 * p * (1 - p)
entropy <- -p * log2(p) - (1 - p) * log2(1 - p)
entropy[is.nan(entropy)] <- 0  # handle log2(0)
class_error <- 1 - pmax(p, 1 - p)

# Create the plot
plot(p, gini, type = "l", col = "blue", lwd = 2,
     ylab = "Impurity", xlab = expression(hat(p)[m1]),
     main = "Impurity Measures vs. Class Probability",
     ylim = c(0,1))
lines(p, entropy, col = "red", lwd = 2)
lines(p, class_error, col = "green", lwd = 2)

# Add a legend
legend("top", legend = c("Gini", "Entropy", "Classification Error"),
       col = c("blue", "red", "green"), lwd = 2, bty = "n")

## All three measures are minimized (zero) when the node is pure (p = 0 or 1)
## and maximized when the classes are evenly mixed (p = 0.5).
## Gini and entropy are smooth, differentiable, and more sensitive to changes
## in node purity than the piecewise-linear classification error.
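# A quick numeric check of the maxima (uses the vectors defined above):
# all three measures peak at p = 0.5, but at different heights.
p_half <- 0.5
c(gini = 2 * p_half * (1 - p_half),           # 0.5
  entropy = -2 * p_half * log2(p_half),       # 1.0
  class_error = 1 - max(p_half, 1 - p_half))  # 0.5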

## Question 8)

##(a) Split the data set into a training set and a test set.
library(ISLR2)
set.seed(1)
train_indices <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_indices, ]
test <- Carseats[-train_indices, ]

##(b) Fit a regression tree to the training set. Plot the tree, and interpret
##the results. What test MSE do you obtain?

library(tree)
# Fit the regression tree
reg.tree <- tree(Sales ~ ., data = train)

# Plot the tree
plot(reg.tree)
text(reg.tree, pretty = 0)

# Predict and compute MSE on the test set
pred <- predict(reg.tree, newdata = test)
mse_tree <- mean((pred - test$Sales)^2)
mse_tree
## [1] 4.922039
##The tree splits on variables such as ShelveLoc, Price, and
##Advertising. It captures non-linear structure but may overfit.
##Here the test MSE is about 4.92; the exact value depends on the
##seed, but is typically in the 4–5 range.
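# Optional sanity check: summary() lists the variables the tree actually
# used and its number of terminal nodes, supporting the interpretation above.
summary(reg.tree)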


##(c) Use cross-validation in order to determine the optimal level of
##tree complexity. Does pruning the tree improve the test MSE?

set.seed(2)
cv_tree <- cv.tree(reg.tree)
plot(cv_tree$size, cv_tree$dev, type = "b",
     xlab = "Tree Size", ylab = "CV Deviance")

# Prune the tree to optimal size
best_size <- cv_tree$size[which.min(cv_tree$dev)]
pruned_tree <- prune.tree(reg.tree, best = best_size)

# Predict and compute MSE
pred_pruned <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pred_pruned - test$Sales)^2)
mse_pruned
## [1] 4.922039
## Here the pruned test MSE equals the unpruned one (4.92), i.e., CV did not
## favor a smaller tree on this split. Pruning typically improves
## generalization only when the original tree was grown too deep.
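# A hedged follow-up sketch: compute the test MSE for every candidate size
# from the CV run (size 1 is skipped; a root-only stump predicts a constant)
# to see whether any pruned tree beats the full one on this split.
sizes <- cv_tree$size[cv_tree$size > 1]
mse_by_size <- sapply(sizes, function(s) {
  pt <- prune.tree(reg.tree, best = s)
  mean((predict(pt, newdata = test) - test$Sales)^2)
})
cbind(size = sizes, test_mse = round(mse_by_size, 3))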

##(d) Use the bagging approach in order to analyze this data. What
##test MSE do you obtain? Use the importance() function to determine
##which variables are most important.

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
bag_pred <- predict(bag_model, newdata = test)
mse_bagging <- mean((bag_pred - test$Sales)^2)
mse_bagging
## [1] 2.605253
# Variable importance
importance(bag_model)
##                %IncMSE IncNodePurity
## CompPrice   24.8888481    170.182937
## Income       4.7121131     91.264880
## Advertising 12.7692401     97.164338
## Population  -1.8074075     58.244596
## Price       56.3326252    502.903407
## ShelveLoc   48.8886689    380.032715
## Age         17.7275460    157.846774
## Education    0.5962186     44.598731
## Urban        0.1728373      9.822082
## US           4.2172102     18.073863
varImpPlot(bag_model)

##The most important variables are Price and ShelveLoc, followed by
##CompPrice, Age, and Advertising.
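# A minimal sketch using the fitted bag_model: partial dependence plots
# from randomForest show how predicted Sales move with the two strongest
# predictors, averaging over the other variables.
partialPlot(bag_model, pred.data = train, x.var = "Price")
partialPlot(bag_model, pred.data = train, x.var = "ShelveLoc")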

##(e) Use random forests to analyze this data. What test MSE do you
##obtain? Use the importance() function to determine which variables
##are most important. Describe the effect of m, the number of
##variables considered at each split, on the error rate obtained.

set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 4, importance = TRUE)
rf_pred <- predict(rf_model, newdata = test)
mse_rf <- mean((rf_pred - test$Sales)^2)
mse_rf
## [1] 2.787584
# Importance
importance(rf_model)
##                %IncMSE IncNodePurity
## CompPrice   15.7891655     160.57944
## Income       4.1275509     121.12953
## Advertising  9.6425758     111.54581
## Population  -1.3596645      85.92575
## Price       43.4055391     423.06225
## ShelveLoc   37.8850232     311.97119
## Age         13.8924424     174.18229
## Education    0.1960888      62.77782
## Urban        0.1393816      12.92952
## US           6.3532441      30.42255
varImpPlot(rf_model)

##The random forest test MSE (2.79) is close to bagging's (2.61), here
##slightly worse. Varying mtry (e.g., 2, 4, 6) changes the bias-variance
##trade-off: smaller m decorrelates the trees, which can reduce the
##ensemble's variance, but if m is too small each tree becomes a weaker
##predictor; moderate values often work best, as sketched below.
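# A hedged sketch of the effect of m: refit the forest over a small grid of
# mtry values and compare test MSEs. Exact numbers vary with the seed, but
# the curve illustrates the trade-off described above.
set.seed(1)
mtry_grid <- c(2, 4, 6, 8, 10)
mse_by_m <- sapply(mtry_grid, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})
plot(mtry_grid, mse_by_m, type = "b", xlab = "mtry", ylab = "Test MSE")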

##(f) Now analyze the data using BART, and report your results.

# bartMachine requires Java and can be memory-hungry; install it first if needed
# install.packages("bartMachine")

library(bartMachine)
## Loading required package: rJava
## Loading required package: bartMachineJARs
## Loading required package: missForest
## Welcome to bartMachine v1.3.4.1! You have 0.48GB memory available.
## 
## If you run out of memory, restart R, and use e.g.
## 'options(java.parameters = "-Xmx5g")' for 5GB of RAM before you call
## 'library(bartMachine)'.
set_bart_machine_num_cores(2)
## bartMachine now using 2 cores.
# bartMachine takes a predictor data frame and a numeric response vector
train_bart <- train
test_bart <- test

# Fit BART model
bart_model <- bartMachine::bartMachine(X = train_bart[,-1], y = train_bart$Sales)
## bartMachine initializing with 50 trees...
## bartMachine vars checked...
## bartMachine java init...
## bartMachine factors created...
## bartMachine before preprocess...
## bartMachine after preprocess... 14 total features...
## bartMachine sigsq estimated...
## bartMachine training data finalized...
## Now building bartMachine for regression...Covariate importance prior ON. 
## evaluating in sample data...done
bart_pred <- predict(bart_model, test_bart[,-1])
mse_bart <- mean((bart_pred - test_bart$Sales)^2)
mse_bart
## [1] 1.464106
##BART attains the lowest test MSE here (1.46) and provides built-in
##uncertainty quantification via its posterior draws.
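# A minimal sketch of that uncertainty quantification (uses bart_model from
# above): calc_credible_intervals() returns posterior intervals per test
# observation, and we can check their empirical coverage.
ci <- calc_credible_intervals(bart_model, new_data = test_bart[,-1])
head(ci)
mean(test_bart$Sales >= ci[, 1] & test_bart$Sales <= ci[, 2])  # coverage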

## Question 9)

##(a) Create training (800 obs) and test sets
library(ISLR2)
library(tree)

set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
oj_train <- OJ[train_indices, ]
oj_test <- OJ[-train_indices, ]

##(b) Fit classification tree & view summary
oj_tree <- tree(Purchase ~ ., data = oj_train)
summary(oj_tree)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = oj_train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
##Training error rate: 0.1588 (127/800).
##The tree has 9 terminal nodes.

##(c) Type in the name of the tree object in order to get a detailed
##text output. Pick one of the terminal nodes, and interpret the
##information displayed.

oj_tree
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *
##Interpretation of terminal node 8): customers with LoyalCH < 0.0356
##(59 observations) are predicted MM; 98.3% of them actually chose MM and
##the node deviance is only 10.14, so this leaf is nearly pure.

##(d) Create a plot of the tree, and interpret the results.
plot(oj_tree)
text(oj_tree, pretty = 0)

##LoyalCH dominates the top splits, with PriceDiff, SpecialCH, and
##ListPriceDiff refining predictions lower down: brand loyalty is the
##strongest predictor of Purchase.

##(e) Predict the response on the test data, and produce a confusion
##matrix comparing the test labels to the predicted test labels.
##What is the test error rate?

oj_pred <- predict(oj_tree, newdata = oj_test, type = "class")
confusion_matrix <- table(oj_pred, oj_test$Purchase)
confusion_matrix
##        
## oj_pred  CH  MM
##      CH 160  38
##      MM   8  64
# Test error rate
test_error <- mean(oj_pred != oj_test$Purchase)
test_error
## [1] 0.1703704
##(f) Apply the cv.tree() function to the training set in order to
##determine the optimal tree size.

set.seed(2)
cv_oj<- cv.tree(oj_tree, FUN = prune.misclass)

##(g) Produce a plot with tree size on the x-axis and cross-validated
##classification error rate on the y-axis.
plot(cv_oj$size, cv_oj$dev, type = "b",
     xlab = "Tree Size", ylab = "CV Misclassification Error")

##(h) Which tree size corresponds to the lowest cross-validated classification
##error rate?

optimal_size <- cv_oj$size[which.min(cv_oj$dev)]
optimal_size
## [1] 9
##Tree size with the lowest CV error: 9 (the full tree).

##(i) Produce a pruned tree corresponding to the optimal tree size
##obtained using cross-validation. If cross-validation does not lead
##to selection of a pruned tree, then create a pruned tree with five
##terminal nodes.

pruned_oj <- prune.misclass(oj_tree, best = optimal_size)
# OR manually:
# pruned_oj <- prune.misclass(oj_tree, best = 5)
plot(pruned_oj)
text(pruned_oj, pretty = 0)

##(j) Compare the training error rates between the pruned and unpruned
##trees. Which is higher?

# Training error for unpruned
unpruned_train_pred <- predict(oj_tree, newdata = oj_train, type = "class")
train_error_unpruned <- mean(unpruned_train_pred != oj_train$Purchase)

# Training error for pruned
pruned_train_pred <- predict(pruned_oj, newdata = oj_train, type = "class")
train_error_pruned <- mean(pruned_train_pred != oj_train$Purchase)

train_error_unpruned
## [1] 0.15875
train_error_pruned
## [1] 0.15875
## Both training error rates are 0.15875: CV selected the full 9-node tree,
## so pruning left the tree unchanged and neither error is higher.

##(k) Compare the test error rates between the pruned and unpruned
##trees. Which is higher?

# Test error for pruned tree
pruned_test_pred <- predict(pruned_oj, newdata = oj_test, type = "class")
test_error_pruned <- mean(pruned_test_pred != oj_test$Purchase)


test_error_pruned
## [1] 0.1703704
## The pruned and unpruned trees give the same test error (0.1704) because
## CV retained the full 9-node tree; neither is higher here. A genuinely
## smaller tree could differ, as the sketch below shows.
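# A hedged follow-up: since CV kept the full tree, fit the 5-node tree from
# part (i) explicitly and compare its test error to the unpruned 0.1704.
pruned_oj5 <- prune.misclass(oj_tree, best = 5)
pred5 <- predict(pruned_oj5, newdata = oj_test, type = "class")
mean(pred5 != oj_test$Purchase)  # typically near or slightly above 0.1704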