Q3. Gini Index, Entropy, and Classification Error

options(repos = c(CRAN = "https://cloud.r-project.org"))  # CRAN mirror for install.packages()

# Define a sequence of pm1 values
pm1 <- seq(0, 1, by = 0.01)
pm2 <- 1 - pm1

# Calculate metrics
gini <- pm1 * (1 - pm1) + pm2 * (1 - pm2)
entropy <- -pm1 * log2(pm1) - pm2 * log2(pm2)
entropy[is.nan(entropy)] <- 0  # 0 * log2(0) evaluates to NaN; define it as 0
class_error <- 1 - pmax(pm1, pm2)

# Plot
plot(pm1, gini, type = "l", col = "red", ylim = c(0, 1), ylab = "Impurity/Error", xlab = "pm1", lwd = 2)
lines(pm1, entropy, col = "blue", lwd = 2)
lines(pm1, class_error, col = "green", lwd = 2)
legend("top", legend = c("Gini", "Entropy", "Classification Error"), col = c("red", "blue", "green"), lty = 1, lwd = 2)


Q8. Regression Trees on the Carseats Dataset

Load the required libraries and data:

# Install any missing packages, then load them
for (pkg in c("tree", "randomForest", "gbm")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}
library(ISLR2)
library(tree)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Loaded gbm 2.2.2
set.seed(1)

# Sales in Carseats is already quantitative; it is used directly as the response
data(Carseats)

(a) Split into training and test sets:

train_indices <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_indices, ]
test <- Carseats[-train_indices, ]
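
Carseats has 400 observations, so this yields 200 training and 200 test observations.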

(b) Fit a regression tree

tree_model <- tree(Sales ~ ., data = train)
summary(tree_model)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Advertising" "CompPrice"  
## [6] "US"         
## Number of terminal nodes:  18 
## Residual mean deviance:  2.167 = 394.3 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.88200 -0.88200 -0.08712  0.00000  0.89590  4.09900
plot(tree_model)
text(tree_model, pretty = 0)

# Predict and test MSE
preds <- predict(tree_model, newdata = test)
mse <- mean((preds - test$Sales)^2)
print(mse)
## [1] 4.922039

(c) Cross-validation and pruning

cv_results <- cv.tree(tree_model)
plot(cv_results$size, cv_results$dev, type = "b", xlab = "Tree Size", ylab = "CV Error")

# Prune tree
pruned_tree <- prune.tree(tree_model, best = cv_results$size[which.min(cv_results$dev)])
plot(pruned_tree)
text(pruned_tree, pretty = 0)

# Test MSE after pruning
preds_pruned <- predict(pruned_tree, newdata = test)
mean((preds_pruned - test$Sales)^2)
## [1] 4.922039
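
The pruned and unpruned test MSEs are identical here because cross-validation favors the full 18-node tree, so prune.tree returns the original fit unchanged.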

(d) Bagging

bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)  # mtry = all 10 predictors, i.e. bagging
preds_bag <- predict(bag_model, newdata = test)
mean((preds_bag - test$Sales)^2)
## [1] 2.657296
importance(bag_model)
##                 %IncMSE IncNodePurity
## CompPrice   23.07909904    171.185734
## Income       2.82081527     94.079825
## Advertising 11.43295625     99.098941
## Population  -3.92119532     59.818905
## Price       54.24314632    505.887016
## ShelveLoc   46.26912996    361.962753
## Age         14.24992212    159.740422
## Education   -0.07662320     46.738585
## Urban        0.08530119      8.453749
## US           4.34349223     15.157608
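
Bagging roughly halves the test MSE of the single tree (2.66 vs 4.92), and both importance measures rank Price and ShelveLoc as the dominant predictors.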

(e) Random Forest

rf_model <- randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)
preds_rf <- predict(rf_model, newdata = test)
mean((preds_rf - test$Sales)^2)
## [1] 3.049406
importance(rf_model)
##                %IncMSE IncNodePurity
## CompPrice   12.9489323     158.48521
## Income       2.2754686     129.59400
## Advertising  8.9977589     111.94374
## Population  -2.2513981     102.84599
## Price       33.4226950     391.60804
## ShelveLoc   34.0233545     290.56502
## Age         12.2185108     171.83302
## Education    0.2592124      71.65413
## Urban        1.1382113      14.76798
## US           4.1925335      33.75554

The choice of mtry (the number of predictors considered at each split) matters: with mtry = 3 the test MSE (3.05) is somewhat higher than with bagging (2.66), though both beat the single tree.
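
A minimal sketch for studying this effect, reusing the train/test split from (a) (exact errors will vary with the random seed):

# Test MSE across the full range of mtry values
mtry_grid <- 1:(ncol(train) - 1)   # 1 to 10 predictors
mse_by_mtry <- sapply(mtry_grid, function(m) {
  fit <- randomForest(Sales ~ ., data = train, mtry = m)
  mean((predict(fit, newdata = test) - test$Sales)^2)
})
plot(mtry_grid, mse_by_mtry, type = "b", xlab = "mtry", ylab = "Test MSE")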

(f) BART (Bayesian Additive Regression Trees)

library(BART)
## Loading required package: nlme
## Loading required package: survival
x_train <- train[, -which(names(train) == "Sales")]
x_test <- test[, -which(names(test) == "Sales")]
y_train <- train$Sales
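
wbart expands factor predictors such as ShelveLoc into dummy variables internally, which is why the trace below reports p = 14 rather than the 10 original predictors.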

bart_model <- wbart(x.train = x_train, y.train = y_train, x.test = x_test)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.273474,3.000000,0.230740
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
mean((bart_model$yhat.test.mean - test$Sales)^2)
## [1] 1.438471
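
BART achieves the lowest test MSE of the methods tried (about 1.44, versus 2.66 for bagging, 3.05 for the random forest, and 4.92 for the single tree).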

Q9. Classification Tree on the OJ Dataset

(a) Split data

library(ISLR2)
data(OJ)
set.seed(1)
train_indices <- sample(1:nrow(OJ), 800)
train <- OJ[train_indices, ]
test <- OJ[-train_indices, ]

(b) Fit a classification tree

tree_oj <- tree(Purchase ~ ., data = train)
summary(tree_oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800
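
The fitted tree uses five of the seventeen predictors, has nine terminal nodes, and a training error rate of about 15.9% (127/800).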

(c) Interpret a terminal node

tree_oj  # Shows splits and terminal node rules
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *
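
For example, terminal node 8 covers the 59 training observations with LoyalCH < 0.0356: the tree predicts MM there, and 98.3% of those customers did buy Minute Maid, so the node is nearly pure (deviance 10.14).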

(d) Plot the tree

plot(tree_oj)
text(tree_oj, pretty = 0)

(e) Predict and confusion matrix

preds <- predict(tree_oj, newdata = test, type = "class")
table(preds, test$Purchase)
##      
## preds  CH  MM
##    CH 160  38
##    MM   8  64
mean(preds != test$Purchase)  # Test error rate
## [1] 0.1703704
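
The test error rate is about 17%, and most mistakes are MM purchases predicted as CH (38 of the 46 errors).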

(f) Cross-validation for pruning

cv_oj <- cv.tree(tree_oj, FUN = prune.misclass)

(g) Plot tree size vs. CV error

plot(cv_oj$size, cv_oj$dev, type = "b")

(h) Optimal tree size

cv_oj$size[which.min(cv_oj$dev)]
## [1] 7

(i) Prune the tree to five terminal nodes

pruned_oj <- prune.misclass(tree_oj, best = 5)
plot(pruned_oj)
text(pruned_oj, pretty = 0)

(j) Training error rates

train_pred_unpruned <- predict(tree_oj, newdata = train, type = "class")
train_pred_pruned <- predict(pruned_oj, newdata = train, type = "class")
mean(train_pred_unpruned != train$Purchase)
## [1] 0.15875
mean(train_pred_pruned != train$Purchase)
## [1] 0.1625
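
Pruning raises the training error slightly (16.25% vs 15.88%), as expected: the splits removed by pruning were there to fit the training data.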

(k) Test error rates

test_pred_unpruned <- predict(tree_oj, newdata = test, type = "class")
test_pred_pruned <- predict(pruned_oj, newdata = test, type = "class")
mean(test_pred_unpruned != test$Purchase)
## [1] 0.1703704
mean(test_pred_pruned != test$Purchase)
## [1] 0.162963
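
On the test set the ordering reverses: the pruned tree misclassifies 16.3% of observations versus 17.0% for the full tree, so the simpler tree generalizes slightly better.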