# Assignment #7

## 3

# Exercise 3: compare the Gini index, classification error, and entropy of a
# two-class node as functions of p, the proportion of observations in class 1.
p <- seq(0, 1, length.out = 200)

# q * log2(q), using the limiting value 0 at q = 0. This keeps the entropy
# exact (and non-negative) at p = 0 and p = 1 instead of adding a small offset
# inside the logarithm.
xlog2x <- function(q) {
  out <- numeric(length(q))
  pos <- q > 0
  out[pos] <- q[pos] * log2(q[pos])
  out
}

gini <- 2 * p * (1 - p)
classification_error <- 1 - pmax(p, 1 - p)
entropy <- -(xlog2x(p) + xlog2x(1 - p))

# Entropy peaks near 1 while the Gini index peaks at 0.5. Without an explicit
# ylim the axis is scaled to the Gini curve and the entropy curve is clipped.
plot(p, gini, type = "l", col = "blue", lwd = 2, ylim = c(0, 1),
     ylab = "Value", xlab = expression(hat(p)[m1]),
     main = "Gini Index, Classification Error, and Entropy")

lines(p, classification_error, col = "red", lwd = 2)
lines(p, entropy, col = "green", lwd = 2)

# All three curves are highest around p = 0.5, so the bottom centre of the
# plot is free for the legend.
legend("bottom", legend = c("Gini Index", "Classification Error", "Entropy"),
       col = c("blue", "red", "green"), lwd = 2)

## 8

library(ISLR2)
library(tree)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

library(BART)

## Loading required package: nlme

## Loading required package: survival

# Fix the random seed so the split drawn below is reproducible.
set.seed(1)

# (a) Randomly assign half of the Carseats rows to the training set; the rows
# that were not drawn form the test set.
n <- nrow(Carseats)
train_idx <- sample.int(n, size = n / 2)
train <- Carseats[train_idx, ]
test <- Carseats[-train_idx, ]
test_y <- test[["Sales"]]

# (b) Grow a regression tree for Sales on all other variables using the
# training half, draw it, and compute its mean squared error on the test half.
tree_carseats <- tree(Sales ~ ., data = train)
plot(tree_carseats)
text(tree_carseats, pretty = 0)

pred_tree <- predict(tree_carseats, newdata = test)
mse_tree <- mean((test_y - pred_tree)^2)
mse_tree

## [1] 4.922039

# (c) Cross-validate over tree sizes, prune the tree to the size with the
# lowest cross-validated deviance, and compute the pruned tree's test MSE.
cv_carseats <- cv.tree(tree_carseats)
# Label the axes explicitly; the defaults would print the raw expressions.
plot(cv_carseats$size, cv_carseats$dev, type = "b",
     xlab = "Tree Size", ylab = "CV Deviance")

# which.min() returns the position of the first minimum, so if several sizes
# tie, the one listed first in cv_carseats$size is kept.
best_size <- cv_carseats$size[which.min(cv_carseats$dev)]
pruned_tree <- prune.tree(tree_carseats, best = best_size)
pred_pruned <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pred_pruned - test_y)^2)
mse_pruned

## [1] 4.922039

# (d) Bagging: fit a random forest that considers every predictor at each
# split. ncol(train) - 1 counts all columns of train except the response,
# Sales. importance = TRUE requests the variable-importance measures that are
# printed and plotted further down.
bag_carseats <- randomForest(Sales ~ ., data = train,
                             mtry = ncol(train) - 1,
                             importance = TRUE)
# Test-set mean squared error of the bagged model.
pred_bag <- predict(bag_carseats, newdata = test)
mse_bag <- mean((pred_bag - test_y)^2)
mse_bag

## [1] 2.657296

# Variable importance. In the recorded output below, Price and ShelveLoc have
# the largest values in both columns.
importance(bag_carseats)

##                 %IncMSE IncNodePurity
## CompPrice   23.07909904    171.185734
## Income       2.82081527     94.079825
## Advertising 11.43295625     99.098941
## Population  -3.92119532     59.818905
## Price       54.24314632    505.887016
## ShelveLoc   46.26912996    361.962753
## Age         14.24992212    159.740422
## Education   -0.07662320     46.738585
## Urban        0.08530119      8.453749
## US           4.34349223     15.157608

varImpPlot(bag_carseats)

# (e) Random forest: same model as the bagged fit, but only 4 predictors are
# considered at each split (mtry = 4).
rf_carseats <- randomForest(Sales ~ ., data = train,
                            mtry = 4, importance = TRUE)
# Test-set mean squared error of the random forest.
pred_rf <- predict(rf_carseats, newdata = test)
mse_rf <- mean((pred_rf - test_y)^2)
mse_rf

## [1] 2.842032

# Variable importance. In the recorded output below, Price and ShelveLoc again
# have the largest values in both columns.
importance(rf_carseats)

##                %IncMSE IncNodePurity
## CompPrice   18.7065535     160.23676
## Income       4.6350507     119.44462
## Advertising  8.3522415     109.00978
## Population  -2.3147828      85.94078
## Price       38.7746936     427.92458
## ShelveLoc   38.4928490     319.99035
## Age         12.2426522     170.34485
## Education   -1.5831043      64.55763
## Urban       -0.4023136      13.67858
## US           6.5812684      30.75458

varImpPlot(rf_carseats)

# (f) BART. wbart() is given the predictors and the response as separate
# arguments, so Sales is dropped from the predictor frames of both halves.
# NOTE(review): the log recorded below reports p = 14 columns, which suggests
# the factor predictors are expanded into indicator columns -- confirm against
# the BART documentation.
x_train <- subset(train, select = -Sales)
y_train <- train$Sales
x_test <- subset(test, select = -Sales)
bart_model <- wbart(x.train = x_train, y.train = y_train, x.test = x_test)

## *****Into main of wbart
## *****Data:
## data:n,p,np: 200, 14, 200
## y1,yn: 2.781850, 1.091850
## x1,x[n*p]: 107.000000, 1.000000
## xp1,xp[np*p]: 111.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 63 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.273474,3.000000,0.230740
## *****sigma: 1.088371
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000

# Test-set mean squared error of BART, using the yhat.test.mean component of
# the fit as the test prediction.
mse_bart <- mean((test_y - bart_model$yhat.test.mean)^2)
mse_bart

## [1] 1.447347

## 9

# (a) Load OJ and draw a reproducible random sample of 800 rows for training;
# every row that was not drawn goes to the test set.
data(OJ)
set.seed(1)
train_idx <- sample.int(nrow(OJ), size = 800)
train <- OJ[train_idx, ]
test <- OJ[-train_idx, ]

# (b) Fit a classification tree for Purchase on all other variables and
# compute the share of training observations it misclassifies.
tree_oj <- tree(Purchase ~ ., data = train)
train_pred <- predict(tree_oj, newdata = train, type = "class")
train_error <- mean(train[["Purchase"]] != train_pred)
train_error

## [1] 0.15875

# (c) Draw the tree and count its terminal nodes.
plot(tree_oj)
text(tree_oj, pretty = 0)

# Count the leaves from the fitted tree's node table (rows whose split
# variable is "<leaf>") rather than indirectly from the nodes that training
# rows were assigned to.
sum(tree_oj$frame$var == "<leaf>")

## [1] 9

# (d) Print the tree summary: the variables used in the splits, the number of
# terminal nodes, the residual mean deviance, and the training
# misclassification rate.
summary(tree_oj)

## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "SpecialCH"     "ListPriceDiff"
## [5] "PctDiscMM"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7432 = 587.8 / 791 
## Misclassification error rate: 0.1588 = 127 / 800

# Print every node as: split rule, number of observations, deviance, predicted
# class, and class proportions (CH first, then MM); "*" marks terminal nodes.
# Example from the recorded output: terminal node 8 (LoyalCH < 0.0356415)
# holds 59 training observations with deviance 10.14 and predicts MM, with
# class proportions 0.01695 (CH) and 0.98305 (MM).
tree_oj

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 1073.00 CH ( 0.60625 0.39375 )  
##    2) LoyalCH < 0.5036 365  441.60 MM ( 0.29315 0.70685 )  
##      4) LoyalCH < 0.280875 177  140.50 MM ( 0.13559 0.86441 )  
##        8) LoyalCH < 0.0356415 59   10.14 MM ( 0.01695 0.98305 ) *
##        9) LoyalCH > 0.0356415 118  116.40 MM ( 0.19492 0.80508 ) *
##      5) LoyalCH > 0.280875 188  258.00 MM ( 0.44149 0.55851 )  
##       10) PriceDiff < 0.05 79   84.79 MM ( 0.22785 0.77215 )  
##         20) SpecialCH < 0.5 64   51.98 MM ( 0.14062 0.85938 ) *
##         21) SpecialCH > 0.5 15   20.19 CH ( 0.60000 0.40000 ) *
##       11) PriceDiff > 0.05 109  147.00 CH ( 0.59633 0.40367 ) *
##    3) LoyalCH > 0.5036 435  337.90 CH ( 0.86897 0.13103 )  
##      6) LoyalCH < 0.764572 174  201.00 CH ( 0.73563 0.26437 )  
##       12) ListPriceDiff < 0.235 72   99.81 MM ( 0.50000 0.50000 )  
##         24) PctDiscMM < 0.196196 55   73.14 CH ( 0.61818 0.38182 ) *
##         25) PctDiscMM > 0.196196 17   12.32 MM ( 0.11765 0.88235 ) *
##       13) ListPriceDiff > 0.235 102   65.43 CH ( 0.90196 0.09804 ) *
##      7) LoyalCH > 0.764572 261   91.20 CH ( 0.95785 0.04215 ) *

# (e) Classify the test observations, cross-tabulate predicted against actual
# labels, and compute the test misclassification rate.
test_pred <- predict(tree_oj, newdata = test, type = "class")
confusion_matrix <- table(Predicted = test_pred, Actual = test[["Purchase"]])
confusion_matrix

##          Actual
## Predicted  CH  MM
##        CH 160  38
##        MM   8  64

test_error <- mean(test[["Purchase"]] != test_pred)
test_error

## [1] 0.1703704

# (f) Cross-validate over tree sizes, pruning by misclassification. The seed
# is reset immediately before the call so the fold assignment is reproducible.
set.seed(2)
cv_oj <- cv.tree(tree_oj, FUN = prune.misclass)

# (g) Plot tree size against the cross-validated error rate. With
# FUN = prune.misclass, cv_oj$dev holds the number of cross-validation
# misclassifications, so divide by the number of training observations to
# show a rate that matches the axis label.
plot(cv_oj$size, cv_oj$dev / nrow(train), type = "b",
     xlab = "Tree Size", ylab = "CV Classification Error Rate")

# (h) Tree size with the lowest cross-validated error. which.min() returns the
# first minimum, so if several sizes tie, the one listed first is kept.
# NOTE(review): the recorded size, 9, equals the number of terminal nodes of
# the unpruned tree, so pruning to it appears to leave the tree unchanged; the
# pruned error rates recorded further down match the unpruned ones.
optimal_size <- cv_oj[["size"]][which.min(cv_oj[["dev"]])]
optimal_size

## [1] 9

# (i) Prune the tree to the selected size.
pruned_oj <- prune.misclass(tree_oj, best = optimal_size)

# (j) Training misclassification rate of the pruned tree.
train_pred_pruned <- predict(pruned_oj, newdata = train, type = "class")
train_error_pruned <- mean(train[["Purchase"]] != train_pred_pruned)
train_error_pruned

## [1] 0.15875

# (k) Test misclassification rate of the pruned tree.
test_pred_pruned <- predict(pruned_oj, newdata = test, type = "class")
test_error_pruned <- mean(test[["Purchase"]] != test_pred_pruned)
test_error_pruned

## [1] 0.1703704

# Assignment #7
#
# 2025-07-20
#
# 3
#
# 9