P3

# Define p values
p <- seq(0, 1, length.out = 200)

# Define the three measures
gini        <- 2 * p * (1 - p)
class_error <- 1 - pmax(p, 1 - p)
entropy     <- ifelse(p == 0 | p == 1, 0,
                      -(p * log(p) + (1 - p) * log(1 - p)))

# Plot
plot(p, gini,
     type = "l", 
     ylim = c(0, 0.75),
     xlab = expression(hat(p)[m1]),
     ylab = "Value",
     main = "Node impurity measures")

lines(p, class_error)
lines(p, entropy)

legend("topright",
       legend = c("Gini index",
                  "Classification error",
                  "Entropy"))

P8 With heavy AI help

n       <- nrow(Carseats)                        # 400 observations
train   <- sample(1:n, size = n / 2)             # 200 train
test    <- setdiff(1:n, train)                    # 200 test
 
train_data <- Carseats[train, ]
test_data  <- Carseats[test,  ]
 
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 200
cat("Test set size:    ", nrow(test_data),  "\n\n")
## Test set size:     200

Treeee

tree_fit <- tree(Sales ~ ., data = train_data)
 
print(summary(tree_fit))
## 
## Regression tree:
## tree(formula = Sales ~ ., data = train_data)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Income"      "Advertising"
## [6] "CompPrice"   "Population"  "Urban"      
## Number of terminal nodes:  18 
## Residual mean deviance:  2.266 = 412.4 / 182 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.46100 -1.01400 -0.09829  0.00000  1.06900  3.68600
# Plot the tree
par(mfrow = c(1, 1))
plot(tree_fit)
text(tree_fit, pretty = 0, cex = 0.7)
title("Regression Tree — Carseats (unpruned)")

# Test MSE
tree_pred <- predict(tree_fit, newdata = test_data)
tree_mse  <- mean((tree_pred - test_data$Sales)^2)
cat(sprintf("\n(b) Regression Tree Test MSE: %.4f\n\n", tree_mse))
## 
## (b) Regression Tree Test MSE: 5.6864

Cross-Validation and Pruning

cv_tree <- cv.tree(tree_fit)           # 10-fold CV by default
 
# Plot CV deviance vs tree size
par(mfrow = c(1, 2))
plot(cv_tree$size, cv_tree$dev, type = "b", pch = 19,
     xlab = "Number of terminal nodes", ylab = "CV deviance",
     main = "CV: Deviance vs Tree Size")
plot(cv_tree$k, cv_tree$dev, type = "b", pch = 19,
     xlab = "Cost-complexity parameter k", ylab = "CV deviance",
     main = "CV: Deviance vs k")

par(mfrow = c(1, 1))
 
# Optimal size
opt_size <- cv_tree$size[which.min(cv_tree$dev)]
cat(sprintf("Optimal number of terminal nodes (CV): %d\n", opt_size))
## Optimal number of terminal nodes (CV): 15
# Prune the tree
pruned_tree <- prune.tree(tree_fit, best = opt_size)
 
plot(pruned_tree)
text(pruned_tree, pretty = 0, cex = 0.8)
title(sprintf("Pruned Tree (%d terminal nodes)", opt_size))

# Pruned test MSE
pruned_pred <- predict(pruned_tree, newdata = test_data)
pruned_mse  <- mean((pruned_pred - test_data$Sales)^2)
cat(sprintf("(c) Pruned Tree Test MSE  : %.4f\n", pruned_mse))
## (c) Pruned Tree Test MSE  : 5.3936
cat(sprintf("    Unpruned Tree Test MSE: %.4f\n", tree_mse))
##     Unpruned Tree Test MSE: 5.6864
cat(sprintf("    Pruning %s the test MSE\n\n",
            ifelse(pruned_mse < tree_mse, "improved", "did not improve")))
##     Pruning improved the test MSE

Bagging (m = p = all 10 predictors)

p <- ncol(Carseats) - 1   # number of predictors (10)
 
bag_fit <- randomForest(
  Sales ~ .,
  data       = train_data,
  mtry       = p,         # use ALL predictors → bagging
  ntree      = 500,
  importance = TRUE
)
 
cat("=== Bagging Summary ===\n")
## === Bagging Summary ===
print(bag_fit)
## 
## Call:
##  randomForest(formula = Sales ~ ., data = train_data, mtry = p,      ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 2.819133
##                     % Var explained: 64.2
# Test MSE
bag_pred <- predict(bag_fit, newdata = test_data)
bag_mse  <- mean((bag_pred - test_data$Sales)^2)
cat(sprintf("\n(d) Bagging Test MSE: %.4f\n", bag_mse))
## 
## (d) Bagging Test MSE: 2.3604
# Variable importance
cat("\nVariable Importance (Bagging):\n")
## 
## Variable Importance (Bagging):
bag_imp <- importance(bag_fit)
print(round(bag_imp, 3))
##             %IncMSE IncNodePurity
## CompPrice    30.442       197.998
## Income        9.210       104.303
## Advertising  12.750       104.098
## Population    2.500        62.704
## Price        54.834       432.197
## ShelveLoc    56.833       429.295
## Age          10.480       133.922
## Education    -0.561        42.617
## Urban         1.840        10.027
## US            0.805         5.683
# Plot importance
varImpPlot(bag_fit, main = "Variable Importance — Bagging")

Random Forests — vary m

# Try different values of m
m_values  <- c(2, 3, 4, 5, 7, p)
rf_mse    <- numeric(length(m_values))
rf_models <- list()
 
for (i in seq_along(m_values)) {
  m <- m_values[i]
  rf_m <- randomForest(
    Sales ~ .,
    data       = train_data,
    mtry       = m,
    ntree      = 500,
    importance = TRUE
  )
  pred        <- predict(rf_m, newdata = test_data)
  rf_mse[i]   <- mean((pred - test_data$Sales)^2)
  rf_models[[i]] <- rf_m
  cat(sprintf("  m = %2d  Test MSE = %.4f\n", m, rf_mse[i]))
}
##   m =  2  Test MSE = 3.5562
##   m =  3  Test MSE = 2.9463
##   m =  4  Test MSE = 2.6539
##   m =  5  Test MSE = 2.4476
##   m =  7  Test MSE = 2.3797
##   m = 10  Test MSE = 2.3675
# Best m
best_idx <- which.min(rf_mse)
best_m   <- m_values[best_idx]
cat(sprintf("\n(e) Best m = %d  →  Test MSE = %.4f\n", best_m, rf_mse[best_idx]))
## 
## (e) Best m = 10  →  Test MSE = 2.3675
# Plot MSE vs m
par(mfrow = c(1, 1))
plot(m_values, rf_mse, type = "b", pch = 19, col = "steelblue",
     xlab = "m (variables per split)", ylab = "Test MSE",
     main = "Random Forest: Test MSE vs m")
abline(v = best_m, lty = 2, col = "firebrick")
legend("topright", legend = paste("Best m =", best_m), lty = 2, col = "firebrick")

# Variable importance from best RF
best_rf <- rf_models[[best_idx]]
cat("\nVariable Importance (Random Forest, best m):\n")
## 
## Variable Importance (Random Forest, best m):
rf_imp <- importance(best_rf)
print(round(rf_imp, 3))
##             %IncMSE IncNodePurity
## CompPrice    30.052       202.736
## Income       10.095       102.025
## Advertising  13.982       111.508
## Population    0.208        64.883
## Price        55.105       441.580
## ShelveLoc    55.667       426.688
## Age          10.934       140.304
## Education    -0.565        42.532
## Urban         2.778         8.478
## US           -0.778         4.592
varImpPlot(best_rf, main = paste("Variable Importance — Random Forest (m =", best_m, ")"))

BART

# Prepare matrices (BART needs numeric matrices)
x_train <- model.matrix(Sales ~ . - 1, data = train_data)
x_test  <- model.matrix(Sales ~ . - 1, data = test_data)
y_train <- train_data$Sales
y_test  <- test_data$Sales
 
# Fit BART (default: 200 trees, 1000 MCMC draws after 100 burn-in)
bart_fit <- gbart(
  x.train  = x_train,
  y.train  = y_train,
  x.test   = x_test
)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 12, 200
## y1,yn: -3.732150, -7.112150
## x1,x[n*p]: 116.000000, 1.000000
## xp1,xp[np*p]: 115.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 62 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.219563,7.64215
## *****sigma: 1.061683
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## trcnt,tecnt: 1000,1000
# Test MSE (posterior mean predictions)
bart_pred <- bart_fit$yhat.test.mean
bart_mse  <- mean((bart_pred - y_test)^2)
cat(sprintf("\n(f) BART Test MSE: %.4f\n", bart_mse))
## 
## (f) BART Test MSE: 1.7253
# Variable inclusion proportions (proxy for importance)
ord <- order(bart_fit$varcount.mean, decreasing = TRUE)
cat("\nBART — Variable inclusion counts (higher = more important):\n")
## 
## BART — Variable inclusion counts (higher = more important):
print(round(bart_fit$varcount.mean[ord], 2))
##           Price   ShelveLocGood           USYes       CompPrice    ShelveLocBad 
##           25.65           21.27           20.54           20.36           20.29 
##             Age ShelveLocMedium      Population          Income       Education 
##           19.95           19.37           19.07           18.92           17.82 
##        UrbanYes     Advertising 
##           17.72           16.48