P8 With heavy AI help
n <- nrow(Carseats) # 400 observations
train <- sample(1:n, size = n / 2) # 200 train
test <- setdiff(1:n, train) # 200 test
train_data <- Carseats[train, ]
test_data <- Carseats[test, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 200
cat("Test set size: ", nrow(test_data), "\n\n")
## Test set size: 200
Treeee
tree_fit <- tree(Sales ~ ., data = train_data)
print(summary(tree_fit))
##
## Regression tree:
## tree(formula = Sales ~ ., data = train_data)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Income" "Advertising"
## [6] "CompPrice" "Population" "Urban"
## Number of terminal nodes: 18
## Residual mean deviance: 2.266 = 412.4 / 182
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.46100 -1.01400 -0.09829 0.00000 1.06900 3.68600
# Plot the tree
par(mfrow = c(1, 1))
plot(tree_fit)
text(tree_fit, pretty = 0, cex = 0.7)
title("Regression Tree — Carseats (unpruned)")

# Test MSE
tree_pred <- predict(tree_fit, newdata = test_data)
tree_mse <- mean((tree_pred - test_data$Sales)^2)
cat(sprintf("\n(b) Regression Tree Test MSE: %.4f\n\n", tree_mse))
##
## (b) Regression Tree Test MSE: 5.6864
Cross-Validation and Pruning
cv_tree <- cv.tree(tree_fit) # 10-fold CV by default
# Plot CV deviance vs tree size
par(mfrow = c(1, 2))
plot(cv_tree$size, cv_tree$dev, type = "b", pch = 19,
xlab = "Number of terminal nodes", ylab = "CV deviance",
main = "CV: Deviance vs Tree Size")
plot(cv_tree$k, cv_tree$dev, type = "b", pch = 19,
xlab = "Cost-complexity parameter k", ylab = "CV deviance",
main = "CV: Deviance vs k")

par(mfrow = c(1, 1))
# Optimal size
opt_size <- cv_tree$size[which.min(cv_tree$dev)]
cat(sprintf("Optimal number of terminal nodes (CV): %d\n", opt_size))
## Optimal number of terminal nodes (CV): 15
# Prune the tree
pruned_tree <- prune.tree(tree_fit, best = opt_size)
plot(pruned_tree)
text(pruned_tree, pretty = 0, cex = 0.8)
title(sprintf("Pruned Tree (%d terminal nodes)", opt_size))

# Pruned test MSE
pruned_pred <- predict(pruned_tree, newdata = test_data)
pruned_mse <- mean((pruned_pred - test_data$Sales)^2)
cat(sprintf("(c) Pruned Tree Test MSE : %.4f\n", pruned_mse))
## (c) Pruned Tree Test MSE : 5.3936
cat(sprintf(" Unpruned Tree Test MSE: %.4f\n", tree_mse))
## Unpruned Tree Test MSE: 5.6864
cat(sprintf(" Pruning %s the test MSE\n\n",
ifelse(pruned_mse < tree_mse, "improved", "did not improve")))
## Pruning improved the test MSE
Bagging (m = p = all 10 predictors)
p <- ncol(Carseats) - 1 # number of predictors (10)
bag_fit <- randomForest(
Sales ~ .,
data = train_data,
mtry = p, # use ALL predictors → bagging
ntree = 500,
importance = TRUE
)
cat("=== Bagging Summary ===\n")
## === Bagging Summary ===
print(bag_fit)
##
## Call:
## randomForest(formula = Sales ~ ., data = train_data, mtry = p, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.819133
## % Var explained: 64.2
# Test MSE
bag_pred <- predict(bag_fit, newdata = test_data)
bag_mse <- mean((bag_pred - test_data$Sales)^2)
cat(sprintf("\n(d) Bagging Test MSE: %.4f\n", bag_mse))
##
## (d) Bagging Test MSE: 2.3604
# Variable importance
cat("\nVariable Importance (Bagging):\n")
##
## Variable Importance (Bagging):
bag_imp <- importance(bag_fit)
print(round(bag_imp, 3))
## %IncMSE IncNodePurity
## CompPrice 30.442 197.998
## Income 9.210 104.303
## Advertising 12.750 104.098
## Population 2.500 62.704
## Price 54.834 432.197
## ShelveLoc 56.833 429.295
## Age 10.480 133.922
## Education -0.561 42.617
## Urban 1.840 10.027
## US 0.805 5.683
# Plot importance
varImpPlot(bag_fit, main = "Variable Importance — Bagging")

Random Forests — vary m
# Try different values of m
m_values <- c(2, 3, 4, 5, 7, p)
rf_mse <- numeric(length(m_values))
rf_models <- list()
for (i in seq_along(m_values)) {
m <- m_values[i]
rf_m <- randomForest(
Sales ~ .,
data = train_data,
mtry = m,
ntree = 500,
importance = TRUE
)
pred <- predict(rf_m, newdata = test_data)
rf_mse[i] <- mean((pred - test_data$Sales)^2)
rf_models[[i]] <- rf_m
cat(sprintf(" m = %2d Test MSE = %.4f\n", m, rf_mse[i]))
}
## m = 2 Test MSE = 3.5562
## m = 3 Test MSE = 2.9463
## m = 4 Test MSE = 2.6539
## m = 5 Test MSE = 2.4476
## m = 7 Test MSE = 2.3797
## m = 10 Test MSE = 2.3675
# Best m
best_idx <- which.min(rf_mse)
best_m <- m_values[best_idx]
cat(sprintf("\n(e) Best m = %d → Test MSE = %.4f\n", best_m, rf_mse[best_idx]))
##
## (e) Best m = 10 → Test MSE = 2.3675
# Plot MSE vs m
par(mfrow = c(1, 1))
plot(m_values, rf_mse, type = "b", pch = 19, col = "steelblue",
xlab = "m (variables per split)", ylab = "Test MSE",
main = "Random Forest: Test MSE vs m")
abline(v = best_m, lty = 2, col = "firebrick")
legend("topright", legend = paste("Best m =", best_m), lty = 2, col = "firebrick")

# Variable importance from best RF
best_rf <- rf_models[[best_idx]]
cat("\nVariable Importance (Random Forest, best m):\n")
##
## Variable Importance (Random Forest, best m):
rf_imp <- importance(best_rf)
print(round(rf_imp, 3))
## %IncMSE IncNodePurity
## CompPrice 30.052 202.736
## Income 10.095 102.025
## Advertising 13.982 111.508
## Population 0.208 64.883
## Price 55.105 441.580
## ShelveLoc 55.667 426.688
## Age 10.934 140.304
## Education -0.565 42.532
## Urban 2.778 8.478
## US -0.778 4.592
varImpPlot(best_rf, main = paste("Variable Importance — Random Forest (m =", best_m, ")"))

BART
# Prepare matrices (BART needs numeric matrices)
x_train <- model.matrix(Sales ~ . - 1, data = train_data)
x_test <- model.matrix(Sales ~ . - 1, data = test_data)
y_train <- train_data$Sales
y_test <- test_data$Sales
# Fit BART (default: 200 trees, 1000 MCMC draws after 100 burn-in)
bart_fit <- gbart(
x.train = x_train,
y.train = y_train,
x.test = x_test
)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 12, 200
## y1,yn: -3.732150, -7.112150
## x1,x[n*p]: 116.000000, 1.000000
## xp1,xp[np*p]: 115.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 62 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.219563,7.64215
## *****sigma: 1.061683
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 2s
## trcnt,tecnt: 1000,1000
# Test MSE (posterior mean predictions)
bart_pred <- bart_fit$yhat.test.mean
bart_mse <- mean((bart_pred - y_test)^2)
cat(sprintf("\n(f) BART Test MSE: %.4f\n", bart_mse))
##
## (f) BART Test MSE: 1.7253
# Variable inclusion proportions (proxy for importance)
ord <- order(bart_fit$varcount.mean, decreasing = TRUE)
cat("\nBART — Variable inclusion counts (higher = more important):\n")
##
## BART — Variable inclusion counts (higher = more important):
print(round(bart_fit$varcount.mean[ord], 2))
## Price ShelveLocGood USYes CompPrice ShelveLocBad
## 25.65 21.27 20.54 20.36 20.29
## Age ShelveLocMedium Population Income Education
## 19.95 19.37 19.07 18.92 17.82
## UrbanYes Advertising
## 17.72 16.48