library(MASS)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Preparing data:
set.seed(1)
data(Boston)
train_idx <- sample(1:nrow(Boston), nrow(Boston)/2)
train <- Boston[train_idx, ]
test <- Boston[-train_idx, ]
# Defining mtry and ntree ranges:
mtry_vals <- c(2, 4, 6, 8, 10, 13)
ntree_vals <- c(25, 100, 200, 300, 400, 500)
# Creating a results data frame:
results <- expand.grid(mtry = mtry_vals, ntree = ntree_vals)
results$TestMSE <- NA
# Fitting random forests and computing test MSE:
for (i in 1:nrow(results)) {
  fit <- randomForest(medv ~ ., data = train,
                      mtry = results$mtry[i],
                      ntree = results$ntree[i])
  pred <- predict(fit, newdata = test)
  results$TestMSE[i] <- mean((pred - test$medv)^2)
}
# Plotting test error:
ggplot(results, aes(x = mtry, y = TestMSE, color = as.factor(ntree), group = ntree)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(title = "Test MSE vs mtry for Different ntree Values",
       x = "mtry (Number of variables tried at each split)",
       y = "Test Mean Squared Error (MSE)",
       color = "ntree") +
  theme_minimal()
When tuning “mtry” (the number of variables considered at each split), the test MSE on the Boston housing data follows a clear U-shaped pattern for every choice of ntree. With very small mtry (e.g. 2), the model underfits: too few candidate predictors at each split produce overly simplistic trees and high error. As mtry rises to around 4–6, the model strikes the best bias–variance balance, yielding the lowest MSE (roughly 18–19). Beyond that point, allowing most or all of the variables at each split reduces bias but makes the individual trees more alike, so averaging removes less variance and the error climbs back into the low 20s.
Increasing “ntree” (the number of trees) consistently lowers and stabilizes test error, but with diminishing returns. A forest of only 25 trees is both noisy and relatively inaccurate. Growing up to about 200 trees yields the largest reduction in MSE and smooths out variability across different mtry settings. Beyond 200–300 trees, additional trees make only marginal improvements, indicating that the ensemble has already converged to a stable estimate.
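To see this convergence directly, a minimal sketch (assuming a representative mtry of 6, which is not fixed anywhere above) is to grow one 500-tree forest and plot the per-tree out-of-bag MSE that randomForest records:
# Sketch: OOB MSE as a function of the number of trees, at a hypothetical mtry of 6
rf.check <- randomForest(medv ~ ., data = train, mtry = 6, ntree = 500)
plot(rf.check$mse, type = "l",
     xlab = "Number of trees", ylab = "OOB MSE",
     main = "OOB error vs number of trees (mtry = 6)")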
In summary, these results suggest that a random forest with mtry ≈ 4–6 and on the order of 200–300 trees delivers near-optimal performance on this dataset. Pushing mtry much higher moves the forest toward plain bagging and raises the test error, while growing hundreds more trees adds computational cost for little extra benefit.
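As a quick numerical check, the best-performing combination can be read straight off the results grid computed above (a sketch; the exact row depends on the random seed):
# Sketch: extract the (mtry, ntree) combination with the lowest test MSE
results[which.min(results$TestMSE), ]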
library(ISLR)
set.seed(42)
n <- nrow(Carseats)
train_idx <- sample(n, n/2)
train <- Carseats[train_idx, ]
test <- Carseats[-train_idx, ]
library(tree)
tree.carseats <- tree(Sales ~ ., data = train)
plot(tree.carseats)
text(tree.carseats, pretty = 0)
pred.tree <- predict(tree.carseats, newdata = test)
mse.tree <- mean((pred.tree - test$Sales)^2)
cat("Regression tree test MSE:", round(mse.tree, 3), "\n\n")
## Regression tree test MSE: 5.686
The test MSE obtained is 5.686.
cv.carseats <- cv.tree(tree.carseats)
# For Inspection:
plot(cv.carseats$size, cv.carseats$dev, type="b")
best.size <- cv.carseats$size[which.min(cv.carseats$dev)]
pruned <- prune.tree(tree.carseats, best = best.size)
# Test‐set MSE after pruning:
pred.pruned <- predict(pruned, newdata = test)
mse.pruned <- mean((pred.pruned - test$Sales)^2)
cat("Pruned tree (size =", best.size,") test MSE:", round(mse.pruned, 3), "\n\n")
## Pruned tree (size = 15 ) test MSE: 5.394
The test MSE after pruning to the cross-validation-selected size of 15 is 5.394, slightly lower than the 5.686 of the unpruned regression tree, so pruning gives only a modest improvement here.
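Since cross-validation selected a relatively large tree (15 terminal nodes), it can be instructive to see how much accuracy a more aggressive prune gives up; the sketch below uses a hypothetical size of 8 terminal nodes:
# Sketch: prune to a hypothetical smaller size (8 terminal nodes) and recompute test MSE
pruned8 <- prune.tree(tree.carseats, best = 8)
mse.pruned8 <- mean((predict(pruned8, newdata = test) - test$Sales)^2)
cat("Pruned tree (size = 8) test MSE:", round(mse.pruned8, 3), "\n")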
# Bagging = randomForest with mtry = p
p <- ncol(train) - 1
bag <- randomForest(Sales ~ ., data = train, mtry = p, ntree = 500, importance = TRUE)
pred.bag <- predict(bag, newdata = test)
mse.bag <- mean((pred.bag - test$Sales)^2)
cat("Bagging test MSE:", round(mse.bag, 3), "\n\n")
## Bagging test MSE: 2.36
cat("Variable importance (bagging):\n")
## Variable importance (bagging):
print(importance(bag)[, c("%IncMSE","IncNodePurity")])
## %IncMSE IncNodePurity
## CompPrice 30.4417501 197.998343
## Income 9.2101020 104.302860
## Advertising 12.7498522 104.098442
## Population 2.4997582 62.703767
## Price 54.8337819 432.196870
## ShelveLoc 56.8325702 429.294512
## Age 10.4795981 133.921947
## Education -0.5607019 42.616731
## Urban 1.8401309 10.026923
## US 0.8045295 5.682508
cat("\n")
From the results above, bagging reduces the test MSE dramatically, to 2.36. The most important variables are ShelveLoc, Price, and CompPrice.
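The importance table can also be inspected graphically; a minimal sketch using randomForest's built-in importance plot for the bagged model:
# Sketch: visualize both importance measures for the bagged model
varImpPlot(bag, main = "Variable importance (bagging)")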
# Trying default mtry = p/3, and exploring effect of different mtry:
mtry.default <- floor(p/3)
rf.default <- randomForest(Sales ~ ., data = train,
                           mtry = mtry.default, ntree = 500, importance = TRUE)
pred.rf <- predict(rf.default, newdata = test)
mse.rf <- mean((pred.rf - test$Sales)^2)
cat("RF (mtry =", mtry.default, ") test MSE:", round(mse.rf,3), "\n\n")
## RF (mtry = 3 ) test MSE: 2.877
cat("Variable importance (RF):\n")
## Variable importance (RF):
print(importance(rf.default)[, c("%IncMSE","IncNodePurity")])
## %IncMSE IncNodePurity
## CompPrice 15.9428967 157.54796
## Income 7.3743194 130.27268
## Advertising 9.5571748 130.03211
## Population -0.5072071 100.52536
## Price 33.6029882 354.00325
## ShelveLoc 37.0599948 323.24520
## Age 9.3104350 172.18275
## Education 0.2414226 86.51519
## Urban 1.0532783 16.66863
## US 1.3390775 17.10398
cat("\n")
# Effect of mtry on test error:
mtry.grid <- c(2, 4, 6, 8, 10)
mse.mtry <- sapply(mtry.grid, function(m) {
  rf.tmp <- randomForest(Sales ~ ., data = train,
                         mtry = m, ntree = 500)
  pred.tmp <- predict(rf.tmp, newdata = test)
  mean((pred.tmp - test$Sales)^2)
})
plot(mtry.grid, mse.mtry, type = "b", pch = 19,
     xlab = "mtry", ylab = "Test MSE",
     main = "Effect of mtry on RF Test MSE")
With mtry equal to 3, the test MSE is 2.877. This is a large improvement over the single regression tree (MSE ≈ 5.686) and the pruned tree (MSE ≈ 5.394), demonstrating the power of ensemble learning in reducing prediction error, although on this split it is slightly higher than the bagging MSE of 2.36. The two most important predictors are again ShelveLoc and Price; they have the greatest impact on both predictive accuracy (%IncMSE) and node purity (IncNodePurity).
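To compare how bagging and the random forest rank the predictors, a short sketch using the importance matrices computed above:
# Sketch: compare %IncMSE rankings from bagging and the random forest (rank 1 = most important)
imp.compare <- data.frame(
  Bagging      = rank(-importance(bag)[, "%IncMSE"]),
  RandomForest = rank(-importance(rf.default)[, "%IncMSE"])
)
imp.compare[order(imp.compare$Bagging), ]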
From the plot, the test MSE decreases sharply as mtry increases from 2 to 6. Beyond mtry = 6 the curve largely levels off, with only small marginal changes at mtry = 8 or 10. This pattern reflects the bias–variance trade-off: allowing more candidate variables per split reduces bias, but past a point the additional variables add little.
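As a check on the plot (a sketch using the mse.mtry values computed above):
# Sketch: report the mtry value with the lowest test MSE from the grid above
best.mtry <- mtry.grid[which.min(mse.mtry)]
cat("Lowest RF test MSE:", round(min(mse.mtry), 3), "at mtry =", best.mtry, "\n")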
library(BART)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: survival
# Preparing design matrix:
x.train <- model.matrix(Sales ~ . -1, data = train)
y.train <- train$Sales
x.test <- model.matrix(Sales ~ . -1, data = test)
# Fitting BART:
bart.fit <- gbart(x.train, y.train, x.test = x.test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 200, 12, 200
## y1,yn: -3.732150, -7.112150
## x1,x[n*p]: 116.000000, 1.000000
## xp1,xp[np*p]: 115.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 62 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.219563,7.64215
## *****sigma: 1.061683
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 1s
## trcnt,tecnt: 1000,1000
pred.bart <- bart.fit$yhat.test.mean
mse.bart <- mean((pred.bart - test$Sales)^2)
cat("BART test MSE:", round(mse.bart,3), "\n")
## BART test MSE: 1.684
So far, the best test MSE (1.684) is obtained by BART. It captures complex nonlinear interactions while avoiding overfitting, thanks to its Bayesian regularization and averaging across many trees: the priors control tree depth and the smoothness of the fitted function. Hence BART clearly outperforms bagging and random forests on this dataset.
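To put the Carseats results in one place, a short sketch collecting the test MSEs computed above:
# Sketch: collect the Carseats test MSEs from the methods fitted above
data.frame(
  Method  = c("Regression tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  TestMSE = round(c(mse.tree, mse.pruned, mse.bag, mse.rf, mse.bart), 3)
)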
library(ISLR)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# Loading data:
data(Caravan)
train_idx <- 1:1000
Caravan.train <- Caravan[train_idx, ]
Caravan.test <- Caravan[-train_idx, ]
Purchase.test <- Caravan$Purchase[-train_idx]
# Preparing data for boosting: numeric {0,1} response
Caravan_boost_train <- Caravan.train
Caravan_boost_train$Purchase01 <- ifelse(Caravan_boost_train$Purchase == "Yes", 1, 0)
Caravan_boost_train$Purchase <- NULL
names(Caravan_boost_train)[names(Caravan_boost_train) == "Purchase01"] <- "Purchase"
Caravan_boost_test <- Caravan.test
Caravan_boost_test$Purchase01 <- ifelse(Caravan_boost_test$Purchase == "Yes", 1, 0)
Caravan_boost_test$Purchase <- NULL
names(Caravan_boost_test)[names(Caravan_boost_test) == "Purchase01"] <- "Purchase"
shrinkage_vals <- c(0.01, 0.05, 0.1, 0.2)
shrinkage_results <- data.frame(Shrinkage = shrinkage_vals, TestError = NA)
set.seed(1)
for (i in seq_along(shrinkage_vals)) {
  model <- gbm(
    formula = Purchase ~ .,
    data = Caravan_boost_train,
    distribution = "bernoulli",
    n.trees = 1000,
    shrinkage = shrinkage_vals[i],
    verbose = FALSE
  )
  probs <- predict(
    model,
    newdata = Caravan_boost_test,
    n.trees = 1000,
    type = "response"
  )
  pred <- ifelse(probs > 0.2, 1, 0)
  shrinkage_results$TestError[i] <- mean(pred != Caravan_boost_test$Purchase)
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
print(shrinkage_results)
## Shrinkage TestError
## 1 0.01 0.07859809
## 2 0.05 0.10700954
## 3 0.10 0.11737868
## 4 0.20 0.13210286
# Choosing best lambda:
best_lambda <- shrinkage_results$Shrinkage[which.min(shrinkage_results$TestError)]
cat("Best shrinkage (lambda):", best_lambda, "\n")
## Best shrinkage (lambda): 0.01
# Refitting boosting with best lambda and showing variable importance:
set.seed(1)
best_boost <- gbm(
  formula = Purchase ~ .,
  data = Caravan_boost_train,
  distribution = "bernoulli",
  n.trees = 1000,
  shrinkage = best_lambda,
  verbose = FALSE
)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(best_boost)
## var rel.inf
## PPERSAUT PPERSAUT 14.63504779
## MKOOPKLA MKOOPKLA 9.47091649
## MOPLHOOG MOPLHOOG 7.31457416
## MBERMIDD MBERMIDD 6.08651965
## PBRAND PBRAND 4.66766122
## MGODGE MGODGE 4.49463264
## ABRAND ABRAND 4.32427755
## MINK3045 MINK3045 4.17590619
## MOSTYPE MOSTYPE 2.86402583
## PWAPART PWAPART 2.78191075
## MAUT1 MAUT1 2.61929152
## MBERARBG MBERARBG 2.10480508
## MSKA MSKA 2.10185152
## MAUT2 MAUT2 2.02172510
## MSKC MSKC 1.98684345
## MINKGEM MINKGEM 1.92122708
## MGODPR MGODPR 1.91777542
## MBERHOOG MBERHOOG 1.80710618
## MGODOV MGODOV 1.78693913
## PBYSTAND PBYSTAND 1.57279593
## MSKB1 MSKB1 1.43551401
## MFWEKIND MFWEKIND 1.37264255
## MRELGE MRELGE 1.20805179
## MOPLMIDD MOPLMIDD 0.93791970
## MINK7512 MINK7512 0.92590720
## MINK4575 MINK4575 0.91745993
## MGODRK MGODRK 0.90765539
## MFGEKIND MFGEKIND 0.85745374
## MZPART MZPART 0.82531066
## MRELOV MRELOV 0.80731252
## MINKM30 MINKM30 0.74126812
## MHKOOP MHKOOP 0.73690793
## MZFONDS MZFONDS 0.71638323
## MAUT0 MAUT0 0.71388052
## MHHUUR MHHUUR 0.59287247
## APERSAUT APERSAUT 0.58056986
## MOSHOOFD MOSHOOFD 0.58029563
## MSKB2 MSKB2 0.53885275
## PLEVEN PLEVEN 0.53052444
## MINK123M MINK123M 0.50660603
## MBERARBO MBERARBO 0.48596479
## MGEMOMV MGEMOMV 0.47614792
## PMOTSCO PMOTSCO 0.46163590
## MSKD MSKD 0.39735297
## MBERBOER MBERBOER 0.36417546
## MGEMLEEF MGEMLEEF 0.26166240
## MFALLEEN MFALLEEN 0.21448118
## MBERZELF MBERZELF 0.15906143
## MOPLLAAG MOPLLAAG 0.05263665
## MAANTHUI MAANTHUI 0.03766014
## MRELSA MRELSA 0.00000000
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## ALEVEN ALEVEN 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
The lowest test error is achieved at λ = 0.01 (≈ 7.86%), so that is the best shrinkage rate among the values tried. The single most important predictor is PPERSAUT (contribution to car insurance policies), followed by MKOOPKLA and MOPLHOOG. These variables contribute the largest reductions in the loss function when the model splits on them.
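The shrinkage comparison can also be plotted (a sketch using the shrinkage_results table above):
# Sketch: test error at the 20% cutoff as a function of shrinkage
plot(shrinkage_results$Shrinkage, shrinkage_results$TestError,
     type = "b", pch = 19,
     xlab = "Shrinkage (lambda)", ylab = "Test error (20% cutoff)",
     main = "Boosting test error vs shrinkage")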
# Predicting purchase probabilities on the test set:
boost.probs.test <- predict(
  best_boost,
  newdata = Caravan_boost_test,
  n.trees = 1000,
  type = "response"
)
# Classifying as "Yes" if prob > 0.20:
boost.pred.test <- factor(
  ifelse(boost.probs.test > 0.2, "Yes", "No"),
  levels = c("No", "Yes")
)
# Build confusion matrix against the original Purchase factor:
cm.boost <- table(
Predicted = boost.pred.test,
Actual = Caravan$Purchase[-train_idx]
)
print(cm.boost)
## Actual
## Predicted No Yes
## No 4410 256
## Yes 123 33
# Computing boosting “precision” = P(Actual=Yes | Predicted=Yes):
boost.precision <- cm.boost["Yes","Yes"] / sum(cm.boost["Yes",])
cat("Boosting precision @20% cutoff:", round(boost.precision,4), "\n\n")
## Boosting precision @20% cutoff: 0.2115
## KNN:
# Standardizing predictors (excluding the Purchase column):
Xall <- scale(Caravan[ , setdiff(names(Caravan), "Purchase")])
X.train <- Xall[ train_idx, ]
X.test <- Xall[-train_idx, ]
y.train <- Caravan$Purchase[train_idx]
y.test <- Caravan$Purchase[-train_idx]
# Trying a grid of k–values:
k.grid <- c(1,3,5,7,9,11,15)
knn.res <- data.frame(k=k.grid, Precision=NA, TestError=NA)
set.seed(1)
for (i in seq_along(k.grid)) {
  k <- k.grid[i]
  pred <- knn(X.train, X.test, y.train, k = k)
  cm <- table(Pred = pred, True = y.test)
  # precision = true positives / predicted positives
  prec <- if ("Yes" %in% rownames(cm)) cm["Yes", "Yes"] / sum(cm["Yes", ]) else 0
  err <- mean(pred != y.test)
  knn.res[i, c("Precision", "TestError")] <- c(prec, err)
}
print(knn.res)
## k Precision TestError
## 1 1 0.1187500 0.11053505
## 2 3 0.2066116 0.07465782
## 3 5 0.2702703 0.06345915
## 4 7 0.1111111 0.06138532
## 5 9 0.0000000 0.06014102
## 6 11 NaN 0.05993364
## 7 15 NaN 0.05993364
# Comparing:
best.knn.idx <- which.max(knn.res$Precision)
cat("Best KNN precision:", round(knn.res$Precision[best.knn.idx],4),
"at k =", knn.res$k[best.knn.idx], "\n")
## Best KNN precision: 0.2703 at k = 5
cat("Boosting precision:", round(boost.precision,4), "\n")
## Boosting precision: 0.2115
From the results above, 21.15% of the customers that the boosting model (at the 20% probability cutoff) predicts will purchase actually do so, i.e. roughly 1 in 5 predicted buyers is a real buyer. With k = 5, KNN achieves a precision of 27.03%, i.e. slightly more than 1 in 4 predicted buyers is a real buyer. Hence, at this threshold, KNN (k = 5) outperforms the boosting model in precision: a larger fraction of the customers it flags as likely buyers actually purchase.
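For context, both precisions are best judged against the base rate of buyers in the test set, which the sketch below computes; the KNN test errors of about 0.06 at k = 11 and 15 (where essentially no one is predicted to buy) already suggest that only around 6% of test customers purchase, so both models identify buyers at well above chance.
# Sketch: base rate of purchasers in the test set, for comparison with the precisions above
cat("Test-set purchase rate:", round(mean(Purchase.test == "Yes"), 4), "\n")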