library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
trainDat <- Boston[train, ]
testDat <- Boston[-train, ]
mList <- 3:12
ranFList <- lapply(mList, function(m) {
  randomForest(x = trainDat[, -14], y = trainDat$medv,
               xtest = testDat[, -14], ytest = testDat$medv,
               ntree = 700, mtry = m)
})
testErrVals <- do.call('c', lapply(ranFList, function(t) t$test$mse))
MSEDat <- data.frame(
  TestErr = testErrVals,
  mVal = factor(rep(3:12, each = 700)),
  nTree = rep(1:700, 10)
)
ggplot(MSEDat, aes(x = nTree, y = TestErr, colour = mVal)) +
  geom_line() +
  labs(x = "Number of trees", y = "Test Error", colour = "Value of m")
The optimal value of m appears to be 5 or 6, balancing accuracy and stability. We also do not need 700 trees: roughly 100–200 trees are sufficient, as the test-error curves flatten beyond that point. The quick check below prints the final-tree test MSE for each value of m.
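A small follow-up using only the fits already stored in ranFList (the exact numbers depend on the seed):
# Test MSE at ntree = 700 for each candidate value of m
finalMSE <- sapply(ranFList, function(f) f$test$mse[700])
names(finalMSE) <- mList
round(finalMSE, 3)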
library(ISLR)
library(tree)
library(randomForest)
set.seed(1)
train <- sample(1:nrow(Carseats), nrow(Carseats)/2)
trainDat <- Carseats[train, ]
testDat <- Carseats[-train, ]
tree1 <- tree(Sales ~ ., data = trainDat)
plot(tree1)
text(tree1, pretty = 1, cex = 0.5)
testErrUP <- mean((testDat$Sales - predict(tree1, newdata = testDat))^2)
testErrUP
## [1] 4.922039
Interpretation of the unpruned tree:

- Root node: the first split is on ShelveLoc (Bad/Medium vs. Good). A good shelf location sends observations to the right subtree, which generally has higher predicted Sales.
- Left subtree: splits further on Price, CompPrice, Advertising, and Age; lower prices and more advertising generally lead to higher sales.
- Right subtree: Price remains a major factor, showing that even with a good location, very high prices reduce sales, especially with little or no advertising.

The test MSE is about 4.92, i.e. the tree's squared prediction error for Sales averages roughly 4.9 (in squared units) on the test set. The variables used in the splits can be confirmed with the check below.
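A small check using only the tree1 object fitted above:
# Variables actually used in the splits, number of terminal nodes,
# and residual mean deviance of the fitted tree
summary(tree1)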
set.seed(1)
cvTree1 <- cv.tree(tree1)
plot(cvTree1$size, cvTree1$dev, type = "l")
treeFinal <- prune.tree(tree1, best = 5)
plot(treeFinal)
text(treeFinal, pretty = 1)
testErrFinal <- mean((testDat$Sales - predict(treeFinal, newdata = testDat))^2)
testErrFinal
## [1] 5.186482
The pruned tree's test MSE (~5.19) is slightly worse than the unpruned tree's (~4.92), so pruning simplifies the tree here without improving predictive accuracy. The CV-selected size can also be chosen programmatically rather than by eye, as sketched below.
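A sketch of that programmatic choice, using only cvTree1 and tree1 from above (if the minimum-deviance size equals the full tree, pruning changes nothing):
# Size with the lowest cross-validated deviance
bestSize <- cvTree1$size[which.min(cvTree1$dev)]
bestSize
# Prune to that size and recompute the test MSE
treeCV <- prune.tree(tree1, best = bestSize)
mean((testDat$Sales - predict(treeCV, newdata = testDat))^2)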
p <- ncol(Carseats) - 1
set.seed(1)
bagg1 <- randomForest(x = trainDat[, -1], y = trainDat$Sales,
                      xtest = testDat[, -1], ytest = testDat$Sales,
                      mtry = p, ntree = 500)
which.min(bagg1$test$mse)
## [1] 401
bagg1$test$mse[139]
## [1] 2.665216
set.seed(1)
bagg2 <- randomForest(x = trainDat[, -1], y = trainDat$Sales,
                      xtest = testDat[, -1], ytest = testDat$Sales,
                      mtry = p, ntree = 139, importance = TRUE)
varImpPlot(bagg2)
The bagging test MSE (with 139 trees) is about 2.66, a large improvement over the single tree. Note that which.min points to 401 trees in this run while the refit uses 139; bagging error curves typically flatten well before that, so the choice makes little practical difference. Variables tied directly to cost and visibility (Price, CompPrice, ShelveLoc, Advertising) are the most predictive of Sales, while socioeconomic factors (Income, US, Education) contribute less. The numeric importance scores behind the plot are pulled out below.
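A small extraction of those scores from bagg2 (fitted with importance = TRUE):
# Permutation importance (%IncMSE) and node-purity importance, sorted
imp <- importance(bagg2)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]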
set.seed(1)
p <- ncol(Carseats) - 1
rf1 <- randomForest(x = trainDat[, -1], y = trainDat$Sales,
                    xtest = testDat[, -1], ytest = testDat$Sales,
                    mtry = ceiling(sqrt(p)), importance = TRUE)
rf2 <- randomForest(x = trainDat[, -1], y = trainDat$Sales,
                    xtest = testDat[, -1], ytest = testDat$Sales,
                    mtry = ceiling(p / 3), importance = TRUE)
rf1$test$mse[500]
## [1] 2.787584
rf2$test$mse[500]
## [1] 2.784727
varImpPlot(rf1)
varImpPlot(rf2)
Changing mtry has little impact on the test MSE in this particular run (~2.79 vs. ~2.78). In general, a lower mtry produces more decorrelated trees, which reduces the variance of the ensemble, while a higher mtry makes the forest behave more like bagging, with potentially lower bias. A fuller sweep over mtry is sketched below.
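A sketch of that sweep, reusing the Carseats split and p from above (numbers will vary with the seed, and the loop takes a little while to run):
# Final-tree test MSE for every possible mtry value
set.seed(1)
mseByM <- sapply(1:p, function(m) {
  fit <- randomForest(x = trainDat[, -1], y = trainDat$Sales,
                      xtest = testDat[, -1], ytest = testDat$Sales,
                      mtry = m, ntree = 500)
  fit$test$mse[500]
})
names(mseByM) <- 1:p
round(mseByM, 3)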
library(dbarts)
x_train <- trainDat[, -1]
y_train <- trainDat$Sales
x_test <- testDat[, -1]
y_test <- testDat$Sales
set.seed(123)
bart_fit <- bart(x.train = x_train, y.train = y_train,
                 x.test = x_test, verbose = FALSE)
pred_bart <- bart_fit$yhat.test.mean
test_mse_bart <- mean((testDat$Sales - pred_bart)^2)
print(test_mse_bart)
## [1] 1.46148
print(summary(bart_fit))
## Length Class Mode
## call 5 -none- call
## first.sigma 100 -none- numeric
## sigma 1000 -none- numeric
## sigest 1 -none- numeric
## yhat.train 200000 -none- numeric
## yhat.train.mean 200 -none- numeric
## yhat.test 200000 -none- numeric
## yhat.test.mean 200 -none- numeric
## varcount 12000 -none- numeric
## y 200 -none- numeric
## n.chains 1 -none- numeric
The BART test MSE is 1.46, the lowest of all the models fitted here: it outperforms the unpruned tree (~4.92), the pruned tree (~5.19), bagging (~2.66), and the random forests (~2.78–2.79). This suggests BART captures the underlying structure of the data more effectively than the other methods. Because it is Bayesian, BART also provides uncertainty quantification alongside its point predictions, so it is the best choice among these models for this regression task. The comparison is collected in the table below.
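Gathering the test MSEs computed above into one data frame (only objects already created above are used; exact values depend on the seeds):
# Side-by-side comparison of the Carseats test MSEs
data.frame(
  Model = c("Unpruned tree", "Pruned tree (5 nodes)", "Bagging (139 trees)",
            "Random forest (mtry = sqrt(p))", "Random forest (mtry = p/3)", "BART"),
  TestMSE = c(testErrUP, testErrFinal, bagg2$test$mse[139],
              rf1$test$mse[500], rf2$test$mse[500], test_mse_bart)
)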
library(ISLR)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(class)
library(ggplot2)
Caravan$PurchaseN <- ifelse(Caravan$Purchase == "Yes", 1, 0)
trainDat <- Caravan[1:1000, ]
testDat <- Caravan[1001:nrow(Caravan), ]
set.seed(1)
gbm1 <- gbm(PurchaseN ~ . - Purchase, data = trainDat,
            distribution = "bernoulli", n.trees = 1000,
            shrinkage = 0.01, interaction.depth = 2)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(gbm1)[1:10, ]
Variables related to car ownership, brand preference, and purchasing behavior are the most influential in determining whether a customer will make a purchase. This matches intuition: people with access to a vehicle and with clear brand preferences are more likely to buy. Boosting picks up these patterns automatically through its stage-wise tree-building process; a partial-dependence sketch for one of the top variables follows.
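A partial-dependence sketch for one highly ranked predictor. PPERSAUT (a car-policy variable) is used here only as an example; substitute whichever variable tops the relative-influence table in your run. The vertical axis is on the log-odds (link) scale.
# Partial dependence of the boosted model on a single predictor
plot(gbm1, i.var = "PPERSAUT", n.trees = 1000)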
pred_probs <- predict(gbm1, newdata = testDat, n.trees = 1000, type = "response")
pred_labels <- ifelse(pred_probs > 0.2, "Yes", "No")
conf_matrix <- table(Observed = testDat$Purchase, Predicted = pred_labels)
print(conf_matrix)
## Predicted
## Observed No Yes
## No 4359 174
## Yes 252 37
correct_fraction <- mean(testDat$Purchase[pred_labels == "Yes"] == "Yes")
print(correct_fraction)
## [1] 0.1753555
About 17.5% of the customers predicted to purchase actually did (37 of the 211 predicted "Yes"); the same figure can be read straight off the confusion matrix, as below.
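Computing the precision directly from conf_matrix:
# Fraction of predicted purchasers who actually purchased: 37 / (174 + 37)
conf_matrix["Yes", "Yes"] / sum(conf_matrix[, "Yes"])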
Let's see how KNN compares.
neighbors <- 1:20
knnPreds <- lapply(neighbors, function(k) {
  knn(trainDat[, -(86:87)], testDat[, -(86:87)], cl = trainDat$Purchase, k = k)
})
testErr <- sapply(knnPreds, function(pred) mean(pred != testDat$Purchase))
kVal <- which.min(testErr)
knnModFinal <- knn(trainDat[, -(86:87)], testDat[, -(86:87)],
                   cl = trainDat$Purchase, k = kVal)
table(obs = testDat$Purchase, pred = knnModFinal)
## pred
## obs No Yes
## No 4533 0
## Yes 289 0
mean(testDat$Purchase[knnModFinal == "Yes"] == "Yes")
## [1] NaN
KNN with the error-minimizing k predicted no purchasers at all, so it identifies none of the actual buyers and its precision is undefined (hence the NaN). Boosting is clearly more effective in this context, especially when the goal is to identify potential buyers rather than simply minimize the overall error rate. One likely culprit is that the predictors were not standardized, which matters for a distance-based method; a standardized rerun is sketched below.
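A hedged sketch, not part of the original analysis: rerun KNN after scaling all 85 predictors over the full Caravan data, keeping the same k; it may still find few positives.
# Standardize the predictors, then redo KNN with the previously chosen k
stdX <- scale(Caravan[, -(86:87)])
knnStd <- knn(stdX[1:1000, ], stdX[1001:nrow(Caravan), ],
              cl = trainDat$Purchase, k = kVal)
table(obs = testDat$Purchase, pred = knnStd)
mean(testDat$Purchase[knnStd == "Yes"] == "Yes")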
Let's check logistic regression.
glm_model <- glm(Purchase ~ ., data = trainDat, family = binomial)
## Warning: glm.fit: algorithm did not converge
glm_probs <- predict(glm_model, newdata = testDat, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
glm_preds <- ifelse(glm_probs > 0.2, "Yes", "No")
glm_conf <- table(Observed = testDat$Purchase, Predicted = glm_preds)
print(glm_conf)
## Predicted
## Observed No Yes
## No 4533 0
## Yes 0 289
glm_precision <- mean(testDat$Purchase[glm_preds == "Yes"] == "Yes")
print(glm_precision)
## [1] 1
This perfect result is not genuine predictive power. The formula Purchase ~ . includes PurchaseN, which is just a 0/1 recoding of the response, so the model is given the answer it is asked to predict; that is why glm reports non-convergence (perfect separation) and the test "precision" comes out at 100%. The fix is to drop PurchaseN from the predictors, as sketched below.
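A sketch of the corrected fit (output not shown; with 85 predictors and 1,000 training cases, convergence or separation warnings may still appear, but the leakage is gone):
# Refit without PurchaseN, the 0/1 copy of the response
glm_fix <- glm(Purchase ~ . - PurchaseN, data = trainDat, family = binomial)
fix_probs <- predict(glm_fix, newdata = testDat, type = "response")
fix_preds <- ifelse(fix_probs > 0.2, "Yes", "No")
table(Observed = testDat$Purchase, Predicted = fix_preds)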
In summary: boosting identified 37 actual purchasers at a precision of about 17.5%, giving the best balance of usefulness and accuracy for targeting likely buyers. KNN on the unscaled predictors predicted no purchasers at all, so its precision is undefined despite a low overall test error. Logistic regression as fitted above is not comparable because the leaked PurchaseN variable produced an artificial 100% precision; a corrected fit would be needed for a fair comparison.