title: "LAB-9"
output: html_document
date: "2025-04-20"
editor_options:
markdown:
wrap: 72
---
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
set.seed(42)
# Load data
data(Boston)
train_index <- sample(1:nrow(Boston), nrow(Boston) / 2)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
mtry_values <- c(2, 4, 6, 8, 13)
ntree_values <- seq(10, 500, by=10)
# Stores results
results <- data.frame()
for (m in mtry_values) {
for (n in ntree_values) {
rf <- randomForest(medv ~ ., data = train, mtry = m, ntree = n)
pred <- predict(rf, newdata = test)
mse <- mean((pred - test$medv)^2)
results <- rbind(results, data.frame(mtry = m, ntree = n, TestMSE = mse))
}
}
# Plot
ggplot(results, aes(x = ntree, y = TestMSE, color = as.factor(mtry))) +
geom_line() +
labs(
title = "Test MSE for Different mtry and ntree Values",
x = "Number of Trees",
y = "Test MSE",
color = "mtry"
) +
theme_minimal()
Figure: Results from random forests on the Boston housing dataset. The test MSE is plotted as a function of the number of trees for various values of mtry, the number of variables randomly selected at each split.
library(ISLR)
library(tree)
set.seed(1)
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
library(rpart)
library(rpart.plot)
reg_tree <- rpart(Sales ~ ., data = train, method = "anova")
rpart.plot(reg_tree, type = 2, extra = 101, fallen.leaves = TRUE,
main = "Regression Tree for Carseats Sales")
pred <- predict(reg_tree, newdata = test)
mse <- mean((pred - test$Sales)^2)
print(paste("Test MSE:", round(mse, 2)))
## [1] "Test MSE: 5.17"
Test MSE: 5.17
Interpretation: This MSE value indicates the average squared difference between predicted and actual sales values on the test set.
Compared to earlier model using the tree package (which gave MSE = 4.92), this tree from rpart is: - Slightly more interpretable and better visualized - But performing slightly worse in terms of prediction error
library(ISLR)
library(rpart)
library(rpart.plot)
set.seed(1)
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
reg_tree <- rpart(Sales ~ ., data = train, method = "anova", cp = 0.001)
printcp(reg_tree)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = train, method = "anova", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Price ShelveLoc Urban
## [7] US
##
## Root node error: 1572.7/200 = 7.8634
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.2146655 0 1.00000 1.01810 0.092661
## 2 0.1016064 1 0.78533 0.85580 0.076755
## 3 0.0742035 2 0.68373 0.86999 0.081939
## 4 0.0633277 3 0.60952 0.81915 0.076191
## 5 0.0484641 4 0.54620 0.76465 0.068430
## 6 0.0334277 5 0.49773 0.71550 0.065359
## 7 0.0259139 6 0.46431 0.73310 0.064712
## 8 0.0251275 8 0.41248 0.74708 0.065786
## 9 0.0200723 9 0.38735 0.73815 0.065887
## 10 0.0184373 10 0.36728 0.72888 0.064208
## 11 0.0130752 11 0.34884 0.69277 0.063229
## 12 0.0108066 12 0.33577 0.68953 0.062993
## 13 0.0102719 13 0.32496 0.68010 0.062602
## 14 0.0061469 14 0.31469 0.66950 0.062226
## 15 0.0061176 15 0.30854 0.66788 0.062199
## 16 0.0010000 16 0.30242 0.67196 0.061987
plotcp(reg_tree)
best_cp <- reg_tree$cptable[which.min(reg_tree$cptable[,"xerror"]), "CP"]
cat("Best cp:", best_cp, "\n")
## Best cp: 0.00611765
pruned_tree <- prune(reg_tree, cp = best_cp)
rpart.plot(pruned_tree, type = 2, extra = 101, fallen.leaves = TRUE,
main = "Pruned Regression Tree for Carseats Sales")
pred_pruned <- predict(pruned_tree, newdata = test)
mse_pruned <- mean((pred_pruned - test$Sales)^2)
cat("Test MSE after pruning:", round(mse_pruned, 2), "\n")
## Test MSE after pruning: 5.09
From the output:
Yes, pruning slightly improved the test MSE from 5.17 → 5.09. Although the improvement is small, pruning helps reduce model complexity and the risk of overfitting — especially important for maintaining interpretability and generalization on new data.
bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
bag_preds <- predict(bag_model, newdata = test)
bag_mse <- mean((bag_preds - test$Sales)^2)
print(paste("Test MSE (Bagging):", round(bag_mse, 2)))
## [1] "Test MSE (Bagging): 2.63"
importance(bag_model)
## %IncMSE IncNodePurity
## CompPrice 25.34775256 173.707956
## Income 5.61490543 87.839486
## Advertising 13.20754343 94.375878
## Population -0.31278333 61.236009
## Price 59.26748191 503.350038
## ShelveLoc 47.47747099 372.192543
## Age 17.05951301 160.642531
## Education -0.05364482 45.304788
## Urban -0.25577727 9.726168
## US 5.58481702 18.880187
varImpPlot(bag_model, main = "Variable Importance from Bagging Model")
Test MSE Obtained: The test Mean Squared Error (MSE) using the bagging
model is:2.63
This is a substantial improvement compared to: - Unpruned Tree: 5.17 - Pruned Tree: 5.09 This shows that bagging significantly reduces variance and enhances prediction accuracy.
Variables like Price, ShelveLoc, and CompPrice are by far the most influential in predicting Sales. Other variables like Population, Education, Urban, and US had very low or even negative importance, implying little to no contribution in prediction.
library(randomForest)
set.seed(1)
rf_model <- randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)
rf_preds <- predict(rf_model, newdata = test)
rf_mse <- mean((rf_preds - test$Sales)^2)
print(paste("Test MSE (Random Forests):", round(rf_mse, 2)))
## [1] "Test MSE (Random Forests): 2.96"
importance(rf_model)
## %IncMSE IncNodePurity
## CompPrice 14.8840765 158.82956
## Income 4.3293950 125.64850
## Advertising 8.2215192 107.51700
## Population -0.9488134 97.06024
## Price 34.9793386 385.93142
## ShelveLoc 34.9248499 298.54210
## Age 14.3055912 178.42061
## Education 1.3117842 70.49202
## Urban -1.2680807 17.39986
## US 6.1139696 33.98963
varImpPlot(rf_model, main = "Variable Importance from Random Forest")
# Effect of m (number of variables considered at each split) on error
rate — Based on Your Results
Method m value Test MSE Description
Bagging m = 10 (all predictors) 2.63 Best performance, trees are strong but more correlated Random Forests m = 3 (√p) 2.96 Slightly worse than bagging, but better generalization Single Tree Not applicable 5.17 High variance and lower accuracy
In this analysis, decreasing m from 10 (bagging) to 3 (random forest default) resulted in a slightly higher test MSE (2.63 → 2.96). This suggests that allowing more variables per split increased tree strength and led to better overall predictive accuracy on this dataset.
#install.packages("BART")
#install.packages("dbarts")
library(dbarts)
set.seed(1)
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]
x_train <- train[, -which(names(train) == "Sales")]
y_train <- train$Sales
x_test <- test[, -which(names(test) == "Sales")]
y_test <- test$Sales
bart_model <- bart(x.train = x_train, y.train = y_train,
x.test = x_test, verbose = FALSE)
bart_preds <- bart_model$yhat.test.mean
bart_mse <- mean((bart_preds - y_test)^2)
print(paste("Test MSE (BART):", round(bart_mse, 2)))
## [1] "Test MSE (BART): 1.45"
Test MSE (BART): 1.45
Interpretation:
BART achieved the lowest test MSE of 1.45, outperforming single tree, bagging, and random forests. This confirms that BART’s ability to model complex interactions and account for uncertainty provides superior predictive accuracy on this dataset.
library(ISLR)
data(Caravan)
str(Caravan)
## 'data.frame': 5822 obs. of 86 variables:
## $ MOSTYPE : num 33 37 37 9 40 23 39 33 33 11 ...
## $ MAANTHUI: num 1 1 1 1 1 1 2 1 1 2 ...
## $ MGEMOMV : num 3 2 2 3 4 2 3 2 2 3 ...
## $ MGEMLEEF: num 2 2 2 3 2 1 2 3 4 3 ...
## $ MOSHOOFD: num 8 8 8 3 10 5 9 8 8 3 ...
## $ MGODRK : num 0 1 0 2 1 0 2 0 0 3 ...
## $ MGODPR : num 5 4 4 3 4 5 2 7 1 5 ...
## $ MGODOV : num 1 1 2 2 1 0 0 0 3 0 ...
## $ MGODGE : num 3 4 4 4 4 5 5 2 6 2 ...
## $ MRELGE : num 7 6 3 5 7 0 7 7 6 7 ...
## $ MRELSA : num 0 2 2 2 1 6 2 2 0 0 ...
## $ MRELOV : num 2 2 4 2 2 3 0 0 3 2 ...
## $ MFALLEEN: num 1 0 4 2 2 3 0 0 3 2 ...
## $ MFGEKIND: num 2 4 4 3 4 5 3 5 3 2 ...
## $ MFWEKIND: num 6 5 2 4 4 2 6 4 3 6 ...
## $ MOPLHOOG: num 1 0 0 3 5 0 0 0 0 0 ...
## $ MOPLMIDD: num 2 5 5 4 4 5 4 3 1 4 ...
## $ MOPLLAAG: num 7 4 4 2 0 4 5 6 8 5 ...
## $ MBERHOOG: num 1 0 0 4 0 2 0 2 1 2 ...
## $ MBERZELF: num 0 0 0 0 5 0 0 0 1 0 ...
## $ MBERBOER: num 1 0 0 0 4 0 0 0 0 0 ...
## $ MBERMIDD: num 2 5 7 3 0 4 4 2 1 3 ...
## $ MBERARBG: num 5 0 0 1 0 2 1 5 8 3 ...
## $ MBERARBO: num 2 4 2 2 0 2 5 2 1 3 ...
## $ MSKA : num 1 0 0 3 9 2 0 2 1 1 ...
## $ MSKB1 : num 1 2 5 2 0 2 1 1 1 2 ...
## $ MSKB2 : num 2 3 0 1 0 2 4 2 0 1 ...
## $ MSKC : num 6 5 4 4 0 4 5 5 8 4 ...
## $ MSKD : num 1 0 0 0 0 2 0 2 1 2 ...
## $ MHHUUR : num 1 2 7 5 4 9 6 0 9 0 ...
## $ MHKOOP : num 8 7 2 4 5 0 3 9 0 9 ...
## $ MAUT1 : num 8 7 7 9 6 5 8 4 5 6 ...
## $ MAUT2 : num 0 1 0 0 2 3 0 4 2 1 ...
## $ MAUT0 : num 1 2 2 0 1 3 1 2 3 2 ...
## $ MZFONDS : num 8 6 9 7 5 9 9 6 7 6 ...
## $ MZPART : num 1 3 0 2 4 0 0 3 2 3 ...
## $ MINKM30 : num 0 2 4 1 0 5 4 2 7 2 ...
## $ MINK3045: num 4 0 5 5 0 2 3 5 2 3 ...
## $ MINK4575: num 5 5 0 3 9 3 3 3 1 3 ...
## $ MINK7512: num 0 2 0 0 0 0 0 0 0 1 ...
## $ MINK123M: num 0 0 0 0 0 0 0 0 0 0 ...
## $ MINKGEM : num 4 5 3 4 6 3 3 3 2 4 ...
## $ MKOOPKLA: num 3 4 4 4 3 3 5 3 3 7 ...
## $ PWAPART : num 0 2 2 0 0 0 0 0 0 2 ...
## $ PWABEDR : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWALAND : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSAUT: num 6 0 6 6 0 6 6 0 5 0 ...
## $ PBESAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PMOTSCO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PVRAAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PAANHANG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTRACTOR: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWERKT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBROM : num 0 0 0 0 0 0 0 3 0 0 ...
## $ PLEVEN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSONG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PGEZONG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWAOREG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBRAND : num 5 2 2 2 6 0 0 0 0 3 ...
## $ PZEILPL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPLEZIER: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PFIETS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PINBOED : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBYSTAND: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAPART : num 0 2 1 0 0 0 0 0 0 1 ...
## $ AWABEDR : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWALAND : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSAUT: num 1 0 1 1 0 1 1 0 1 0 ...
## $ ABESAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AMOTSCO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AVRAAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AAANHANG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ ATRACTOR: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWERKT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABROM : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ALEVEN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSONG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AGEZONG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAOREG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABRAND : num 1 1 1 1 1 0 0 0 0 1 ...
## $ AZEILPL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APLEZIER: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AFIETS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AINBOED : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABYSTAND: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purchase: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
table(Caravan$Purchase)
##
## No Yes
## 5474 348
standardized_data <- scale(Caravan[, -86]) # Exclude response variable
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)
train <- Caravan_scaled[1:1000, ]
test <- Caravan_scaled[-(1:1000), ]
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(ISLR)
data(Caravan)
standardized_data <- scale(Caravan[, -86])
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)
train <- Caravan_scaled[1:1000, ]
test <- Caravan_scaled[-(1:1000), ]
train$Purchase <- as.numeric(train$Purchase == "Yes")
test$Purchase <- as.numeric(test$Purchase == "Yes")
shrinkages <- c(0.01, 0.05, 0.1, 0.2)
test_errors <- c()
for (shrink in shrinkages) {
cat("Fitting boosting model with shrinkage =", shrink, "\n")
boost_model <- gbm(Purchase ~ .,
data = train,
distribution = "bernoulli",
n.trees = 1000,
shrinkage = shrink,
interaction.depth = 4,
verbose = FALSE)
probs <- predict(boost_model, newdata = test, n.trees = 1000, type = "response")
preds <- ifelse(probs > 0.5, 1, 0)
error <- mean(preds != test$Purchase)
test_errors <- c(test_errors, error)
cat("Test error:", round(error, 4), "\n\n")
}
## Fitting boosting model with shrinkage = 0.01
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Test error: 0.0645
##
## Fitting boosting model with shrinkage = 0.05
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Test error: 0.0742
##
## Fitting boosting model with shrinkage = 0.1
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Test error: 0.0794
##
## Fitting boosting model with shrinkage = 0.2
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Test error: 0.0825
results <- data.frame(Shrinkage = shrinkages, Test_Error = test_errors)
print(results)
## Shrinkage Test_Error
## 1 0.01 0.06449606
## 2 0.05 0.07424305
## 3 0.10 0.07942762
## 4 0.20 0.08253837
best_lambda <- shrinkages[which.min(test_errors)]
cat("Best shrinkage:", best_lambda, "\n")
## Best shrinkage: 0.01
best_model <- gbm(Purchase ~ ., data = train,
distribution = "bernoulli",
n.trees = 1000,
shrinkage = best_lambda,
interaction.depth = 4,
verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
summary(best_model, n.trees = 1000)
## var rel.inf
## PPERSAUT PPERSAUT 7.26368538
## MGODGE MGODGE 5.16171025
## MOPLHOOG MOPLHOOG 4.68595215
## PBRAND PBRAND 4.50508652
## MOSTYPE MOSTYPE 4.39735198
## MBERMIDD MBERMIDD 3.91818286
## MKOOPKLA MKOOPKLA 3.91508934
## MINK3045 MINK3045 3.25852700
## MGODPR MGODPR 3.17896932
## MSKC MSKC 2.89856400
## MAUT2 MAUT2 2.36710022
## MINK7512 MINK7512 2.24616152
## MSKB1 MSKB1 2.23731161
## MBERARBG MBERARBG 2.18149332
## PWAPART PWAPART 2.14061200
## MSKA MSKA 2.10880965
## MRELGE MRELGE 2.02042964
## MGODOV MGODOV 1.93005099
## MOPLMIDD MOPLMIDD 1.92086347
## MAUT1 MAUT1 1.91666048
## MRELOV MRELOV 1.88967606
## MHHUUR MHHUUR 1.87035639
## MINKM30 MINKM30 1.84160096
## MFGEKIND MFGEKIND 1.77985440
## MBERHOOG MBERHOOG 1.77676408
## MBERARBO MBERARBO 1.74285122
## MFWEKIND MFWEKIND 1.67591089
## MINKGEM MINKGEM 1.67461550
## MZFONDS MZFONDS 1.51129299
## MINK4575 MINK4575 1.44330649
## MFALLEEN MFALLEEN 1.41094267
## MGEMLEEF MGEMLEEF 1.38612876
## MSKB2 MSKB2 1.35936970
## MAUT0 MAUT0 1.30520507
## ABRAND ABRAND 1.26017789
## MZPART MZPART 1.23964552
## MSKD MSKD 1.19974528
## MRELSA MRELSA 1.17071856
## MGODRK MGODRK 1.15793001
## MHKOOP MHKOOP 1.03197291
## MGEMOMV MGEMOMV 0.95969075
## MOSHOOFD MOSHOOFD 0.89115163
## APERSAUT APERSAUT 0.85479547
## MOPLLAAG MOPLLAAG 0.77697408
## MBERZELF MBERZELF 0.68816771
## PMOTSCO PMOTSCO 0.53458414
## PLEVEN PLEVEN 0.37460968
## PBYSTAND PBYSTAND 0.34020481
## MBERBOER MBERBOER 0.25128182
## MINK123M MINK123M 0.21052472
## MAANTHUI MAANTHUI 0.09801030
## ALEVEN ALEVEN 0.03932786
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
The best performing shrinkage value is λ = 0.01, giving a test error of 6.45%, meaning the model correctly predicted ~93.5% of the test observations.
PPERSAUT (Car ownership) is the most influential predictor of whether a person buys caravan insurance.
Demographic and product usage variables like MGODGE (household composition), PBRAND, and MOPLHOOG are also highly influential.
# Use the best model already fitted with shrinkage = 0.01
library(caret)
## Loading required package: lattice
boost_probs <- predict(best_model, newdata = test, n.trees = 1000, type = "response")
boost_preds_20 <- ifelse(boost_probs > 0.2, 1, 0)
actual <- test$Purchase
conf_matrix <- table(Predicted = boost_preds_20, Actual = actual)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 4337 258
## 1 196 31
precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
cat("Precision (purchase rate among predicted purchasers):", round(precision, 4), "\n")
## Precision (purchase rate among predicted purchasers): 0.1366
library(ISLR)
library(class)
data(Caravan)
standardized_data <- scale(Caravan[, -86]) # Exclude response
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)
x_train <- Caravan_scaled[1:1000, -86]
x_test <- Caravan_scaled[-(1:1000), -86]
y_train <- Caravan_scaled[1:1000, "Purchase"]
y_test <- Caravan_scaled[-(1:1000), "Purchase"]
ks <- c(1, 3, 5, 10)
knn_results <- data.frame(k = ks, Test_Error = NA, Precision = NA)
for (i in 1:length(ks)) {
k_val <- ks[i]
knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = k_val)
error <- mean(knn_pred != y_test)
cm <- table(Predicted = knn_pred, Actual = y_test)
if ("Yes" %in% rownames(cm)) {
precision <- cm["Yes", "Yes"] / sum(cm["Yes", ])
} else {
precision <- 0
}
knn_results[i, "Test_Error"] <- error
knn_results[i, "Precision"] <- precision
}
print(knn_results)
## k Test_Error Precision
## 1 1 0.11115720 0.1152648
## 2 3 0.07445044 0.2083333
## 3 5 0.06345915 0.2702703
## 4 10 0.06034840 0.0000000
Precision (Purchase rate among predicted purchasers):
Precision= True Positives / Total Predicted Yes = 31/(196+31) =0.1366 or 13.66%
This means that only 13.66% of predicted buyers actually purchased caravan insurance.
KNN (k = 5) performed the best overall with: - Lowest test error (6.35%) - Highest precision (27.03%)
Using the boosting model with a 20% probability threshold, 13.66% of people predicted to purchase actually did so. This was outperformed by KNN with k = 5, which achieved a higher precision of 27.03% and a slightly lower test error (6.35%). This indicates that KNN, at the right value of k, was more selective and effective at identifying actual purchasers than boosting.