title: "LAB-9"
output: html_document
date: "2025-04-20"
editor_options: 
  markdown: 
    wrap: 72
---

7

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

library(MASS)  
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

set.seed(42)

# Load data
data(Boston)
train_index <- sample(1:nrow(Boston), nrow(Boston) / 2)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]

mtry_values <- c(2, 4, 6, 8, 13)
ntree_values <- seq(10, 500, by=10)

# Stores results
results <- data.frame()

for (m in mtry_values) {
  for (n in ntree_values) {
    rf <- randomForest(medv ~ ., data = train, mtry = m, ntree = n)
    pred <- predict(rf, newdata = test)
    mse <- mean((pred - test$medv)^2)
    results <- rbind(results, data.frame(mtry = m, ntree = n, TestMSE = mse))
  }
}

# Plot
ggplot(results, aes(x = ntree, y = TestMSE, color = as.factor(mtry))) +
  geom_line() +
  labs(
    title = "Test MSE for Different mtry and ntree Values",
    x = "Number of Trees",
    y = "Test MSE",
    color = "mtry"
  ) +
  theme_minimal()

Description of Results

Figure: Results from random forests on the Boston housing dataset. The test MSE is plotted as a function of the number of trees for various values of mtry, the number of variables randomly selected at each split.

When mtry = 2, the model performs significantly worse, resulting in a higher and more unstable test MSE.
For values of mtry = 4, 6, 8, 13, the performance improves, with test errors stabilizing around a lower value as the number of trees increases.
The lowest test MSE is achieved when mtry = 6, suggesting that a moderate number of features at each split provides the best bias-variance trade-off.
Increasing the number of trees helps initially, but after about 100–200 trees, the test error stabilizes, indicating that adding more trees offers marginal improvements.
The model with mtry = 13 (i.e., using all features, similar to bagging) does not perform as well as some of the random forest configurations, highlighting the benefit of randomness in feature selection.

8 (a)

library(ISLR)
library(tree)

set.seed(1)


head(Carseats)

##   Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1  9.50       138     73          11        276   120       Bad  42        17
## 2 11.22       111     48          16        260    83      Good  65        10
## 3 10.06       113     35          10        269    80    Medium  59        12
## 4  7.40       117    100           4        466    97    Medium  55        14
## 5  4.15       141     64           3        340   128       Bad  38        13
## 6 10.81       124    113          13        501    72       Bad  78        16
##   Urban  US
## 1   Yes Yes
## 2   Yes Yes
## 3   Yes Yes
## 4   Yes Yes
## 5   Yes  No
## 6    No Yes

train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]

(b)

library(rpart)
library(rpart.plot)


reg_tree <- rpart(Sales ~ ., data = train, method = "anova")

rpart.plot(reg_tree, type = 2, extra = 101, fallen.leaves = TRUE,
           main = "Regression Tree for Carseats Sales")

pred <- predict(reg_tree, newdata = test)


mse <- mean((pred - test$Sales)^2)
print(paste("Test MSE:", round(mse, 2)))

## [1] "Test MSE: 5.17"

Test MSE: 5.17

Interpretation: This MSE value indicates the average squared difference between predicted and actual sales values on the test set.

Compared to earlier model using the tree package (which gave MSE = 4.92), this tree from rpart is: - Slightly more interpretable and better visualized - But performing slightly worse in terms of prediction error

(c)

library(ISLR)
library(rpart)
library(rpart.plot)

set.seed(1)
train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]


reg_tree <- rpart(Sales ~ ., data = train, method = "anova", cp = 0.001)


printcp(reg_tree)

## 
## Regression tree:
## rpart(formula = Sales ~ ., data = train, method = "anova", cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] Advertising Age         CompPrice   Price       ShelveLoc   Urban      
## [7] US         
## 
## Root node error: 1572.7/200 = 7.8634
## 
## n= 200 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.2146655      0   1.00000 1.01810 0.092661
## 2  0.1016064      1   0.78533 0.85580 0.076755
## 3  0.0742035      2   0.68373 0.86999 0.081939
## 4  0.0633277      3   0.60952 0.81915 0.076191
## 5  0.0484641      4   0.54620 0.76465 0.068430
## 6  0.0334277      5   0.49773 0.71550 0.065359
## 7  0.0259139      6   0.46431 0.73310 0.064712
## 8  0.0251275      8   0.41248 0.74708 0.065786
## 9  0.0200723      9   0.38735 0.73815 0.065887
## 10 0.0184373     10   0.36728 0.72888 0.064208
## 11 0.0130752     11   0.34884 0.69277 0.063229
## 12 0.0108066     12   0.33577 0.68953 0.062993
## 13 0.0102719     13   0.32496 0.68010 0.062602
## 14 0.0061469     14   0.31469 0.66950 0.062226
## 15 0.0061176     15   0.30854 0.66788 0.062199
## 16 0.0010000     16   0.30242 0.67196 0.061987

plotcp(reg_tree)

best_cp <- reg_tree$cptable[which.min(reg_tree$cptable[,"xerror"]), "CP"]
cat("Best cp:", best_cp, "\n")

## Best cp: 0.00611765

pruned_tree <- prune(reg_tree, cp = best_cp)


rpart.plot(pruned_tree, type = 2, extra = 101, fallen.leaves = TRUE,
           main = "Pruned Regression Tree for Carseats Sales")

pred_pruned <- predict(pruned_tree, newdata = test)

mse_pruned <- mean((pred_pruned - test$Sales)^2)
cat("Test MSE after pruning:", round(mse_pruned, 2), "\n")

## Test MSE after pruning: 5.09

Does pruning improve the test MSE?

From the output:

Test MSE before pruning: 5.17
Test MSE after pruning: 5.09
Optimal cp (complexity parameter) chosen by cross-validation: 0.00611765
Pruned tree has fewer terminal nodes, resulting in a simpler and more generalizable model

Yes, pruning slightly improved the test MSE from 5.17 → 5.09. Although the improvement is small, pruning helps reduce model complexity and the risk of overfitting — especially important for maintaining interpretability and generalization on new data.

(d)

bag_model <- randomForest(Sales ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)

bag_preds <- predict(bag_model, newdata = test)

bag_mse <- mean((bag_preds - test$Sales)^2)
print(paste("Test MSE (Bagging):", round(bag_mse, 2)))

## [1] "Test MSE (Bagging): 2.63"

importance(bag_model)

##                 %IncMSE IncNodePurity
## CompPrice   25.34775256    173.707956
## Income       5.61490543     87.839486
## Advertising 13.20754343     94.375878
## Population  -0.31278333     61.236009
## Price       59.26748191    503.350038
## ShelveLoc   47.47747099    372.192543
## Age         17.05951301    160.642531
## Education   -0.05364482     45.304788
## Urban       -0.25577727      9.726168
## US           5.58481702     18.880187

varImpPlot(bag_model, main = "Variable Importance from Bagging Model")

Test MSE Obtained: The test Mean Squared Error (MSE) using the bagging model is:2.63

This is a substantial improvement compared to: - Unpruned Tree: 5.17 - Pruned Tree: 5.09 This shows that bagging significantly reduces variance and enhances prediction accuracy.

Variables like Price, ShelveLoc, and CompPrice are by far the most influential in predicting Sales. Other variables like Population, Education, Urban, and US had very low or even negative importance, implying little to no contribution in prediction.

(e)

library(randomForest)


set.seed(1)


rf_model <- randomForest(Sales ~ ., data = train, mtry = 3, importance = TRUE)

rf_preds <- predict(rf_model, newdata = test)


rf_mse <- mean((rf_preds - test$Sales)^2)
print(paste("Test MSE (Random Forests):", round(rf_mse, 2)))

## [1] "Test MSE (Random Forests): 2.96"

importance(rf_model)

##                %IncMSE IncNodePurity
## CompPrice   14.8840765     158.82956
## Income       4.3293950     125.64850
## Advertising  8.2215192     107.51700
## Population  -0.9488134      97.06024
## Price       34.9793386     385.93142
## ShelveLoc   34.9248499     298.54210
## Age         14.3055912     178.42061
## Education    1.3117842      70.49202
## Urban       -1.2680807      17.39986
## US           6.1139696      33.98963

varImpPlot(rf_model, main = "Variable Importance from Random Forest")

# Effect of m (number of variables considered at each split) on error rate — Based on Your Results

Method m value Test MSE Description

Bagging m = 10 (all predictors) 2.63 Best performance, trees are strong but more correlated Random Forests m = 3 (√p) 2.96 Slightly worse than bagging, but better generalization Single Tree Not applicable 5.17 High variance and lower accuracy

In this analysis, decreasing m from 10 (bagging) to 3 (random forest default) resulted in a slightly higher test MSE (2.63 → 2.96). This suggests that allowing more variables per split increased tree strength and led to better overall predictive accuracy on this dataset.

(f)

#install.packages("BART")       
#install.packages("dbarts")

library(dbarts)


set.seed(1)


train_index <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
train <- Carseats[train_index, ]
test <- Carseats[-train_index, ]


x_train <- train[, -which(names(train) == "Sales")]
y_train <- train$Sales
x_test <- test[, -which(names(test) == "Sales")]
y_test <- test$Sales


bart_model <- bart(x.train = x_train, y.train = y_train, 
                   x.test = x_test, verbose = FALSE)


bart_preds <- bart_model$yhat.test.mean


bart_mse <- mean((bart_preds - y_test)^2)
print(paste("Test MSE (BART):", round(bart_mse, 2)))

## [1] "Test MSE (BART): 1.45"

Test MSE (BART): 1.45

Interpretation:

BART significantly outperformed all other methods on this dataset.
It provides better generalization by combining flexibility of regression trees with Bayesian regularization.
The posterior averaging over many trees helps reduce both bias and variance, leading to a highly accurate model.

BART achieved the lowest test MSE of 1.45, outperforming single tree, bagging, and random forests. This confirms that BART’s ability to model complex interactions and account for uncertainty provides superior predictive accuracy on this dataset.

11 (a)

library(ISLR)
data(Caravan)


str(Caravan)

## 'data.frame':    5822 obs. of  86 variables:
##  $ MOSTYPE : num  33 37 37 9 40 23 39 33 33 11 ...
##  $ MAANTHUI: num  1 1 1 1 1 1 2 1 1 2 ...
##  $ MGEMOMV : num  3 2 2 3 4 2 3 2 2 3 ...
##  $ MGEMLEEF: num  2 2 2 3 2 1 2 3 4 3 ...
##  $ MOSHOOFD: num  8 8 8 3 10 5 9 8 8 3 ...
##  $ MGODRK  : num  0 1 0 2 1 0 2 0 0 3 ...
##  $ MGODPR  : num  5 4 4 3 4 5 2 7 1 5 ...
##  $ MGODOV  : num  1 1 2 2 1 0 0 0 3 0 ...
##  $ MGODGE  : num  3 4 4 4 4 5 5 2 6 2 ...
##  $ MRELGE  : num  7 6 3 5 7 0 7 7 6 7 ...
##  $ MRELSA  : num  0 2 2 2 1 6 2 2 0 0 ...
##  $ MRELOV  : num  2 2 4 2 2 3 0 0 3 2 ...
##  $ MFALLEEN: num  1 0 4 2 2 3 0 0 3 2 ...
##  $ MFGEKIND: num  2 4 4 3 4 5 3 5 3 2 ...
##  $ MFWEKIND: num  6 5 2 4 4 2 6 4 3 6 ...
##  $ MOPLHOOG: num  1 0 0 3 5 0 0 0 0 0 ...
##  $ MOPLMIDD: num  2 5 5 4 4 5 4 3 1 4 ...
##  $ MOPLLAAG: num  7 4 4 2 0 4 5 6 8 5 ...
##  $ MBERHOOG: num  1 0 0 4 0 2 0 2 1 2 ...
##  $ MBERZELF: num  0 0 0 0 5 0 0 0 1 0 ...
##  $ MBERBOER: num  1 0 0 0 4 0 0 0 0 0 ...
##  $ MBERMIDD: num  2 5 7 3 0 4 4 2 1 3 ...
##  $ MBERARBG: num  5 0 0 1 0 2 1 5 8 3 ...
##  $ MBERARBO: num  2 4 2 2 0 2 5 2 1 3 ...
##  $ MSKA    : num  1 0 0 3 9 2 0 2 1 1 ...
##  $ MSKB1   : num  1 2 5 2 0 2 1 1 1 2 ...
##  $ MSKB2   : num  2 3 0 1 0 2 4 2 0 1 ...
##  $ MSKC    : num  6 5 4 4 0 4 5 5 8 4 ...
##  $ MSKD    : num  1 0 0 0 0 2 0 2 1 2 ...
##  $ MHHUUR  : num  1 2 7 5 4 9 6 0 9 0 ...
##  $ MHKOOP  : num  8 7 2 4 5 0 3 9 0 9 ...
##  $ MAUT1   : num  8 7 7 9 6 5 8 4 5 6 ...
##  $ MAUT2   : num  0 1 0 0 2 3 0 4 2 1 ...
##  $ MAUT0   : num  1 2 2 0 1 3 1 2 3 2 ...
##  $ MZFONDS : num  8 6 9 7 5 9 9 6 7 6 ...
##  $ MZPART  : num  1 3 0 2 4 0 0 3 2 3 ...
##  $ MINKM30 : num  0 2 4 1 0 5 4 2 7 2 ...
##  $ MINK3045: num  4 0 5 5 0 2 3 5 2 3 ...
##  $ MINK4575: num  5 5 0 3 9 3 3 3 1 3 ...
##  $ MINK7512: num  0 2 0 0 0 0 0 0 0 1 ...
##  $ MINK123M: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MINKGEM : num  4 5 3 4 6 3 3 3 2 4 ...
##  $ MKOOPKLA: num  3 4 4 4 3 3 5 3 3 7 ...
##  $ PWAPART : num  0 2 2 0 0 0 0 0 0 2 ...
##  $ PWABEDR : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWALAND : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSAUT: num  6 0 6 6 0 6 6 0 5 0 ...
##  $ PBESAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PMOTSCO : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PVRAAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PAANHANG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTRACTOR: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWERKT  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBROM   : num  0 0 0 0 0 0 0 3 0 0 ...
##  $ PLEVEN  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSONG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PGEZONG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWAOREG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBRAND  : num  5 2 2 2 6 0 0 0 0 3 ...
##  $ PZEILPL : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPLEZIER: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PFIETS  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PINBOED : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBYSTAND: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAPART : num  0 2 1 0 0 0 0 0 0 1 ...
##  $ AWABEDR : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWALAND : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSAUT: num  1 0 1 1 0 1 1 0 1 0 ...
##  $ ABESAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMOTSCO : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AVRAAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AAANHANG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ATRACTOR: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWERKT  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABROM   : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ ALEVEN  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSONG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGEZONG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAOREG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABRAND  : num  1 1 1 1 1 0 0 0 0 1 ...
##  $ AZEILPL : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APLEZIER: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AFIETS  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AINBOED : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABYSTAND: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purchase: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

table(Caravan$Purchase)

## 
##   No  Yes 
## 5474  348

standardized_data <- scale(Caravan[, -86])  # Exclude response variable
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)

train <- Caravan_scaled[1:1000, ]
test <- Caravan_scaled[-(1:1000), ]

(b)

library(gbm)

## Loaded gbm 2.2.2

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(ISLR)

data(Caravan)


standardized_data <- scale(Caravan[, -86])
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)


train <- Caravan_scaled[1:1000, ]
test <- Caravan_scaled[-(1:1000), ]


train$Purchase <- as.numeric(train$Purchase == "Yes")
test$Purchase <- as.numeric(test$Purchase == "Yes")


shrinkages <- c(0.01, 0.05, 0.1, 0.2)
test_errors <- c()

for (shrink in shrinkages) {
  cat("Fitting boosting model with shrinkage =", shrink, "\n")
  
  boost_model <- gbm(Purchase ~ ., 
                     data = train, 
                     distribution = "bernoulli", 
                     n.trees = 1000, 
                     shrinkage = shrink, 
                     interaction.depth = 4, 
                     verbose = FALSE)
  

  probs <- predict(boost_model, newdata = test, n.trees = 1000, type = "response")
  

  preds <- ifelse(probs > 0.5, 1, 0)
  
 
  error <- mean(preds != test$Purchase)
  test_errors <- c(test_errors, error)
  
  cat("Test error:", round(error, 4), "\n\n")
}

## Fitting boosting model with shrinkage = 0.01

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.

## Test error: 0.0645 
## 
## Fitting boosting model with shrinkage = 0.05

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.

## Test error: 0.0742 
## 
## Fitting boosting model with shrinkage = 0.1

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.

## Test error: 0.0794 
## 
## Fitting boosting model with shrinkage = 0.2

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.

## Test error: 0.0825

results <- data.frame(Shrinkage = shrinkages, Test_Error = test_errors)
print(results)

##   Shrinkage Test_Error
## 1      0.01 0.06449606
## 2      0.05 0.07424305
## 3      0.10 0.07942762
## 4      0.20 0.08253837

best_lambda <- shrinkages[which.min(test_errors)]
cat("Best shrinkage:", best_lambda, "\n")

## Best shrinkage: 0.01

best_model <- gbm(Purchase ~ ., data = train,
                  distribution = "bernoulli",
                  n.trees = 1000,
                  shrinkage = best_lambda,
                  interaction.depth = 4,
                  verbose = FALSE)

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.

## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.

summary(best_model, n.trees = 1000)

##               var    rel.inf
## PPERSAUT PPERSAUT 7.26368538
## MGODGE     MGODGE 5.16171025
## MOPLHOOG MOPLHOOG 4.68595215
## PBRAND     PBRAND 4.50508652
## MOSTYPE   MOSTYPE 4.39735198
## MBERMIDD MBERMIDD 3.91818286
## MKOOPKLA MKOOPKLA 3.91508934
## MINK3045 MINK3045 3.25852700
## MGODPR     MGODPR 3.17896932
## MSKC         MSKC 2.89856400
## MAUT2       MAUT2 2.36710022
## MINK7512 MINK7512 2.24616152
## MSKB1       MSKB1 2.23731161
## MBERARBG MBERARBG 2.18149332
## PWAPART   PWAPART 2.14061200
## MSKA         MSKA 2.10880965
## MRELGE     MRELGE 2.02042964
## MGODOV     MGODOV 1.93005099
## MOPLMIDD MOPLMIDD 1.92086347
## MAUT1       MAUT1 1.91666048
## MRELOV     MRELOV 1.88967606
## MHHUUR     MHHUUR 1.87035639
## MINKM30   MINKM30 1.84160096
## MFGEKIND MFGEKIND 1.77985440
## MBERHOOG MBERHOOG 1.77676408
## MBERARBO MBERARBO 1.74285122
## MFWEKIND MFWEKIND 1.67591089
## MINKGEM   MINKGEM 1.67461550
## MZFONDS   MZFONDS 1.51129299
## MINK4575 MINK4575 1.44330649
## MFALLEEN MFALLEEN 1.41094267
## MGEMLEEF MGEMLEEF 1.38612876
## MSKB2       MSKB2 1.35936970
## MAUT0       MAUT0 1.30520507
## ABRAND     ABRAND 1.26017789
## MZPART     MZPART 1.23964552
## MSKD         MSKD 1.19974528
## MRELSA     MRELSA 1.17071856
## MGODRK     MGODRK 1.15793001
## MHKOOP     MHKOOP 1.03197291
## MGEMOMV   MGEMOMV 0.95969075
## MOSHOOFD MOSHOOFD 0.89115163
## APERSAUT APERSAUT 0.85479547
## MOPLLAAG MOPLLAAG 0.77697408
## MBERZELF MBERZELF 0.68816771
## PMOTSCO   PMOTSCO 0.53458414
## PLEVEN     PLEVEN 0.37460968
## PBYSTAND PBYSTAND 0.34020481
## MBERBOER MBERBOER 0.25128182
## MINK123M MINK123M 0.21052472
## MAANTHUI MAANTHUI 0.09801030
## ALEVEN     ALEVEN 0.03932786
## PWABEDR   PWABEDR 0.00000000
## PWALAND   PWALAND 0.00000000
## PBESAUT   PBESAUT 0.00000000
## PVRAAUT   PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT     PWERKT 0.00000000
## PBROM       PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG   PGEZONG 0.00000000
## PWAOREG   PWAOREG 0.00000000
## PZEILPL   PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS     PFIETS 0.00000000
## PINBOED   PINBOED 0.00000000
## AWAPART   AWAPART 0.00000000
## AWABEDR   AWABEDR 0.00000000
## AWALAND   AWALAND 0.00000000
## ABESAUT   ABESAUT 0.00000000
## AMOTSCO   AMOTSCO 0.00000000
## AVRAAUT   AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT     AWERKT 0.00000000
## ABROM       ABROM 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG   AGEZONG 0.00000000
## AWAOREG   AWAOREG 0.00000000
## AZEILPL   AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS     AFIETS 0.00000000
## AINBOED   AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000

The best performing shrinkage value is λ = 0.01, giving a test error of 6.45%, meaning the model correctly predicted ~93.5% of the test observations.
PPERSAUT (Car ownership) is the most influential predictor of whether a person buys caravan insurance.
Demographic and product usage variables like MGODGE (household composition), PBRAND, and MOPLHOOG are also highly influential.

(c)

# Use the best model already fitted with shrinkage = 0.01
library(caret)

## Loading required package: lattice

boost_probs <- predict(best_model, newdata = test, n.trees = 1000, type = "response")

boost_preds_20 <- ifelse(boost_probs > 0.2, 1, 0)


actual <- test$Purchase


conf_matrix <- table(Predicted = boost_preds_20, Actual = actual)
print(conf_matrix)

##          Actual
## Predicted    0    1
##         0 4337  258
##         1  196   31

precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
cat("Precision (purchase rate among predicted purchasers):", round(precision, 4), "\n")

## Precision (purchase rate among predicted purchasers): 0.1366

library(ISLR)
library(class)


data(Caravan)


standardized_data <- scale(Caravan[, -86])  # Exclude response
Caravan_scaled <- data.frame(standardized_data, Purchase = Caravan$Purchase)


x_train <- Caravan_scaled[1:1000, -86]
x_test <- Caravan_scaled[-(1:1000), -86]
y_train <- Caravan_scaled[1:1000, "Purchase"]
y_test <- Caravan_scaled[-(1:1000), "Purchase"]


ks <- c(1, 3, 5, 10)
knn_results <- data.frame(k = ks, Test_Error = NA, Precision = NA)

for (i in 1:length(ks)) {
  k_val <- ks[i]
  

  knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = k_val)
  

  error <- mean(knn_pred != y_test)
  

  cm <- table(Predicted = knn_pred, Actual = y_test)
  

  if ("Yes" %in% rownames(cm)) {
    precision <- cm["Yes", "Yes"] / sum(cm["Yes", ])
  } else {
    precision <- 0
  }
  
 
  knn_results[i, "Test_Error"] <- error
  knn_results[i, "Precision"] <- precision
}

print(knn_results)

##    k Test_Error Precision
## 1  1 0.11115720 0.1152648
## 2  3 0.07445044 0.2083333
## 3  5 0.06345915 0.2702703
## 4 10 0.06034840 0.0000000

Precision (Purchase rate among predicted purchasers):

Precision= True Positives / Total Predicted Yes = 31/(196+31) =0.1366 or 13.66%

This means that only 13.66% of predicted buyers actually purchased caravan insurance.

KNN (k = 5) performed the best overall with: - Lowest test error (6.35%) - Highest precision (27.03%)

Using the boosting model with a 20% probability threshold, 13.66% of people predicted to purchase actually did so. This was outperformed by KNN with k = 5, which achieved a higher precision of 27.03% and a slightly lower test error (6.35%). This indicates that KNN, at the right value of k, was more selective and effective at identifying actual purchasers than boosting.