Lab 9

7.

library(MASS)       # For Boston dataset
library(randomForest)  # For random forest models
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)    # For visualization
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Set seed for reproducibility
set.seed(123)
# Load Boston dataset
data(Boston)
# Split data into training and testing sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Boston), 0.7*nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]

# Define ranges for mtry and ntree
mtry_values <- 1:13  # Boston dataset has 13 predictors
ntree_values <- c(25, 50, 100, 200, 300, 400, 500)
# Create a data frame to store results
results <- data.frame()

# Run random forests with different combinations of mtry and ntree
for (m in mtry_values) {
  for (n in ntree_values) {
    # Fit random forest model
    rf_model <- randomForest(medv ~ ., 
                            data = train_data,
                            mtry = m,
                            ntree = n,
                            importance = TRUE)
    
    # Make predictions on test set
    predictions <- predict(rf_model, test_data)
    
    # Calculate test MSE
    test_mse <- mean((predictions - test_data$medv)^2)
    
    # Store results
    results <- rbind(results, data.frame(mtry = m, ntree = n, test_mse = test_mse))
  }
  # Print progress
  cat("Completed mtry =", m, "\n")
}
## Completed mtry = 1 
## Completed mtry = 2 
## Completed mtry = 3 
## Completed mtry = 4 
## Completed mtry = 5 
## Completed mtry = 6 
## Completed mtry = 7 
## Completed mtry = 8 
## Completed mtry = 9 
## Completed mtry = 10 
## Completed mtry = 11 
## Completed mtry = 12 
## Completed mtry = 13
# Create plot similar to Figure 8.10
ggplot(results, aes(x = mtry, y = test_mse, color = factor(ntree))) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1", name = "Number of Trees") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered at Each Split (mtry)",
       y = "Test Mean Squared Error") +
  theme_minimal() +
  theme(legend.position = "right")

# Alternative visualization: heatmap of test MSE
ggplot(results, aes(x = factor(mtry), y = factor(ntree), fill = test_mse)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Test MSE") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered (mtry)",
       y = "Number of Trees (ntree)") +
  theme_minimal()

# Find the combination with minimum test MSE
best_result <- results[which.min(results$test_mse), ]
cat("Best combination: mtry =", best_result$mtry, 
    ", ntree =", best_result$ntree, 
    ", Test MSE =", round(best_result$test_mse, 4), "\n")
## Best combination: mtry = 4 , ntree = 400 , Test MSE = 10.8984
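As a sanity check on the selected combination, the forest can be refit once at the best (mtry, ntree) pair and its out-of-bag error compared with the held-out test MSE. This is a minimal sketch reusing best_result, train_data, and test_data from above (final_rf, oob_mse, and final_test_mse are new names introduced here).

# Refit at the best combination and compare OOB error with the test MSE
final_rf <- randomForest(medv ~ ., data = train_data,
                         mtry = best_result$mtry,
                         ntree = best_result$ntree)
oob_mse <- final_rf$mse[final_rf$ntree]   # OOB MSE after the last tree is grown
final_test_mse <- mean((predict(final_rf, test_data) - test_data$medv)^2)
cat("OOB MSE:", round(oob_mse, 4), "| Test MSE:", round(final_test_mse, 4), "\n")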

8.

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(randomForest)
library(BART)
## Warning: package 'BART' was built under R version 4.4.3
## Loading required package: nlme
## Loading required package: survival
# Load the Carseats dataset
data(Carseats)

# Set seed for reproducibility
set.seed(123)

a. Splitting the Data into Training and Test Sets

# Split the data into training and test sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_data <- Carseats[train_indices, ]
test_data <- Carseats[-train_indices, ]

# Output the dimensions of the training and test sets
dim(train_data)
## [1] 280  11
dim(test_data)
## [1] 120  11

b. Fitting a Regression Tree

# Fit a regression tree to the training set
reg_tree <- rpart(Sales ~ ., data = train_data, method = "anova")

# Plot the tree
rpart.plot(reg_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Regression Tree for Carseats Sales")

# Make predictions on the test set
predictions <- predict(reg_tree, test_data)

# Calculate test MSE
test_mse <- mean((predictions - test_data$Sales)^2)
print(paste("Test MSE:", round(test_mse, 4)))
## [1] "Test MSE: 3.7844"
# Print a summary of the tree
printcp(reg_tree)
## 
## Regression tree:
## rpart(formula = Sales ~ ., data = train_data, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Advertising Age         CompPrice   Price       ShelveLoc  
## 
## Root node error: 2227.2/280 = 7.9544
## 
## n= 280 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.254622      0   1.00000 1.00742 0.084722
## 2  0.092152      1   0.74538 0.76140 0.059011
## 3  0.070902      2   0.65323 0.73872 0.058054
## 4  0.043245      3   0.58232 0.63047 0.048350
## 5  0.036049      4   0.53908 0.62552 0.050904
## 6  0.032271      5   0.50303 0.63451 0.052964
## 7  0.024286      7   0.43849 0.59448 0.048571
## 8  0.017482      8   0.41420 0.59106 0.048463
## 9  0.015915      9   0.39672 0.60713 0.052976
## 10 0.015626     10   0.38080 0.61605 0.053264
## 11 0.014133     11   0.36518 0.60544 0.052892
## 12 0.013544     12   0.35104 0.59908 0.052783
## 13 0.012653     14   0.32396 0.59610 0.052405
## 14 0.010461     15   0.31130 0.58343 0.051559
## 15 0.010000     16   0.30084 0.58139 0.051643
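The xerror and xstd columns above also support the one-standard-error rule: instead of the cp with the smallest cross-validated error, pick the simplest subtree whose xerror is within one xstd of that minimum. A brief sketch against reg_tree$cptable (cp_1se and pruned_1se are new names introduced here):

# 1-SE rule: simplest subtree whose xerror is within one SE of the minimum
cp_tab <- reg_tree$cptable
min_row <- which.min(cp_tab[, "xerror"])
se_threshold <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se <- cp_tab[min(which(cp_tab[, "xerror"] <= se_threshold)), "CP"]
pruned_1se <- prune(reg_tree, cp = cp_1se)
cat("cp selected by the 1-SE rule:", round(cp_1se, 5), "\n")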

c. Cross-Validation for Optimal Tree Complexity

# Cross-validation to determine optimal complexity parameter
cv_tree <- rpart(Sales ~ ., data = train_data, method = "anova",
                 control = rpart.control(cp = 0.001, xval = 10))

# Plot the cross-validation results
plotcp(cv_tree)

# Find the optimal cp value
optimal_cp <- cv_tree$cptable[which.min(cv_tree$cptable[,"xerror"]), "CP"]
print(paste("Optimal complexity parameter:", optimal_cp))
## [1] "Optimal complexity parameter: 0.001"
# Prune the tree using the optimal cp
pruned_tree <- prune(cv_tree, cp = optimal_cp)

# Plot the pruned tree
rpart.plot(pruned_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Pruned Regression Tree for Carseats Sales")

# Make predictions with the pruned tree
pruned_predictions <- predict(pruned_tree, test_data)

# Calculate test MSE for the pruned tree
pruned_test_mse <- mean((pruned_predictions - test_data$Sales)^2)
print(paste("Pruned Tree Test MSE:", round(pruned_test_mse, 4)))
## [1] "Pruned Tree Test MSE: 3.5192"

d. Bagging Approach

# Fit a bagging model (Random Forest with m = p)
bagging_model <- randomForest(Sales ~ ., data = train_data,
                             mtry = ncol(train_data) - 1, # All predictors
                             importance = TRUE,
                             ntree = 500)

# Make predictions on the test set
bagging_predictions <- predict(bagging_model, test_data)

# Calculate test MSE for the bagging model
bagging_test_mse <- mean((bagging_predictions - test_data$Sales)^2)
print(paste("Bagging Test MSE:", round(bagging_test_mse, 4)))
## [1] "Bagging Test MSE: 2.2414"
# Variable importance
importance(bagging_model)
##                %IncMSE IncNodePurity
## CompPrice   34.6929966    263.733263
## Income       9.4304377    113.884814
## Advertising 19.7679196    145.470012
## Population  -2.0798516     64.696448
## Price       66.7489094    632.515073
## ShelveLoc   72.3378601    690.312936
## Age         18.4365623    187.336470
## Education    3.4295024     64.064236
## Urban        1.4639584      9.568371
## US           0.8511541      7.816334
varImpPlot(bagging_model, main = "Variable Importance from Bagging")

e. Random Forests

# Try different values of mtry
mtry_values <- 1:9  # Carseats has 10 predictors; bagging above already used all 10 (mtry = p)
rf_results <- data.frame(mtry = integer(), mse = numeric())

for (m in mtry_values) {
  rf_model <- randomForest(Sales ~ ., data = train_data,
                          mtry = m,
                          importance = TRUE,
                          ntree = 500)
  
  # Make predictions
  rf_predictions <- predict(rf_model, test_data)
  
  # Calculate test MSE
  rf_test_mse <- mean((rf_predictions - test_data$Sales)^2)
  
  # Store results
  rf_results <- rbind(rf_results, data.frame(mtry = m, mse = rf_test_mse))
}
# Find optimal mtry
optimal_mtry <- rf_results$mtry[which.min(rf_results$mse)]
print(paste("Optimal mtry:", optimal_mtry))
## [1] "Optimal mtry: 8"
print(paste("Minimum Test MSE:", round(min(rf_results$mse), 4)))
## [1] "Minimum Test MSE: 2.2227"
# Plot mtry vs MSE
plot(rf_results$mtry, rf_results$mse, type = "b", 
     xlab = "Number of Variables at Each Split (mtry)",
     ylab = "Test MSE", main = "Random Forest: mtry vs Test MSE")

# Fit the random forest with optimal mtry
optimal_rf <- randomForest(Sales ~ ., data = train_data,
                          mtry = optimal_mtry,
                          importance = TRUE,
                          ntree = 500)

# Variable importance
importance(optimal_rf)
##               %IncMSE IncNodePurity
## CompPrice   29.837365    253.609378
## Income       8.789126    122.113482
## Advertising 21.677064    162.568311
## Population  -1.445769     69.865277
## Price       60.172243    588.860323
## ShelveLoc   65.302222    676.736537
## Age         19.149264    209.696319
## Education    2.726531     70.367105
## Urban        1.071614      9.385424
## US           2.388123      9.628030
varImpPlot(optimal_rf, main = "Variable Importance from Random Forest")

f. Bayesian Additive Regression Trees (BART)

# Prepare data for BART
x_train <- train_data[, names(train_data) != "Sales"]
y_train <- train_data$Sales
x_test <- test_data[, names(test_data) != "Sales"]
y_test <- test_data$Sales

# Convert categorical variables to factors if they aren't already
for (i in 1:ncol(x_train)) {
  if (is.character(x_train[,i])) {
    x_train[,i] <- as.factor(x_train[,i])
    x_test[,i] <- as.factor(x_test[,i])
  }
}
# Fit BART model
bart_model <- gbart(x_train, y_train, x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 280, 14, 120
## y1,yn: 3.222214, -2.907786
## x1,x[n*p]: 104.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 67 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.198596,7.43779
## *****sigma: 1.009719
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 5s
## trcnt,tecnt: 1000,1000
# Extract predictions (BART returns posterior samples, take the mean)
bart_predictions <- bart_model$yhat.test.mean

# Calculate test MSE for BART
bart_test_mse <- mean((bart_predictions - y_test)^2)
print(paste("BART Test MSE:", round(bart_test_mse, 4)))
## [1] "BART Test MSE: 1.5911"
# Variable importance in BART: varcount.mean gives the average number of times each
# predictor (after dummy expansion of factor levels) is used in a splitting rule
bart_var_importance <- bart_model$varcount.mean
bart_var_importance <- sort(bart_var_importance, decreasing = TRUE)
print("BART Variable Importance:")
## [1] "BART Variable Importance:"
print(bart_var_importance)
##       Price   CompPrice   ShelveLoc        <NA>       Urban        <NA> 
##      24.802      18.387      18.361      17.020      16.623      16.550 
##         Age      Income        <NA>        <NA>   Education  Population 
##      16.509      16.386      15.989      15.603      15.597      15.592 
## Advertising          US 
##      15.338      14.832
# Plot variable importance
barplot(bart_var_importance, 
        main = "Variable Importance from BART",
        xlab = "Variables", ylab = "Average Split Count per Posterior Draw",
        las = 2, cex.names = 0.7)
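To wrap up question 8, the test MSEs computed in parts (b) through (f) can be collected into a single table for comparison; a small sketch reusing the objects already in the workspace:

# Gather the test MSEs from parts (b)-(f) into one comparison table
comparison <- data.frame(
  Model = c("Regression tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  Test_MSE = c(test_mse, pruned_test_mse, bagging_test_mse,
               min(rf_results$mse), bart_test_mse)
)
comparison[order(comparison$Test_MSE), ]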

"BART Test MSE: 1.6107"

11.

library(ISLR)
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(class)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

a. Creating Training and Test Sets

# Load the Caravan dataset
data(Caravan)

# Convert Purchase to a binary factor (0/1)
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)

# Create training and test sets
train_data <- Caravan[1:1000, ]
test_data <- Caravan[1001:nrow(Caravan), ]

# Check dimensions
dim(train_data)
## [1] 1000   86
dim(test_data)
## [1] 4822   86
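Purchases are rare in the Caravan data, which motivates the 20% probability threshold used below; a quick check of the class balance in each split:

# Fraction of customers who purchase insurance in each split (roughly 6% overall)
mean(train_data$Purchase)
mean(test_data$Purchase)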

b. Fitting Boosting Models with Different Shrinkage Values

# Define shrinkage values to try
shrinkage_values <- c(0.01, 0.05, 0.1, 0.2)

# Store models and their performance
boost_models <- list()
test_errors <- numeric(length(shrinkage_values))
importance_lists <- list()
# Fit boosting models with different shrinkage values
for (i in 1:length(shrinkage_values)) {
  lambda <- shrinkage_values[i]
  
  # Fit boosting model
  boost_model <- gbm(Purchase ~ ., 
                    data = train_data,
                    distribution = "bernoulli",
                    n.trees = 1000,
                    interaction.depth = 4,
                    shrinkage = lambda,
                    verbose = FALSE)
  
  # Store the model
  boost_models[[i]] <- boost_model
  
  # Make predictions on test set
  pred_probs <- predict(boost_model, 
                       newdata = test_data, 
                       n.trees = 1000, 
                       type = "response")
  
  # Calculate test error (misclassification rate with 0.2 threshold)
  pred_class <- ifelse(pred_probs > 0.2, 1, 0)
  test_errors[i] <- mean(pred_class != test_data$Purchase)
  
  # Get variable importance
  importance <- summary(boost_model, plotit = FALSE)
  importance_lists[[i]] <- importance
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
# Find the best shrinkage value
best_index <- which.min(test_errors)
best_lambda <- shrinkage_values[best_index]
best_model <- boost_models[[best_index]]

# Print results
cat("Best shrinkage value:", best_lambda, "\n")
## Best shrinkage value: 0.01
cat("Test error rate:", test_errors[best_index], "\n\n")
## Test error rate: 0.09083368
# Plot variable importance for the best model
print("Most important predictors:")
## [1] "Most important predictors:"
importance <- importance_lists[[best_index]]
head(importance, 10)
##               var  rel.inf
## PPERSAUT PPERSAUT 7.919325
## MGODGE     MGODGE 5.113786
## MOSTYPE   MOSTYPE 4.623106
## MKOOPKLA MKOOPKLA 4.570109
## MOPLHOOG MOPLHOOG 4.131745
## PBRAND     PBRAND 4.008690
## MGODPR     MGODPR 3.446960
## MBERMIDD MBERMIDD 3.355803
## MINK3045 MINK3045 3.059326
## MAUT2       MAUT2 2.939429
# Plot variable importance
par(mar = c(10, 4, 4, 2) + 0.1)
barplot(importance$rel.inf[1:10], 
        names.arg = importance$var[1:10], 
        las = 2, 
        main = "Top 10 Most Important Variables",
        cex.names = 0.7)
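Rather than fixing n.trees = 1000 for every model, gbm can also select the number of boosting iterations by cross-validation. A hedged sketch refitting only the best shrinkage value with cv.folds = 5 (slower to run; cv_boost, best_iter, and cv_probs are new names introduced here):

# Optional: choose the number of trees by 5-fold CV at the best shrinkage value
cv_boost <- gbm(Purchase ~ ., data = train_data,
                distribution = "bernoulli",
                n.trees = 1000, interaction.depth = 4,
                shrinkage = best_lambda, cv.folds = 5, verbose = FALSE)
best_iter <- gbm.perf(cv_boost, method = "cv", plot.it = FALSE)
cv_probs <- predict(cv_boost, newdata = test_data, n.trees = best_iter, type = "response")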

c. Evaluating the Best Boosting Model and Comparing with KNN

# Use the best boosting model to predict on test data
best_boost_probs <- predict(best_model, 
                           newdata = test_data, 
                           n.trees = 1000, 
                           type = "response")

# Create predictions using 0.2 threshold
best_boost_preds <- ifelse(best_boost_probs > 0.2, 1, 0)
# Create confusion matrix
boost_conf_matrix <- table(Predicted = best_boost_preds, 
                          Actual = test_data$Purchase)
print("Boosting Confusion Matrix:")
## [1] "Boosting Confusion Matrix:"
print(boost_conf_matrix)
##          Actual
## Predicted    0    1
##         0 4353  258
##         1  180   31
# Calculate precision (fraction of predicted purchases that are actual purchases)
boost_precision <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[2,])
cat("Boosting Precision (fraction of predicted purchases that are actual):", 
    round(boost_precision, 4), "\n")
## Boosting Precision (fraction of predicted purchases that are actual): 0.1469
# Calculate other metrics
boost_recall <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[,2])
boost_accuracy <- sum(diag(boost_conf_matrix)) / sum(boost_conf_matrix)

cat("Boosting Recall:", round(boost_recall, 4), "\n")
## Boosting Recall: 0.1073
cat("Boosting Accuracy:", round(boost_accuracy, 4), "\n\n")
## Boosting Accuracy: 0.9092
# Now implement KNN with different k values
# First, standardize the data
standardized_train <- scale(train_data[, -which(names(train_data) == "Purchase")])
standardized_test <- scale(test_data[, -which(names(test_data) == "Purchase")],
                          center = attr(standardized_train, "scaled:center"),
                          scale = attr(standardized_train, "scaled:scale"))

# Try different values of k
k_values <- c(1, 3, 5, 7, 9, 11, 13, 15, 17, 19)
knn_test_errors <- numeric(length(k_values))
knn_precision <- numeric(length(k_values))
# Caravan has no missing values, so no rows need to be dropped; KNN is fit on the
# standardized predictors computed above (Purchase is excluded from the feature matrix)
for (i in 1:length(k_values)) {
  k <- k_values[i]
  
  # Fit KNN model on the standardized predictors
  knn_pred_probs <- knn(train = standardized_train,
                       test = standardized_test,
                       cl = train_data$Purchase,
                       k = k,
                       prob = TRUE)
  
  # Convert to binary predictions
  knn_preds <- as.numeric(as.character(knn_pred_probs))
  
  # Calculate test error
  knn_test_errors[i] <- mean(knn_preds != test_data$Purchase)
  
  # Create confusion matrix
  knn_conf_matrix <- table(Predicted = knn_preds, Actual = test_data$Purchase)
  
  # Calculate precision (if possible)
  if (sum(knn_preds) > 0) {
    knn_precision[i] <- sum(knn_preds & test_data$Purchase) / sum(knn_preds)
  } else {
    knn_precision[i] <- NA
  }
}
# Find best k
best_k_index <- which.min(knn_test_errors)
best_k <- k_values[best_k_index]

# Print KNN results
cat("Best k value:", best_k, "\n")
## Best k value: 11
cat("KNN Test error rate:", knn_test_errors[best_k_index], "\n")
## KNN Test error rate: 0.05993364
cat("KNN Precision:", knn_precision[best_k_index], "\n\n")
## KNN Precision: 0.5
# Final comparison
cat("Comparison of Best Models:\n")
## Comparison of Best Models:
cat("Boosting (shrinkage =", best_lambda, ") - Precision:", round(boost_precision, 4), "\n")
## Boosting (shrinkage = 0.01 ) - Precision: 0.1469
cat("KNN (k =", best_k, ") - Precision:", round(knn_precision[best_k_index], 4), "\n")
## KNN (k = 11 ) - Precision: 0.5
# Plot comparison of error rates. Note: the x-axis is the shrinkage value for
# boosting; the KNN points are placed at k/100 purely so both curves share one axis
plot(shrinkage_values, test_errors, type = "b", col = "blue",
     xlab = "Shrinkage (Boosting) or k/100 (KNN)", ylab = "Test Error Rate",
     main = "Comparison of Model Performance",
     ylim = c(min(c(test_errors, knn_test_errors)), 
              max(c(test_errors, knn_test_errors))))
points(k_values/100, knn_test_errors, type = "b", col = "red")
legend("topright", legend = c("Boosting", "KNN"), 
       col = c("blue", "red"), lty = 1, pch = 1)

Interpretation:

  • KNN (red line) attains a lower test error rate (around 0.06) than boosting (blue line, around 0.09) at the 20% threshold; keep in mind that the two curves share the x-axis only for display, with the KNN points plotted at k/100.

  • For KNN, the error drops sharply as k increases from 1 to about 9-11 and then levels off, with the minimum at k = 11.

  • Boosting's error rate changes little across the shrinkage values tried; the smallest value, 0.01, performs best.

  • Because only about 6% of customers purchase insurance, error rate alone is not very informative here; judged by precision at the 20% threshold, KNN (0.50) also outperforms boosting (0.147) on this split.
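Since raw error rate is dominated by the majority "No" class, precision at the 20% threshold is the fairer head-to-head comparison; a small sketch using the values computed above:

# Side-by-side precision at the 0.2 threshold (values computed earlier)
precisions <- c(Boosting = boost_precision, KNN = knn_precision[best_k_index])
barplot(precisions,
        ylab = "Precision (predicted purchasers who actually purchase)",
        main = "Precision at the 20% Threshold", ylim = c(0, 1))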