Lab 9

7.

library(MASS)       # For Boston dataset
library(randomForest)  # For random forest models
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(ggplot2)    # For visualization
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Set seed for reproducibility
set.seed(123)
# Load Boston dataset
data(Boston)
# Split data into training and testing sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Boston), 0.7*nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]

# Define ranges for mtry and ntree
mtry_values <- 1:13  # Boston dataset has 13 predictors
ntree_values <- c(25, 50, 100, 200, 300, 400, 500)
# Create a data frame to store results
results <- data.frame()

# Run random forests with different combinations of mtry and ntree
for (m in mtry_values) {
  for (n in ntree_values) {
    # Fit random forest model
    rf_model <- randomForest(medv ~ ., 
                            data = train_data,
                            mtry = m,
                            ntree = n,
                            importance = TRUE)
    
    # Make predictions on test set
    predictions <- predict(rf_model, test_data)
    
    # Calculate test MSE
    test_mse <- mean((predictions - test_data$medv)^2)
    
    # Store results
    results <- rbind(results, data.frame(mtry = m, ntree = n, test_mse = test_mse))
  }
  # Print progress
  cat("Completed mtry =", m, "\n")
}
## Completed mtry = 1 
## Completed mtry = 2 
## Completed mtry = 3 
## Completed mtry = 4 
## Completed mtry = 5 
## Completed mtry = 6 
## Completed mtry = 7 
## Completed mtry = 8 
## Completed mtry = 9 
## Completed mtry = 10 
## Completed mtry = 11 
## Completed mtry = 12 
## Completed mtry = 13
# Create plot similar to Figure 8.10
ggplot(results, aes(x = mtry, y = test_mse, color = factor(ntree))) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1", name = "Number of Trees") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered at Each Split (mtry)",
       y = "Test Mean Squared Error") +
  theme_minimal() +
  theme(legend.position = "right")

# Alternative visualization: heatmap of test MSE
ggplot(results, aes(x = factor(mtry), y = factor(ntree), fill = test_mse)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Test MSE") +
  labs(title = "Test MSE for Random Forests on Boston Housing Data",
       x = "Number of Variables Considered (mtry)",
       y = "Number of Trees (ntree)") +
  theme_minimal()

# Find the combination with minimum test MSE
best_result <- results[which.min(results$test_mse), ]
cat("Best combination: mtry =", best_result$mtry, 
    ", ntree =", best_result$ntree, 
    ", Test MSE =", round(best_result$test_mse, 4), "\n")
## Best combination: mtry = 4 , ntree = 400 , Test MSE = 10.8984
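As a sanity check on the selected combination, the forest can be refit once at the best (mtry, ntree) pair and its out-of-bag error compared with the held-out test MSE. This is a minimal sketch reusing best_result, train_data, and test_data from above (final_rf, oob_mse, and final_test_mse are new names introduced here).

# Refit at the best combination and compare OOB error with the test MSE
final_rf <- randomForest(medv ~ ., data = train_data,
                         mtry = best_result$mtry,
                         ntree = best_result$ntree)
oob_mse <- final_rf$mse[final_rf$ntree]   # OOB MSE after the last tree is grown
final_test_mse <- mean((predict(final_rf, test_data) - test_data$medv)^2)
cat("OOB MSE:", round(oob_mse, 4), "| Test MSE:", round(final_test_mse, 4), "\n")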

8.

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(randomForest)
library(BART)
## Warning: package 'BART' was built under R version 4.4.3
## Loading required package: nlme
## Loading required package: survival
# Load the Carseats dataset
data(Carseats)

# Set seed for reproducibility
set.seed(123)

a. Splitting the Data into Training and Test Sets

# Split the data into training and test sets (70% training, 30% testing)
train_indices <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_data <- Carseats[train_indices, ]
test_data <- Carseats[-train_indices, ]

# Output the dimensions of the training and test sets
dim(train_data)
## [1] 280  11
dim(test_data)
## [1] 120  11

b. Fitting a Regression Tree

# Fit a regression tree to the training set
reg_tree <- rpart(Sales ~ ., data = train_data, method = "anova")

# Plot the tree
rpart.plot(reg_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Regression Tree for Carseats Sales")

# Make predictions on the test set
predictions <- predict(reg_tree, test_data)

# Calculate test MSE
test_mse <- mean((predictions - test_data$Sales)^2)
print(paste("Test MSE:", round(test_mse, 4)))
## [1] "Test MSE: 3.7844"
# Print a summary of the tree
printcp(reg_tree)
## 
## Regression tree:
## rpart(formula = Sales ~ ., data = train_data, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Advertising Age         CompPrice   Price       ShelveLoc  
## 
## Root node error: 2227.2/280 = 7.9544
## 
## n= 280 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.254622      0   1.00000 1.00742 0.084722
## 2  0.092152      1   0.74538 0.76140 0.059011
## 3  0.070902      2   0.65323 0.73872 0.058054
## 4  0.043245      3   0.58232 0.63047 0.048350
## 5  0.036049      4   0.53908 0.62552 0.050904
## 6  0.032271      5   0.50303 0.63451 0.052964
## 7  0.024286      7   0.43849 0.59448 0.048571
## 8  0.017482      8   0.41420 0.59106 0.048463
## 9  0.015915      9   0.39672 0.60713 0.052976
## 10 0.015626     10   0.38080 0.61605 0.053264
## 11 0.014133     11   0.36518 0.60544 0.052892
## 12 0.013544     12   0.35104 0.59908 0.052783
## 13 0.012653     14   0.32396 0.59610 0.052405
## 14 0.010461     15   0.31130 0.58343 0.051559
## 15 0.010000     16   0.30084 0.58139 0.051643
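The xerror and xstd columns above also support the one-standard-error rule: instead of the cp with the smallest cross-validated error, pick the simplest subtree whose xerror is within one xstd of that minimum. A brief sketch against reg_tree$cptable (cp_1se and pruned_1se are new names introduced here):

# 1-SE rule: simplest subtree whose xerror is within one SE of the minimum
cp_tab <- reg_tree$cptable
min_row <- which.min(cp_tab[, "xerror"])
se_threshold <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se <- cp_tab[min(which(cp_tab[, "xerror"] <= se_threshold)), "CP"]
pruned_1se <- prune(reg_tree, cp = cp_1se)
cat("cp selected by the 1-SE rule:", round(cp_1se, 5), "\n")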

c. Cross-Validation for Optimal Tree Complexity

# Cross-validation to determine optimal complexity parameter
cv_tree <- rpart(Sales ~ ., data = train_data, method = "anova",
                 control = rpart.control(cp = 0.001, xval = 10))

# Plot the cross-validation results
plotcp(cv_tree)

# Find the optimal cp value
optimal_cp <- cv_tree$cptable[which.min(cv_tree$cptable[,"xerror"]), "CP"]
print(paste("Optimal complexity parameter:", optimal_cp))
## [1] "Optimal complexity parameter: 0.001"
# Prune the tree using the optimal cp
pruned_tree <- prune(cv_tree, cp = optimal_cp)

# Plot the pruned tree
rpart.plot(pruned_tree, type = 4, extra = 101, fallen.leaves = TRUE,
           main = "Pruned Regression Tree for Carseats Sales")

# Make predictions with the pruned tree
pruned_predictions <- predict(pruned_tree, test_data)

# Calculate test MSE for the pruned tree
pruned_test_mse <- mean((pruned_predictions - test_data$Sales)^2)
print(paste("Pruned Tree Test MSE:", round(pruned_test_mse, 4)))
## [1] "Pruned Tree Test MSE: 3.5192"

d. Bagging Approach

# Fit a bagging model (Random Forest with m = p)
bagging_model <- randomForest(Sales ~ ., data = train_data,
                             mtry = ncol(train_data) - 1, # All predictors
                             importance = TRUE,
                             ntree = 500)

# Make predictions on the test set
bagging_predictions <- predict(bagging_model, test_data)

# Calculate test MSE for the bagging model
bagging_test_mse <- mean((bagging_predictions - test_data$Sales)^2)
print(paste("Bagging Test MSE:", round(bagging_test_mse, 4)))
## [1] "Bagging Test MSE: 2.2414"
# Variable importance
importance(bagging_model)
##                %IncMSE IncNodePurity
## CompPrice   34.6929966    263.733263
## Income       9.4304377    113.884814
## Advertising 19.7679196    145.470012
## Population  -2.0798516     64.696448
## Price       66.7489094    632.515073
## ShelveLoc   72.3378601    690.312936
## Age         18.4365623    187.336470
## Education    3.4295024     64.064236
## Urban        1.4639584      9.568371
## US           0.8511541      7.816334
varImpPlot(bagging_model, main = "Variable Importance from Bagging")

e. Random Forests

# Try different values of mtry
mtry_values <- 1:9  # Carseats has 10 predictors; bagging above already used all 10 (mtry = p)
rf_results <- data.frame(mtry = integer(), mse = numeric())

for (m in mtry_values) {
  rf_model <- randomForest(Sales ~ ., data = train_data,
                          mtry = m,
                          importance = TRUE,
                          ntree = 500)
  
  # Make predictions
  rf_predictions <- predict(rf_model, test_data)
  
  # Calculate test MSE
  rf_test_mse <- mean((rf_predictions - test_data$Sales)^2)
  
  # Store results
  rf_results <- rbind(rf_results, data.frame(mtry = m, mse = rf_test_mse))
}
# Find optimal mtry
optimal_mtry <- rf_results$mtry[which.min(rf_results$mse)]
print(paste("Optimal mtry:", optimal_mtry))
## [1] "Optimal mtry: 8"
print(paste("Minimum Test MSE:", round(min(rf_results$mse), 4)))
## [1] "Minimum Test MSE: 2.2227"
# Plot mtry vs MSE
plot(rf_results$mtry, rf_results$mse, type = "b", 
     xlab = "Number of Variables at Each Split (mtry)",
     ylab = "Test MSE", main = "Random Forest: mtry vs Test MSE")

# Fit the random forest with optimal mtry
optimal_rf <- randomForest(Sales ~ ., data = train_data,
                          mtry = optimal_mtry,
                          importance = TRUE,
                          ntree = 500)

# Variable importance
importance(optimal_rf)
##               %IncMSE IncNodePurity
## CompPrice   29.837365    253.609378
## Income       8.789126    122.113482
## Advertising 21.677064    162.568311
## Population  -1.445769     69.865277
## Price       60.172243    588.860323
## ShelveLoc   65.302222    676.736537
## Age         19.149264    209.696319
## Education    2.726531     70.367105
## Urban        1.071614      9.385424
## US           2.388123      9.628030
varImpPlot(optimal_rf, main = "Variable Importance from Random Forest")

f. Bayesian Additive Regression Trees (BART)

# Prepare data for BART
x_train <- train_data[, names(train_data) != "Sales"]
y_train <- train_data$Sales
x_test <- test_data[, names(test_data) != "Sales"]
y_test <- test_data$Sales

# Convert categorical variables to factors if they aren't already
for (i in 1:ncol(x_train)) {
  if (is.character(x_train[,i])) {
    x_train[,i] <- as.factor(x_train[,i])
    x_test[,i] <- as.factor(x_test[,i])
  }
}
# Fit BART model
bart_model <- gbart(x_train, y_train, x_test)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 280, 14, 120
## y1,yn: 3.222214, -2.907786
## x1,x[n*p]: 104.000000, 1.000000
## xp1,xp[np*p]: 138.000000, 1.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 67 ... 1
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.284787,3,0.198596,7.43779
## *****sigma: 1.009719
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,14,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 5s
## trcnt,tecnt: 1000,1000
# Extract predictions (BART returns posterior samples, take the mean)
bart_predictions <- bart_model$yhat.test.mean

# Calculate test MSE for BART
bart_test_mse <- mean((bart_predictions - y_test)^2)
print(paste("BART Test MSE:", round(bart_test_mse, 4)))
## [1] "BART Test MSE: 1.5911"
# Variable importance in BART: varcount.mean gives the average number of times each
# predictor (after dummy expansion of factor levels) is used in a splitting rule
bart_var_importance <- bart_model$varcount.mean
bart_var_importance <- sort(bart_var_importance, decreasing = TRUE)
print("BART Variable Importance:")
## [1] "BART Variable Importance:"
print(bart_var_importance)
##       Price   CompPrice   ShelveLoc        <NA>       Urban        <NA> 
##      24.802      18.387      18.361      17.020      16.623      16.550 
##         Age      Income        <NA>        <NA>   Education  Population 
##      16.509      16.386      15.989      15.603      15.597      15.592 
## Advertising          US 
##      15.338      14.832
# Plot variable importance
barplot(bart_var_importance, 
        main = "Variable Importance from BART",
        xlab = "Variables", ylab = "Average Split Count per Posterior Draw",
        las = 2, cex.names = 0.7)
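To wrap up question 8, the test MSEs computed in parts (b) through (f) can be collected into a single table for comparison; a small sketch reusing the objects already in the workspace:

# Gather the test MSEs from parts (b)-(f) into one comparison table
comparison <- data.frame(
  Model = c("Regression tree", "Pruned tree", "Bagging", "Random forest", "BART"),
  Test_MSE = c(test_mse, pruned_test_mse, bagging_test_mse,
               min(rf_results$mse), bart_test_mse)
)
comparison[order(comparison$Test_MSE), ]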

"BART Test MSE: 1.6107"

11.

library(ISLR)
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(class)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

a. Creating Training and Test Sets

# Load the Caravan dataset
data(Caravan)

# Convert Purchase to a binary factor (0/1)
Caravan$Purchase <- ifelse(Caravan$Purchase == "Yes", 1, 0)

# Create training and test sets
train_data <- Caravan[1:1000, ]
test_data <- Caravan[1001:nrow(Caravan), ]

# Check dimensions
dim(train_data)
## [1] 1000   86
dim(test_data)
## [1] 4822   86
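Purchases are rare in the Caravan data, which motivates the 20% probability threshold used below; a quick check of the class balance in each split:

# Fraction of customers who purchase insurance in each split (roughly 6% overall)
mean(train_data$Purchase)
mean(test_data$Purchase)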

b. Fitting Boosting Models with Different Shrinkage Values

# Define shrinkage values to try
shrinkage_values <- c(0.01, 0.05, 0.1, 0.2)

# Store models and their performance
boost_models <- list()
test_errors <- numeric(length(shrinkage_values))
importance_lists <- list()
# Fit boosting models with different shrinkage values
for (i in 1:length(shrinkage_values)) {
  lambda <- shrinkage_values[i]
  
  # Fit boosting model
  boost_model <- gbm(Purchase ~ ., 
                    data = train_data,
                    distribution = "bernoulli",
                    n.trees = 1000,
                    interaction.depth = 4,
                    shrinkage = lambda,
                    verbose = FALSE)
  
  # Store the model
  boost_models[[i]] <- boost_model
  
  # Make predictions on test set
  pred_probs <- predict(boost_model, 
                       newdata = test_data, 
                       n.trees = 1000, 
                       type = "response")
  
  # Calculate test error (misclassification rate with 0.2 threshold)
  pred_class <- ifelse(pred_probs > 0.2, 1, 0)
  test_errors[i] <- mean(pred_class != test_data$Purchase)
  
  # Get variable importance
  importance <- summary(boost_model, plotit = FALSE)
  importance_lists[[i]] <- importance
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 71: AVRAAUT has no variation.
# Find the best shrinkage value
best_index <- which.min(test_errors)
best_lambda <- shrinkage_values[best_index]
best_model <- boost_models[[best_index]]

# Print results
cat("Best shrinkage value:", best_lambda, "\n")
## Best shrinkage value: 0.01
cat("Test error rate:", test_errors[best_index], "\n\n")
## Test error rate: 0.09083368
# Plot variable importance for the best model
print("Most important predictors:")
## [1] "Most important predictors:"
importance <- importance_lists[[best_index]]
head(importance, 10)
##               var  rel.inf
## PPERSAUT PPERSAUT 7.919325
## MGODGE     MGODGE 5.113786
## MOSTYPE   MOSTYPE 4.623106
## MKOOPKLA MKOOPKLA 4.570109
## MOPLHOOG MOPLHOOG 4.131745
## PBRAND     PBRAND 4.008690
## MGODPR     MGODPR 3.446960
## MBERMIDD MBERMIDD 3.355803
## MINK3045 MINK3045 3.059326
## MAUT2       MAUT2 2.939429
# Plot variable importance
par(mar = c(10, 4, 4, 2) + 0.1)
barplot(importance$rel.inf[1:10], 
        names.arg = importance$var[1:10], 
        las = 2, 
        main = "Top 10 Most Important Variables",
        cex.names = 0.7)
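Rather than fixing n.trees = 1000 for every model, gbm can also select the number of boosting iterations by cross-validation. A hedged sketch refitting only the best shrinkage value with cv.folds = 5 (slower to run; cv_boost, best_iter, and cv_probs are new names introduced here):

# Optional: choose the number of trees by 5-fold CV at the best shrinkage value
cv_boost <- gbm(Purchase ~ ., data = train_data,
                distribution = "bernoulli",
                n.trees = 1000, interaction.depth = 4,
                shrinkage = best_lambda, cv.folds = 5, verbose = FALSE)
best_iter <- gbm.perf(cv_boost, method = "cv", plot.it = FALSE)
cv_probs <- predict(cv_boost, newdata = test_data, n.trees = best_iter, type = "response")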

c. Evaluating the Best Boosting Model and Comparing with KNN

# Use the best boosting model to predict on test data
best_boost_probs <- predict(best_model, 
                           newdata = test_data, 
                           n.trees = 1000, 
                           type = "response")

# Create predictions using 0.2 threshold
best_boost_preds <- ifelse(best_boost_probs > 0.2, 1, 0)
# Create confusion matrix
boost_conf_matrix <- table(Predicted = best_boost_preds, 
                          Actual = test_data$Purchase)
print("Boosting Confusion Matrix:")
## [1] "Boosting Confusion Matrix:"
print(boost_conf_matrix)
##          Actual
## Predicted    0    1
##         0 4353  258
##         1  180   31
# Calculate precision (fraction of predicted purchases that are actual purchases)
boost_precision <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[2,])
cat("Boosting Precision (fraction of predicted purchases that are actual):", 
    round(boost_precision, 4), "\n")
## Boosting Precision (fraction of predicted purchases that are actual): 0.1469
# Calculate other metrics
boost_recall <- boost_conf_matrix[2,2] / sum(boost_conf_matrix[,2])
boost_accuracy <- sum(diag(boost_conf_matrix)) / sum(boost_conf_matrix)

cat("Boosting Recall:", round(boost_recall, 4), "\n")
## Boosting Recall: 0.1073
cat("Boosting Accuracy:", round(boost_accuracy, 4), "\n\n")
## Boosting Accuracy: 0.9092
# Now implement KNN with different k values
# First, standardize the data
standardized_train <- scale(train_data[, -which(names(train_data) == "Purchase")])
standardized_test <- scale(test_data[, -which(names(test_data) == "Purchase")],
                          center = attr(standardized_train, "scaled:center"),
                          scale = attr(standardized_train, "scaled:scale"))

# Try different values of k
k_values <- c(1, 3, 5, 7, 9, 11, 13, 15, 17, 19)
knn_test_errors <- numeric(length(k_values))
knn_precision <- numeric(length(k_values))
# Caravan has no missing values, so no rows need to be dropped; KNN is fit on the
# standardized predictors computed above (Purchase is excluded from the feature matrix)
for (i in 1:length(k_values)) {
  k <- k_values[i]
  
  # Fit KNN model on the standardized predictors
  knn_pred_probs <- knn(train = standardized_train,
                       test = standardized_test,
                       cl = train_data$Purchase,
                       k = k,
                       prob = TRUE)
  
  # Convert to binary predictions
  knn_preds <- as.numeric(as.character(knn_pred_probs))
  
  # Calculate test error
  knn_test_errors[i] <- mean(knn_preds != test_data$Purchase)
  
  # Create confusion matrix
  knn_conf_matrix <- table(Predicted = knn_preds, Actual = test_data$Purchase)
  
  # Calculate precision (if possible)
  if (sum(knn_preds) > 0) {
    knn_precision[i] <- sum(knn_preds & test_data$Purchase) / sum(knn_preds)
  } else {
    knn_precision[i] <- NA
  }
}
# Find best k
best_k_index <- which.min(knn_test_errors)
best_k <- k_values[best_k_index]

# Print KNN results
cat("Best k value:", best_k, "\n")
## Best k value: 11
cat("KNN Test error rate:", knn_test_errors[best_k_index], "\n")
## KNN Test error rate: 0.05993364
cat("KNN Precision:", knn_precision[best_k_index], "\n\n")
## KNN Precision: 0.5
# Final comparison
cat("Comparison of Best Models:\n")
## Comparison of Best Models:
cat("Boosting (shrinkage =", best_lambda, ") - Precision:", round(boost_precision, 4), "\n")
## Boosting (shrinkage = 0.01 ) - Precision: 0.1469
cat("KNN (k =", best_k, ") - Precision:", round(knn_precision[best_k_index], 4), "\n")
## KNN (k = 11 ) - Precision: 0.5
# Plot comparison of error rates. Note: the x-axis is the shrinkage value for
# boosting; the KNN points are placed at k/100 purely so both curves share one axis
plot(shrinkage_values, test_errors, type = "b", col = "blue",
     xlab = "Shrinkage (Boosting) or k/100 (KNN)", ylab = "Test Error Rate",
     main = "Comparison of Model Performance",
     ylim = c(min(c(test_errors, knn_test_errors)), 
              max(c(test_errors, knn_test_errors))))
points(k_values/100, knn_test_errors, type = "b", col = "red")
legend("topright", legend = c("Boosting", "KNN"), 
       col = c("blue", "red"), lty = 1, pch = 1)

Interpretation:

  • KNN (red line) attains a lower test error rate (around 0.06) than boosting (blue line, around 0.09) at the 20% threshold; keep in mind that the two curves share the x-axis only for display, with the KNN points plotted at k/100.

  • For KNN, the error drops sharply as k increases from 1 to about 9-11 and then levels off, with the minimum at k = 11.

  • Boosting's error rate changes little across the shrinkage values tried; the smallest value, 0.01, performs best.

  • Because only about 6% of customers purchase insurance, error rate alone is not very informative here; judged by precision at the 20% threshold, KNN (0.50) also outperforms boosting (0.147) on this split.
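Since raw error rate is dominated by the majority "No" class, precision at the 20% threshold is the fairer head-to-head comparison; a small sketch using the values computed above:

# Side-by-side precision at the 0.2 threshold (values computed earlier)
precisions <- c(Boosting = boost_precision, KNN = knn_precision[best_k_index])
barplot(precisions,
        ylab = "Precision (predicted purchasers who actually purchase)",
        main = "Precision at the 20% Threshold", ylim = c(0, 1))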