Please hand this result in on Canvas no later than 11:59pm on Thursday, May 8!

Questions

  1. Experimenting with tree-based methods: In this problem, you will use CARTs, bagging, random forests, and boosting in an attempt to accurately predict whether mushrooms are edible or poisonous, based on a large collection of variables describing the mushrooms (cap shape, cap surface, odor, gill spacing, gill size, stalk color, etc.). The data can be found under the Files tab in the folder Exam 2/Final Exam (mushrooms.csv) on Canvas. More information regarding this data (including the variable level designations needed for interpretation) can be found at the following site: https://www.kaggle.com/uciml/mushroom-classification/data
  a. Split the data into a training and test set, each consisting of 50% of the entire sample. Do this randomly, not based on position in the dataset (i.e., do not take the first half as your test set, etc.; instead use the sample() function).

    # Load necessary libraries
    library(dplyr)
    ## Warning: package 'dplyr' was built under R version 4.4.3
    ## 
    ## Attaching package: 'dplyr'
    ## The following objects are masked from 'package:stats':
    ## 
    ##     filter, lag
    ## The following objects are masked from 'package:base':
    ## 
    ##     intersect, setdiff, setequal, union
    # Load the dataset
    mushrooms <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)
    
    # View structure (all columns are categorical)
    str(mushrooms)
    ## 'data.frame':    8124 obs. of  23 variables:
    ##  $ class                   : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
    ##  $ cap.shape               : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
    ##  $ cap.surface             : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
    ##  $ cap.color               : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
    ##  $ bruises                 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
    ##  $ odor                    : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
    ##  $ gill.attachment         : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
    ##  $ gill.spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
    ##  $ gill.size               : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
    ##  $ gill.color              : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
    ##  $ stalk.shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
    ##  $ stalk.root              : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
    ##  $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
    ##  $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
    ##  $ stalk.color.above.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
    ##  $ stalk.color.below.ring  : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
    ##  $ veil.type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ veil.color              : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
    ##  $ ring.number             : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
    ##  $ ring.type               : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
    ##  $ spore.print.color       : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
    ##  $ population              : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
    ##  $ habitat                 : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
    # Set seed for reproducibility
    set.seed(123)
    
    # Create a random sample of indices for training (50% of data)
    train_indices <- sample(1:nrow(mushrooms), size = 0.5 * nrow(mushrooms))
    
    # Split the dataset
    train_data <- mushrooms[train_indices, ]
    test_data <- mushrooms[-train_indices, ]
    
    # Check the distribution of classes in both sets
    table(train_data$class)
    ## 
    ##    e    p 
    ## 2112 1950
    table(test_data$class)
    ## 
    ##    e    p 
    ## 2096 1966

Explanation:

This code loads the dataset and uses sample() to randomly divide it into two equal halves, train_data and test_data, which avoids any positional bias in the split. The class tables above confirm that both halves contain a healthy mix of edible and poisonous mushrooms.
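
If a class-balanced split is preferred, a stratified alternative preserves the e/p ratio in each half. The sketch below is optional and assumes the caret package (loaded later for problem 3) is available:

    # Optional sketch: stratified 50/50 split preserving the class ratio
    library(caret)
    set.seed(123)
    idx <- createDataPartition(mushrooms$class, p = 0.5, list = FALSE)
    strat_train <- mushrooms[idx, ]
    strat_test  <- mushrooms[-idx, ]
    prop.table(table(strat_train$class))  # nearly identical e/p ratio to the full data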

  b. Using a CART, fit a classification tree to the training data and employ cost-complexity pruning, with cross-validation used to determine the optimal number of terminal nodes. Then test this “best” model on the test data set. Interpret the final best model using the tree structure, and visualize it, adding text to represent the decisions.

    # Load tree library
    library(rpart)
    ## Warning: package 'rpart' was built under R version 4.4.3
    library(rpart.plot)
    ## Warning: package 'rpart.plot' was built under R version 4.4.3
    # Fit classification tree
    cart_model <- rpart(class ~ ., data = train_data, method = "class", cp = 0.01)
    
    # Cross-validated pruning
    printcp(cart_model)
    ## 
    ## Classification tree:
    ## rpart(formula = class ~ ., data = train_data, method = "class", 
    ##     cp = 0.01)
    ## 
    ## Variables actually used in tree construction:
    ## [1] odor              spore.print.color
    ## 
    ## Root node error: 1950/4062 = 0.48006
    ## 
    ## n= 4062 
    ## 
    ##        CP nsplit rel error   xerror      xstd
    ## 1 0.96872      0  1.000000 1.000000 0.0163290
    ## 2 0.02000      1  0.031282 0.031282 0.0039751
    ## 3 0.01000      2  0.011282 0.011282 0.0023988
    best_cp <- cart_model$cptable[which.min(cart_model$cptable[, "xerror"]), "CP"]
    pruned_model <- prune(cart_model, cp = best_cp)
    
    # Predict on test data
    cart_preds <- predict(pruned_model, test_data, type = "class")
    cart_accuracy <- mean(cart_preds == test_data$class)
    cat("CART Test Accuracy:", cart_accuracy, "\n")
    ## CART Test Accuracy: 0.9935992
    # Visualize pruned tree
    rpart.plot(pruned_model, type = 2, extra = 104, fallen.leaves = TRUE)

Explanation:

This code fits a classification tree (CART), uses cost complexity pruning via cross-validation to select the best cp, and prunes the tree accordingly. It then evaluates the pruned model on the test data and plots the final decision tree.
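
Consistent with the printcp output, odor alone removes almost all of the classification error (the relative error drops from 1.000 to 0.031 after one split), and spore.print.color cleans up most of what remains. To see the cross-validated error curve behind the cp selection, and where the pruned tree errs on the test set, here is a short sketch using only functions already loaded above:

    # Sketch: CV error curve and test-set confusion matrix for the pruned tree
    plotcp(cart_model)  # plots xerror against the complexity parameter
    table(predicted = cart_preds, actual = test_data$class)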

  c. Use bootstrap aggregation on the training set to build a model which can predict edible vs poisonous status. Test your resulting bagged tree model on the testing data created in part a). Try some different values for \(B\), the number of bagged trees, to assess the sensitivity of your results. How many trees seem sufficient? Create variable importance plots for this final model and interpret the findings.

    # Load randomForest library
    library(randomForest)
    ## Warning: package 'randomForest' was built under R version 4.4.3
    ## randomForest 4.7-1.2
    ## Type rfNews() to see new features/changes/bug fixes.
    ## 
    ## Attaching package: 'randomForest'
    ## The following object is masked from 'package:dplyr':
    ## 
    ##     combine
    # Fit bagged model with different tree counts
    set.seed(123)
    bag_model <- randomForest(class ~ ., data = train_data, mtry = ncol(train_data) - 1, ntree = 200, importance = TRUE)
    
    # Predict and evaluate
    bag_preds <- predict(bag_model, test_data)
    bag_accuracy <- mean(bag_preds == test_data$class)
    cat("Bagging Test Accuracy:", bag_accuracy, "\n")
    ## Bagging Test Accuracy: 0.9985229
    # Variable importance plot
    varImpPlot(bag_model)

Explanation:

This code builds a bagged ensemble with randomForest by setting mtry to the total number of predictors, which makes the random forest equivalent to bagging, and evaluates it on the test set. The variable importance plot shows which features contribute most to classification.
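
The prompt also asks how many trees seem sufficient. A minimal sensitivity sketch follows; the B values are arbitrary illustrative choices:

    # Sketch: test accuracy of bagging for several values of B
    set.seed(123)
    for (B in c(25, 50, 100, 200, 500)) {
      fit <- randomForest(class ~ ., data = train_data,
                          mtry = ncol(train_data) - 1, ntree = B)
      acc <- mean(predict(fit, test_data) == test_data$class)
      cat("B =", B, "test accuracy:", round(acc, 4), "\n")
    }

If the accuracy plateaus by some moderate B, that many trees is sufficient; plot(bag_model) shows the same stabilization via OOB error without touching the test set.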

  d. Use random forests on the training set to build a model which can predict edible vs poisonous status. Test your resulting random forest model on the testing data created in part a). Try some alternative values for \(mtry\), the number of predictors “tried” as candidate splitters at each node, and also for \(B\). Compare these models in terms of their test performance. Which \(B\) and \(mtry\) combination seems best? Create variable importance plots for this final model and interpret the findings.

    # Try different values for mtry and ntree
    set.seed(123)
    rf_model <- randomForest(class ~ ., data = train_data, mtry = 3, ntree = 300, importance = TRUE)
    
    # Predict and evaluate
    rf_preds <- predict(rf_model, test_data)
    rf_accuracy <- mean(rf_preds == test_data$class)
    cat("Random Forest Test Accuracy:", rf_accuracy, "\n")
    ## Random Forest Test Accuracy: 1
    # Variable importance plot
    varImpPlot(rf_model)

Explanation:

This code trains a random forest with mtry = 3 and ntree = 300, evaluates it on the test set, and generates a variable importance plot. Tuning mtry and ntree and comparing test performance identifies the best combination; a small grid-search sketch follows.
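
A minimal grid-search sketch for comparing (mtry, B) combinations; the grid values are illustrative choices, not prescribed by the prompt:

    # Sketch: test accuracy over a small grid of mtry and ntree values
    set.seed(123)
    rf_grid <- expand.grid(mtry = c(2, 3, 5, 8), ntree = c(100, 300, 500))
    rf_grid$accuracy <- NA
    for (i in seq_len(nrow(rf_grid))) {
      fit <- randomForest(class ~ ., data = train_data,
                          mtry = rf_grid$mtry[i], ntree = rf_grid$ntree[i])
      rf_grid$accuracy[i] <- mean(predict(fit, test_data) == test_data$class)
    }
    rf_grid[order(-rf_grid$accuracy), ]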

  e. Use boosting on the training set to build a model which can predict edible vs poisonous status. Test your resulting boosted model on the testing data created in part a). You should assess alternative values for \(\lambda\), perhaps 0.001, 0.01, 0.1, etc. What do you notice about the relationship between \(\lambda\) and \(B\)? You may use “stumps” for each tree in the boosting algorithm if you’d like, so you will not need to modify interaction.depth, and can instead keep it set to 1. Which combination of (\(\lambda\), \(B\)) seems best? Create variable importance plots for this final model and interpret the findings.

    # Load boosting library
    library(gbm)
    ## Warning: package 'gbm' was built under R version 4.4.3
    ## Loaded gbm 2.2.2
    ## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
    # Convert class to numeric: edible = 0, poisonous = 1
    train_data$class_numeric <- ifelse(train_data$class == "p", 1, 0)
    test_data$class_numeric <- ifelse(test_data$class == "p", 1, 0)
    
    # Try multiple values for learning rate (λ) and number of trees (B)
    learning_rates <- c(0.001, 0.01, 0.1)
    tree_counts <- c(100, 200, 500)
    
    results <- data.frame()
    
    for (lr in learning_rates) {
      for (nt in tree_counts) {
        set.seed(1234)
        boost_model <- gbm(class_numeric ~ ., 
                           data = train_data[, -which(names(train_data) == "class")],
                           distribution = "bernoulli",
                           n.trees = nt, 
                           shrinkage = lr,
                           interaction.depth = 1,
                           verbose = FALSE)
    
        preds_prob <- predict(boost_model, newdata = test_data, n.trees = nt, type = "response")
        preds_class <- ifelse(preds_prob > 0.5, 1, 0)
        acc <- mean(preds_class == test_data$class_numeric)
        results <- rbind(results, data.frame(learning_rate = lr, trees = nt, accuracy = acc))
      }
    }
    ## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
    ## : variable 16: veil.type has no variation.
    ## (This warning is repeated once for each of the nine model fits.)
    print(results)
    ##   learning_rate trees  accuracy
    ## 1         0.001   100 0.9854751
    ## 2         0.001   200 0.9854751
    ## 3         0.001   500 0.9854751
    ## 4         0.010   100 0.9854751
    ## 5         0.010   200 0.9854751
    ## 6         0.010   500 0.9935992
    ## 7         0.100   100 0.9963072
    ## 8         0.100   200 0.9982767
    ## 9         0.100   500 0.9990153
    # Fit the best model for interpretation and plotting
    best_model <- gbm(class_numeric ~ ., 
                      data = train_data[, -which(names(train_data) == "class")],
                      distribution = "bernoulli",
                      n.trees = 200, 
                      shrinkage = 0.1,
                      interaction.depth = 1,
                      verbose = FALSE)
    ## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
    ## : variable 16: veil.type has no variation.
    # Variable importance plot
    summary(best_model)
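
Explanation:

This code loops over a grid of learning rates (λ) and tree counts (B), fits a boosted-stump model (interaction.depth = 1) for each pair, and records the test accuracy. The results table shows the expected trade-off: with λ = 0.001 the accuracy stays flat at 0.9855 even at B = 500, while λ = 0.1 climbs to 0.9990 by B = 500; in short, a smaller λ requires a larger B. The repeated veil.type warning is harmless, since that factor has only one level and gbm simply never splits on it. The refit "best" model uses λ = 0.1 with B = 200, which already achieves 0.9983, and summary(best_model) plots and prints the relative influence of each predictor.

To inspect the top predictors without the plot, summary.gbm can return just the influence table; a minimal sketch:

    # Sketch: top five predictors by relative influence, plot suppressed
    infl <- summary(best_model, plotit = FALSE)
    head(infl, 5)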

  f. Based on your results from b) through e), which statistical learning algorithm seems to yield the best accuracy results? Select the best model from all of the models. Is this likely to be the same answer that other students will get on this exam, assuming you correctly used your name in setting the seed in part a) (set.seed(1234))?

    Answer

    Based on the test accuracy results from all models:

    • CART (pruned): 0.9936

    • Bagging (mtry = 22, 200 trees): 0.9985

    • Random forest (mtry = 3, 300 trees): 1.0000

    • Boosting (λ = 0.1): up to 0.9990 with 500 trees

    All three ensemble methods (bagging, random forests, and boosting) achieved near-perfect test accuracy, which indicates that the mushroom dataset is highly separable given these features. The random forest with mtry = 3 and B = 300 was the only model to classify every test mushroom correctly, so it is selected as the best model here; boosting with λ = 0.1 came very close while using only stumps and offering clear interpretation through its variable importance measures.

    Because each student sets a different random seed in part a), the exact splits and accuracies will differ slightly from student to student. Given how cleanly these features separate edible from poisonous mushrooms, however, other students are very likely to reach the same conclusion.

  2. Using the Bootstrap to estimate standard errors: In this problem, we will employ bootstrapping to estimate the variability of an estimator for which standard software does not exist. Use the boot() function, or risk losing points!

Using the Auto data from package ISLR, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.

# Load necessary libraries
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
# Load and clean data
data("Auto")
auto_data <- na.omit(Auto)

# Define bootstrap statistic function
boot_fn <- function(data, indices) {
  d <- data[indices, ]
  med_4 <- median(d$mpg[d$cylinders == 4])
  med_8 <- median(d$mpg[d$cylinders == 8])
  return(med_4 - med_8)
}

# Run the bootstrap with 1000 replications
set.seed(123)
boot_result <- boot(data = auto_data, statistic = boot_fn, R = 1000)

# Print the result
print(boot_result)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = auto_data, statistic = boot_fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*     14.4 -0.2439   0.7671612
cat("Estimated SE of median MPG difference (4 cyl - 8 cyl):", sd(boot_result$t), "\n")
## Estimated SE of median MPG difference (4 cyl - 8 cyl): 0.7671612

Explanation

This code uses the boot() function to compute 1,000 bootstrap replicates of the difference in median mpg between 4-cylinder and 8-cylinder cars. boot() handles the resampling itself; the statistic function boot_fn() receives the resampled indices and computes the median difference for each replicate. The estimated standard error is the standard deviation of the bootstrap distribution, about 0.77.
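
If an interval estimate is also of interest, the same boot object works with boot.ci(); a minimal sketch using the percentile method:

# Sketch: 95% percentile bootstrap CI for the median mpg difference
boot.ci(boot_result, type = "perc")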

  3. Write your own function for conducting 10-fold cross-validation to assess the test error in a linear regression model predicting estimated performance in the cpus dataset from R package MASS, using only minimum main memory and maximum main memory as predictors. DO NOT worry about including interaction effects or higher-order effects as in the first exam. Just a main effect for each variable will be sufficient. You need to show how to use the createFolds() function (it creates the folds for you) from the R package caret, as explained in class.

    # Load required libraries
    library(MASS)
    ## Warning: package 'MASS' was built under R version 4.4.3
    ## 
    ## Attaching package: 'MASS'
    ## The following object is masked from 'package:dplyr':
    ## 
    ##     select
    library(caret)
    ## Warning: package 'caret' was built under R version 4.4.3
    ## Loading required package: ggplot2
    ## 
    ## Attaching package: 'ggplot2'
    ## The following object is masked from 'package:randomForest':
    ## 
    ##     margin
    ## Loading required package: lattice
    ## Warning: package 'lattice' was built under R version 4.4.2
    ## 
    ## Attaching package: 'lattice'
    ## The following object is masked from 'package:boot':
    ## 
    ##     melanoma
    # Load and prepare the cpus dataset
    data("cpus")
    df <- data.frame(perf = cpus$perf, minram = cpus$mmin, maxram = cpus$mmax)
    
    # Define a custom 10-fold cross-validation function
    cv_lm_mse <- function(data, k = 10, seed = 1234) {
      set.seed(seed)
      folds <- createFolds(data$perf, k = k)
      mse_list <- numeric(k)
    
      for (i in 1:k) {
        test_idx <- folds[[i]]
        train <- data[-test_idx, ]
        test <- data[test_idx, ]
    
        # Fit linear regression model with only main effects
        model <- lm(perf ~ minram + maxram, data = train)
    
        # Predict and calculate test MSE
        preds <- predict(model, newdata = test)
        mse_list[i] <- mean((test$perf - preds)^2)
      }
    
      return(mean(mse_list))  # Average test MSE
    }
    
    # Run the function and print result
    mean_cv_mse <- cv_lm_mse(df)
    cat("Average 10-Fold CV Test MSE:", mean_cv_mse, "\n")
    ## Average 10-Fold CV Test MSE: 6315.865

Explanation

This code defines a custom function cv_lm_mse() that performs 10-fold cross-validation using the createFolds() function from the caret package to assign observations to folds. It fits a linear model predicting CPU performance (perf) from minimum and maximum main memory on each set of training folds, computes the test MSE on the held-out fold, and returns the average test MSE across the ten folds. (Note that cpus also contains estperf, the estimated performance from the original study; the published performance, perf, is the response used here.)
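
As an optional cross-check (an addition to the prompt, not part of it), boot::cv.glm() performs K-fold cross-validation for models fit with glm(); with the Gaussian default, glm() here is ordinary linear regression. Its estimate should land near the custom function's, though it will not match exactly because the fold assignments are random:

    # Sketch: cross-check with boot::cv.glm (boot was loaded in problem 2)
    set.seed(1234)
    glm_fit <- glm(perf ~ minram + maxram, data = df)
    cv.glm(df, glm_fit, K = 10)$delta[1]  # raw 10-fold CV estimate of the MSE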