Experimenting with tree-based methods: In this problem, you will use CARTs, bagging, random forests, and boosting in an attempt to accurately predict whether mushrooms are edible or poisonous, based on a large collection of variables describing the mushrooms (cap shape, cap surface, odor, gill spacing, gill size, stalk color, etc.). The data can be found under the Files tab in folder Exam 2/Final Exam (mushrooms.csv) on Canvas. More information about this data (including the variable level designations needed for interpretation) is available at the following site: https://www.kaggle.com/uciml/mushroom-classification/data
# Load libraries and Data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(tidyverse)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
# Load the data
mushrooms <- read.csv("C:\\Users\\My PC\\Downloads\\IU\\SL\\mushrooms.csv")
# Convert characters to factors if needed
mushrooms <- mushrooms %>%
mutate_if(is.character, as.factor)
# Set seed
set.seed(1234)
# Split 50% train/test
train_index <- sample(1:nrow(mushrooms), nrow(mushrooms)/2)
train_data <- mushrooms[train_index, ]
test_data <- mushrooms[-train_index, ]
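As an optional sanity check (not part of the original prompt), the class balance in the two halves can be compared; with a random 50/50 split, the edible/poisonous proportions should be similar in both.
# Optional check: class proportions in each split (e = edible, p = poisonous)
prop.table(table(train_data$class))
prop.table(table(test_data$class))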
# Fit the tree
tree_model <- rpart(class ~ ., data = train_data, method = "class")
# Examine complexity parameter table
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] odor spore.print.color
##
## Root node error: 1953/4062 = 0.4808
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.970302 0 1.000000 1.000000 0.0163049
## 2 0.016385 1 0.029698 0.029698 0.0038716
## 3 0.010000 2 0.013313 0.013313 0.0026025
plotcp(tree_model)
# Find the optimal cp
opt_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]
cat("Optimal complexity parameter:", opt_cp, "\n")
## Optimal complexity parameter: 0.01
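The code above selects the cp with the smallest cross-validated error. A common, slightly more conservative alternative is the 1-SE rule: choose the simplest tree whose xerror is within one standard error of the minimum. The optional sketch below shows that alternative; based on the cp table printed above, it selects the same cp here.
# Optional alternative: the 1-SE rule for choosing cp
cp_tab <- tree_model$cptable
min_row <- which.min(cp_tab[, "xerror"])
thresh <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
cp_1se <- cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
cp_1se  # coincides with opt_cp (0.01) for this cp table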
# Prune the tree
pruned_tree <- prune(tree_model, cp = opt_cp)
# Improved tree visualization
rpart.plot(
  pruned_tree,
  type = 2,               # type 2: split labels below the node
  extra = 106,            # show probabilities + % of observations
  fallen.leaves = TRUE,   # place leaves at the bottom
  shadow.col = "gray",    # add shadow under boxes
  box.palette = "Blues",  # blue gradient palette
  under = TRUE,           # show extra info under the box
  cex = 0.7,              # shrink text size for clarity
  main = "Enhanced Classification Tree: Mushroom Edibility"
)
# Test model performance
tree_pred <- predict(pruned_tree, newdata = test_data, type = "class")
tree_accuracy <- mean(tree_pred == test_data$class)
cat("CART accuracy on test set:", tree_accuracy, "\n")
## CART accuracy on test set: 0.9945839
Insight:
The classification tree reveals that odor is the most influential feature in predicting whether a mushroom is edible (e) or poisonous (p).
Mushrooms with an odor of almond, anise, or none (a, l, n) are classified as edible in nearly all cases (branching to the left side of the tree).
Within this group, spore print color (specifically values such as b, h, k, n, o, u, w, or y) offers additional, though minor, refinement. However, the majority still remain classified as edible.
Mushrooms with any other odor are directly classified as poisonous (branching to the right side of the tree).
In summary, odor alone provides a highly accurate separation between edible and poisonous mushrooms, highlighting its role as the most decisive predictor in this dataset.
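One way to verify the claim that odor nearly separates the two classes is a simple cross-tabulation on the training data; this is an illustrative check added here, not part of the graded output.
# Illustrative check: distribution of class (e/p) within each odor level
table(train_data$odor, train_data$class)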
# Define values for number of trees (B)
b_values <- c(50, 100, 200, 500)
bagging_results <- data.frame(B = integer(), Accuracy = numeric())
# Loop over each B to train bagging models
for (b in b_values) {
  bag_model <- randomForest(class ~ ., data = train_data,
                            mtry = ncol(train_data) - 1,  # use all predictors (bagging)
                            ntree = b,
                            importance = TRUE)
  # Predict and compute accuracy
  bag_pred <- predict(bag_model, newdata = test_data)
  bag_accuracy <- mean(bag_pred == test_data$class)
  # Store results
  bagging_results <- rbind(bagging_results,
                           data.frame(B = b, Accuracy = bag_accuracy))
  cat("Bagging with B =", b, "- Accuracy:", bag_accuracy, "\n")
}
## Bagging with B = 50 - Accuracy: 0.9995076
## Bagging with B = 100 - Accuracy: 0.9995076
## Bagging with B = 200 - Accuracy: 0.9995076
## Bagging with B = 500 - Accuracy: 0.9995076
# Plot accuracy vs number of trees
plot(bagging_results$B, bagging_results$Accuracy,
type = "b", pch = 19, col = "darkblue",
xlab = "Number of Trees (B)", ylab = "Accuracy",
main = "Bagging: Accuracy vs Number of Trees")
# Identify best B
best_b <- bagging_results$B[which.max(bagging_results$Accuracy)]
cat("Best number of trees for bagging:", best_b, "\n")
## Best number of trees for bagging: 50
# Build final bagging model using optimal B
final_bag <- randomForest(class ~ ., data = train_data,
                          mtry = ncol(train_data) - 1,
                          ntree = best_b,
                          importance = TRUE)
# Plot top 10 most important variables
varImpPlot(final_bag, main = "Variable Importance - Bagging", n.var = 10)
Insights:
Bagging (Bootstrap Aggregation) with various numbers of trees (B = 50, 100, 200, 500) consistently achieved exceptionally high accuracy (~99.95%) on the test set.
The odor feature emerged as the most critical variable, significantly outperforming all others in importance.
Other contributing features include spore print color and stalk color below the ring.
This confirms that a few highly predictive features dominate the classification task and allow bagging to produce near-perfect results.
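If numeric importance scores are preferred over the plot, randomForest::importance() returns the underlying matrix; the optional sketch below ranks predictors by mean decrease in Gini impurity.
# Optional: top 10 predictors by MeanDecreaseGini from the bagging model
imp <- importance(final_bag)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), ], 10)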
# RANDOM FOREST: grid over mtry and B; note floor(sqrt(p)) = floor(sqrt(22)) = 4,
# so the value 4 appears twice in the grid (hence the repeated mtry = 4 output below)
mtry_values <- c(2, 4, floor(sqrt(ncol(train_data) - 1)))
b_values <- c(100, 300, 500)
rf_results <- data.frame(mtry = integer(), B = integer(), Accuracy = numeric())
# Grid search over mtry and B combinations
for (mtry in mtry_values) {
  for (b in b_values) {
    rf_model <- randomForest(class ~ ., data = train_data,
                             mtry = mtry,
                             ntree = b,
                             importance = TRUE)
    # Evaluate model
    rf_pred <- predict(rf_model, newdata = test_data)
    rf_accuracy <- mean(rf_pred == test_data$class)
    # Store results
    rf_results <- rbind(rf_results,
                        data.frame(mtry = mtry, B = b, Accuracy = rf_accuracy))
    cat("Random Forest with mtry =", mtry, "and B =", b, "- Accuracy:", rf_accuracy, "\n")
  }
}
## Random Forest with mtry = 2 and B = 100 - Accuracy: 1
## Random Forest with mtry = 2 and B = 300 - Accuracy: 1
## Random Forest with mtry = 2 and B = 500 - Accuracy: 1
## Random Forest with mtry = 4 and B = 100 - Accuracy: 1
## Random Forest with mtry = 4 and B = 300 - Accuracy: 1
## Random Forest with mtry = 4 and B = 500 - Accuracy: 1
## Random Forest with mtry = 4 and B = 100 - Accuracy: 1
## Random Forest with mtry = 4 and B = 300 - Accuracy: 1
## Random Forest with mtry = 4 and B = 500 - Accuracy: 1
# Identify best combination of parameters
best_rf <- rf_results[which.max(rf_results$Accuracy), ]
cat("Best Random Forest parameters: mtry =", best_rf$mtry, "and B =", best_rf$B, "\n")
## Best Random Forest parameters: mtry = 2 and B = 100
# Train final Random Forest with optimal parameters
final_rf <- randomForest(class ~ ., data = train_data,
                         mtry = best_rf$mtry,
                         ntree = best_rf$B,
                         importance = TRUE)
# Plot top 10 most important variables
varImpPlot(final_rf, main = "Variable Importance - Random Forest", n.var = 10)
Insights: The Random Forest model achieved perfect classification of mushrooms, accurately distinguishing edible from poisonous, across all tested combinations of tree count (B = 100, 300, 500) and mtry values (2, 4, and √p ≈ 4).
Parameter tuning had minimal impact on performance, as every configuration delivered flawless accuracy. This suggests you can confidently opt for lower complexity settings (e.g., B = 100, mtry = 2) to conserve computational resources without compromising accuracy.
The variable importance plot reinforces that “odor” is by far the most decisive feature, followed by spore print color and gill characteristics.
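Random forests also report an out-of-bag (OOB) error estimate, which corroborates the test-set results without needing a separate holdout; the lines below are an optional check on the final model.
# Optional: out-of-bag error of the final random forest
print(final_rf)                # printed summary includes the OOB error rate
tail(final_rf$err.rate, 1)     # OOB and per-class error rates at the final tree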
# Define learning rates and tree counts
lambda_values <- c(0.001, 0.01, 0.1)
b_values <- c(100, 500, 1000)
boost_results <- data.frame(lambda = numeric(), B = integer(), Accuracy = numeric())
# Loop over combinations of lambda and B
for (lambda in lambda_values) {
  for (b in b_values) {
    # Prepare data (numeric 0/1 target for gbm's bernoulli loss)
    train_boost <- train_data
    train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)
    # Fit the boosting model
    boost_model <- gbm(class_num ~ . - class, data = train_boost,
                       distribution = "bernoulli",
                       n.trees = b,
                       interaction.depth = 1,
                       shrinkage = lambda,
                       verbose = FALSE)
    # Predict on test data
    test_boost <- test_data
    boost_pred_prob <- predict(boost_model, newdata = test_boost,
                               n.trees = b, type = "response")
    boost_pred_class <- ifelse(boost_pred_prob > 0.5, "p", "e")
    boost_accuracy <- mean(boost_pred_class == test_data$class)
    # Save results
    boost_results <- rbind(boost_results,
                           data.frame(lambda = lambda, B = b, Accuracy = boost_accuracy))
    cat("Boosting with lambda =", lambda, "and B =", b, "- Accuracy:", boost_accuracy, "\n")
  }
}
## Boosting with lambda = 0.001 and B = 100 - Accuracy: 0.9847366
## Boosting with lambda = 0.001 and B = 500 - Accuracy: 0.9847366
## Boosting with lambda = 0.001 and B = 1000 - Accuracy: 0.9847366
## Boosting with lambda = 0.01 and B = 100 - Accuracy: 0.9847366
## Boosting with lambda = 0.01 and B = 500 - Accuracy: 0.9945839
## Boosting with lambda = 0.01 and B = 1000 - Accuracy: 0.9975382
## Boosting with lambda = 0.1 and B = 100 - Accuracy: 0.9975382
## Boosting with lambda = 0.1 and B = 500 - Accuracy: 1
## Boosting with lambda = 0.1 and B = 1000 - Accuracy: 1
# Identify best lambda-B combination
best_boost <- boost_results[which.max(boost_results$Accuracy), ]
cat("Best Boosting parameters: lambda =", best_boost$lambda, "and B =", best_boost$B, "\n")
## Best Boosting parameters: lambda = 0.1 and B = 500
# Heatmap visualization of accuracy by lambda and B
library(ggplot2)
ggplot(boost_results, aes(x = factor(B), y = factor(lambda), fill = Accuracy)) +
geom_tile() +
scale_fill_gradient(low = "black", high = "skyblue") +
labs(x = "Number of Trees (B)", y = "Learning Rate (λ)",
title = "Boosting: Accuracy by λ and B")
# Final Boosting model with best parameters
train_boost <- train_data
train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)
final_boost <- gbm(class_num ~ . - class, data = train_boost,
                   distribution = "bernoulli",
                   n.trees = best_boost$B,
                   interaction.depth = 1,
                   shrinkage = best_boost$lambda,
                   verbose = FALSE)
# Variable importance plot
summary(final_boost, plotit = TRUE, n.trees = best_boost$B, cBars = 10)
Insight
The Boosting analysis demonstrates that high predictive accuracy (up to 100%) is achieved using a larger learning rate (λ = 0.1) combined with a sufficient number of trees (B ≥ 500).
With smaller learning rates, many more trees are required to reach comparable performance, reflecting the usual tradeoff in boosting between the shrinkage parameter and the number of iterations.
The heatmap visualization effectively illustrates the balance between λ and B in achieving high accuracy.
The variable importance plot clearly shows “odor” as the dominant predictor, with minimal contributions from other variables. This reaffirms the pattern observed across all modeling techniques in this mushroom classification task.
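For a tabular view of the boosting importances instead of the bar chart, summary.gbm can be called with plotting turned off; this is an optional supplement to the plot above.
# Optional: relative influence of each predictor as a data frame (no plot)
rel_inf <- summary(final_boost, n.trees = best_boost$B, plotit = FALSE)
head(rel_inf, 10)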
# Combine results for all models
all_results <- data.frame(
  Model = c("CART", "Bagging", "Random Forest", "Boosting"),
  Accuracy = c(tree_accuracy,
               max(bagging_results$Accuracy),
               max(rf_results$Accuracy),
               max(boost_results$Accuracy))
)
# Sort by descending accuracy
all_results <- all_results[order(-all_results$Accuracy), ]
print(all_results)
## Model Accuracy
## 3 Random Forest 1.0000000
## 4 Boosting 1.0000000
## 2 Bagging 0.9995076
## 1 CART 0.9945839
# Identify the best model
best_model <- all_results$Model[1]
best_accuracy <- all_results$Accuracy[1]
cat("The best model is", best_model, "with accuracy", round(best_accuracy, 4), "\n")
## The best model is Random Forest with accuracy 1
Insights:
The final comparison shows that Random Forest and Boosting are the top performers, both reaching 100% accuracy on the test set; Random Forest is reported as the single best model only because ties are broken by the sort order. A confusion matrix for the selected model is shown below as an additional check.
These results align with ensemble learning theory: aggregating many trees, especially with random forests and boosting, substantially improves predictive performance over a single decision tree.
Note: Results may vary slightly between runs or systems due to randomization in model fitting, but with a fixed seed (e.g., 1234), students should observe very similar patterns.
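Because the two error types are not equally costly (labeling a poisonous mushroom edible is far worse than the reverse), accuracy alone is not the whole story. The optional sketch below tabulates the confusion matrix for the already-fitted final_rf on the test set.
# Optional: confusion matrix for the selected model on the test set
rf_final_pred <- predict(final_rf, newdata = test_data)
table(Predicted = rf_final_pred, Actual = test_data$class)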
Using the Bootstrap to estimate standard errors: In this problem, we will employ bootstrapping to estimate the variability of an estimator for which standard software does not exist. Use the boot() function, or risk losing points!
Using the Auto data from the ISLR package, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.
# Load required libraries
library(ISLR)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
# Load the Auto dataset
data(Auto)
# Define function to calculate the difference in median mpg between 4 and 8 cylinder cars
diff_median_mpg <- function(data, indices) {
  sample_data <- data[indices, ]
  median_4cyl <- median(sample_data$mpg[sample_data$cylinders == 4])
  median_8cyl <- median(sample_data$mpg[sample_data$cylinders == 8])
  return(median_4cyl - median_8cyl)
}
# Perform bootstrap with 1000 resamples
set.seed(1234)
boot_result <- boot(data = Auto, statistic = diff_median_mpg, R = 1000)
# View bootstrap result
print(boot_result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = diff_median_mpg, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
# Extract original estimate and bootstrap standard error
boot_result$t0 # Original difference in medians
## [1] 14.4
sd(boot_result$t) # Standard error estimate from bootstrap
## [1] 0.797002
Interpretation
The estimated difference in median miles per gallon (mpg) between 4-cylinder and 8-cylinder vehicles is approximately 14.4 mpg, indicating that 4-cylinder cars are substantially more fuel-efficient.
Using 1,000 bootstrap replications, the bootstrap-estimated standard error of this difference is approximately 0.797 mpg.
The standard error is small relative to the 14.4 mpg estimate, so the difference in medians is estimated with low variability, supporting the conclusion that cylinder count is strongly associated with fuel efficiency in the Auto dataset.
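The same boot object can also provide an interval estimate; the percentile confidence interval below is an optional extension of the required standard-error calculation, using boot::boot.ci().
# Optional: 95% percentile bootstrap CI for the difference in medians
boot.ci(boot_result, type = "perc")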
Write your own function for conducting 10-fold cross-validation to assess the test error in a linear regression model predicting estimated performance in the cpus dataset from R package MASS, using only minimum main memory and maximum main memory as predictors. DO NOT worry about including interaction effects or higher-order effects as in the first exam. Just a main effect for each variable will be sufficient. You need to show how to use the createFolds() function (which creates the folds for you) from the R package caret, as explained in class.
# Load libraries
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
# Load cpus data
data(cpus)
# Custom 10-fold cross-validation function
crossval_lm <- function(data, response, predictors, k = 10, seed = 1234) {
  set.seed(seed)
  # Create folds using caret::createFolds
  folds <- createFolds(data[[response]], k = k, list = TRUE, returnTrain = FALSE)
  # Store MSE values for each fold
  mse_values <- numeric(k)
  # Loop over folds
  for (i in 1:k) {
    test_indices <- folds[[i]]
    train_data <- data[-test_indices, ]
    test_data <- data[test_indices, ]
    # Build formula dynamically: response ~ predictors
    formula <- as.formula(paste(response, "~", paste(predictors, collapse = " + ")))
    # Fit linear regression model
    model <- lm(formula, data = train_data)
    # Predict on test set
    predictions <- predict(model, newdata = test_data)
    # Compute MSE for this fold
    mse_values[i] <- mean((test_data[[response]] - predictions)^2)
  }
  # Calculate average MSE and standard error
  avg_mse <- mean(mse_values)
  se_mse <- sd(mse_values) / sqrt(k)
  # Return results as a list
  return(list(average_mse = avg_mse, standard_error = se_mse, mse_by_fold = mse_values))
}
# Example usage: predict 'perf' using 'mmin' and 'mmax'
cv_results <- crossval_lm(data = cpus, response = "perf", predictors = c("mmin", "mmax"))
# Print results
cat("10-Fold Cross-Validation Results:\n")
## 10-Fold Cross-Validation Results:
cat("Average Test MSE:", round(cv_results$average_mse, 4), "\n")
## Average Test MSE: 6315.865
cat("Standard Error:", round(cv_results$standard_error, 4), "\n")
## Standard Error: 1815.464
Interpretation:
Using 10-fold cross-validation, the linear regression model predicting CPU performance (perf) from minimum main memory (mmin) and maximum main memory (mmax) resulted in:
Average Test Mean Squared Error (MSE): ~6315.87
Standard Error of Test MSE: ~1815.46
This indicates that the average squared difference between predicted and actual performance is around 6315 units² across the 10 folds.
The standard error (1815.46) is nearly 30% of the average MSE, indicating substantial fold-to-fold variability; this is not unusual for a model with only two simple predictors on this dataset.
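Since caret is already loaded, the hand-rolled function can be cross-checked against caret's built-in 10-fold cross-validation for a linear model; the resampled RMSE it reports should be roughly the square root of the average MSE above (fold assignments differ, so the values will not match exactly). This is an optional verification, not part of the required answer.
# Optional cross-check with caret::train (10-fold CV, linear model)
set.seed(1234)
cv_check <- train(perf ~ mmin + mmax, data = cpus,
                  method = "lm",
                  trControl = trainControl(method = "cv", number = 10))
cv_check$results$RMSE^2  # squared RMSE, comparable to the average test MSE above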