Final Exam

1.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(dplyr)
mushrooms<- read.csv("C:/Users/MSKR/Downloads/mushrooms.csv", header = TRUE)
str(mushrooms)
## 'data.frame':    8124 obs. of  23 variables:
##  $ class                   : chr  "p" "e" "e" "p" ...
##  $ cap.shape               : chr  "x" "x" "b" "x" ...
##  $ cap.surface             : chr  "s" "s" "s" "y" ...
##  $ cap.color               : chr  "n" "y" "w" "w" ...
##  $ bruises                 : chr  "t" "t" "t" "t" ...
##  $ odor                    : chr  "p" "a" "l" "p" ...
##  $ gill.attachment         : chr  "f" "f" "f" "f" ...
##  $ gill.spacing            : chr  "c" "c" "c" "c" ...
##  $ gill.size               : chr  "n" "b" "b" "n" ...
##  $ gill.color              : chr  "k" "k" "n" "n" ...
##  $ stalk.shape             : chr  "e" "e" "e" "e" ...
##  $ stalk.root              : chr  "e" "c" "c" "e" ...
##  $ stalk.surface.above.ring: chr  "s" "s" "s" "s" ...
##  $ stalk.surface.below.ring: chr  "s" "s" "s" "s" ...
##  $ stalk.color.above.ring  : chr  "w" "w" "w" "w" ...
##  $ stalk.color.below.ring  : chr  "w" "w" "w" "w" ...
##  $ veil.type               : chr  "p" "p" "p" "p" ...
##  $ veil.color              : chr  "w" "w" "w" "w" ...
##  $ ring.number             : chr  "o" "o" "o" "o" ...
##  $ ring.type               : chr  "p" "p" "p" "p" ...
##  $ spore.print.color       : chr  "k" "n" "n" "k" ...
##  $ population              : chr  "s" "n" "n" "s" ...
##  $ habitat                 : chr  "u" "g" "m" "u" ...
# Convert character columns to factors for the tree-based models
mushrooms <- mushrooms %>% 
  mutate(across(where(is.character), as.factor))

a. Train/Test Split

# Split data randomly into training and test sets (50% each)
set.seed(1234)
train_index <- sample(1:nrow(mushrooms), size = 0.5 * nrow(mushrooms))
train_data <- mushrooms[train_index, ]
test_data <- mushrooms[-train_index, ]

# Verify splits
cat("Training set dimensions:", dim(train_data), "\n")
## Training set dimensions: 4062 23
cat("Test set dimensions:", dim(test_data), "\n")
## Test set dimensions: 4062 23
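
As an optional sanity check (a quick sketch, not required by the question), the class proportions in the two halves can be compared to confirm the random split kept the edible/poisonous balance roughly even:

# Compare class proportions in the training and test halves
prop.table(table(train_data$class))
prop.table(table(test_data$class))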

b. CART with Cost Complexity Pruning

# Build full classification tree
tree_model <- rpart(class ~ ., data = train_data, method = "class")

# Examine complexity parameter table
printcp(tree_model)
## 
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] odor              spore.print.color
## 
## Root node error: 1953/4062 = 0.4808
## 
## n= 4062 
## 
##         CP nsplit rel error   xerror      xstd
## 1 0.970302      0  1.000000 1.000000 0.0163049
## 2 0.016385      1  0.029698 0.029698 0.0038716
## 3 0.010000      2  0.013313 0.013313 0.0026025
plotcp(tree_model)

# Find optimal complexity parameter
opt_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]
cat("Optimal complexity parameter:", opt_cp, "\n")
## Optimal complexity parameter: 0.01
# Prune the tree
pruned_tree <- prune(tree_model, cp = opt_cp)

# Visualize the pruned tree
rpart.plot(pruned_tree, extra = 104, box.palette = "RdBu", 
          main = "Classification Tree for Mushroom Edibility")

# Test model performance
tree_pred <- predict(pruned_tree, newdata = test_data, type = "class")
tree_accuracy <- mean(tree_pred == test_data$class)
cat("CART accuracy on test set:", tree_accuracy, "\n")
## CART accuracy on test set: 0.9945839
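
Since caret is already loaded, the overall accuracy can be supplemented with a full confusion matrix; this is an optional sketch treating the poisonous class "p" as the positive class:

# Confusion matrix for the pruned tree on the test set (optional check)
confusionMatrix(tree_pred, test_data$class, positive = "p")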

Insight:

The classification tree shows that odor is the most important feature in determining whether a mushroom is edible (e) or poisonous (p).

  • If a mushroom’s odor is almond, anise, or none (a, l, n), it is almost always edible (left side of the tree).

    • Among those, spore print color (specifically b, h, k, n, o, u, w, or y) provides further refinement, but most still end up edible.

  • If the odor is anything else, the mushroom is classified as poisonous (right side of the tree).

Overall, odor alone separates most edible mushrooms from poisonous ones with very high accuracy.
This suggests that odor is a dominant predictor for mushroom edibility in this dataset.
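
This can be checked directly with a cross-tabulation of odor against class on the training data (a quick sketch):

# Cross-tabulate odor against class to see how cleanly odor separates e from p
table(train_data$odor, train_data$class)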

c. Bagging with Different Numbers of Trees

# Try different values for B (number of trees)
b_values <- c(50, 100, 200, 500)
bagging_results <- data.frame(B = integer(), Accuracy = numeric())
for (b in b_values) {
  # Build bagging model (mtry = p for bagging)
  bag_model <- randomForest(class ~ ., data = train_data,
                          mtry = ncol(train_data) - 1,  # All predictors
                          ntree = b,
                          importance = TRUE)
  
  # Test performance
  bag_pred <- predict(bag_model, newdata = test_data)
  bag_accuracy <- mean(bag_pred == test_data$class)
  
  bagging_results <- rbind(bagging_results, 
                         data.frame(B = b, Accuracy = bag_accuracy))
  
  cat("Bagging with B =", b, "- Accuracy:", bag_accuracy, "\n")
}
## Bagging with B = 50 - Accuracy: 0.9995076 
## Bagging with B = 100 - Accuracy: 0.9995076 
## Bagging with B = 200 - Accuracy: 0.9995076 
## Bagging with B = 500 - Accuracy: 0.9995076
# Plot accuracy vs number of trees
plot(bagging_results$B, bagging_results$Accuracy, 
     type = "b", xlab = "Number of Trees (B)", ylab = "Accuracy",
     main = "Bagging: Accuracy vs Number of Trees")

# Find optimal B
best_b <- bagging_results$B[which.max(bagging_results$Accuracy)]
cat("Best number of trees for bagging:", best_b, "\n")
## Best number of trees for bagging: 50
# Build final bagging model with optimal B
final_bag <- randomForest(class ~ ., data = train_data,
                         mtry = ncol(train_data) - 1,
                         ntree = best_b,
                         importance = TRUE)
# Variable importance plot
varImpPlot(final_bag, main = "Variable Importance - Bagging",
          n.var = 10)

Insight:

  • Using Bagging (Bootstrap Aggregating) with different numbers of trees (B = 50, 100, 200, 500) produced a very high and consistent accuracy of ~99.95%.

  • The odor feature is by far the most important predictor, followed by spore print color and stalk color below the ring.

  • This confirms that a few strong features drive almost perfect classification performance in this dataset; the sketch below cross-checks this against the model’s out-of-bag (OOB) error and the numeric importance scores.
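
Because bagging via randomForest also reports out-of-bag (OOB) error, the test-set accuracy and the importance plot can be cross-checked numerically; a brief sketch using the fitted final_bag object (the bag_imp name is just illustrative):

# OOB error estimate using all trees (last row of the error-rate matrix)
tail(final_bag$err.rate, 1)

# Numeric scores behind varImpPlot(): top predictors by mean decrease in Gini
bag_imp <- importance(final_bag)
head(bag_imp[order(bag_imp[, "MeanDecreaseGini"], decreasing = TRUE), ], 5)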

d. Random Forests with Different Parameter Combinations

# Try different combinations of mtry and B
mtry_values <- unique(c(2, 4, floor(sqrt(ncol(train_data) - 1))))  # floor(sqrt(22)) = 4, so drop the duplicate
b_values <- c(100, 300, 500)
rf_results <- data.frame(mtry = integer(), B = integer(), Accuracy = numeric())
for (mtry in mtry_values) {
  for (b in b_values) {
    # Build random forest model
    rf_model <- randomForest(class ~ ., data = train_data,
                           mtry = mtry,
                           ntree = b,
                           importance = TRUE)
    
    # Test performance
    rf_pred <- predict(rf_model, newdata = test_data)
    rf_accuracy <- mean(rf_pred == test_data$class)
    
    rf_results <- rbind(rf_results, 
                      data.frame(mtry = mtry, B = b, Accuracy = rf_accuracy))
    
    cat("Random Forest with mtry =", mtry, "and B =", b, "- Accuracy:", rf_accuracy, "\n")
  }
}
## Random Forest with mtry = 2 and B = 100 - Accuracy: 1 
## Random Forest with mtry = 2 and B = 300 - Accuracy: 1 
## Random Forest with mtry = 2 and B = 500 - Accuracy: 1 
## Random Forest with mtry = 4 and B = 100 - Accuracy: 1 
## Random Forest with mtry = 4 and B = 300 - Accuracy: 1 
## Random Forest with mtry = 4 and B = 500 - Accuracy: 1 
# Find optimal combination
best_rf <- rf_results[which.max(rf_results$Accuracy), ]
cat("Best Random Forest parameters: mtry =", best_rf$mtry, "and B =", best_rf$B, "\n")
## Best Random Forest parameters: mtry = 2 and B = 100
# Build final RF model with optimal parameters
final_rf <- randomForest(class ~ ., data = train_data,
                       mtry = best_rf$mtry,
                       ntree = best_rf$B,
                       importance = TRUE)
# Variable importance plot
varImpPlot(final_rf, main = "Variable Importance - Random Forest",
          n.var = 10)

Insight

  • The model perfectly classifies mushrooms as edible or poisonous regardless of the number of trees (B) or features considered at each split (mtry).

  • Parameter tuning has minimal impact since every configuration achieves perfect test accuracy, suggesting the smallest configuration tried (B = 100, mtry = 2) can be used for computational efficiency without sacrificing performance (see the OOB-error sketch after this list).

  • The variable importance plot confirms “odor” is by far the most significant predictor, followed by spore print color and gill characteristics.
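
One way to support the point that fewer trees suffice is to watch the OOB error stabilize as trees are added; a minimal sketch using the fitted final_rf object:

# OOB and per-class error by number of trees; a curve that flattens early
# supports using a small B
plot(final_rf, main = "OOB Error vs Number of Trees - Random Forest")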

e. Boosting with Different Learning Rates and Tree Numbers

# Try different combinations of lambda (learning rate) and B
lambda_values <- c(0.001, 0.01, 0.1)
b_values <- c(100, 500, 1000)
boost_results <- data.frame(lambda = numeric(), B = integer(), Accuracy = numeric())
# Prepare data for gbm once (it needs a numeric 0/1 response)
train_boost <- train_data
train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)

for (lambda in lambda_values) {
  for (b in b_values) {
    # Build boosting model
    boost_model <- gbm(class_num ~ . - class, data = train_boost,
                      distribution = "bernoulli",
                      n.trees = b,
                      interaction.depth = 1,  # Using stumps as requested
                      shrinkage = lambda,
                      verbose = FALSE)
    
    # Test performance
    test_boost <- test_data
    boost_pred_prob <- predict(boost_model, newdata = test_boost, 
                              n.trees = b, type = "response")
    boost_pred_class <- ifelse(boost_pred_prob > 0.5, "p", "e")
    boost_accuracy <- mean(boost_pred_class == test_data$class)
    
    boost_results <- rbind(boost_results, 
                         data.frame(lambda = lambda, B = b, Accuracy = boost_accuracy))
    
    cat("Boosting with lambda =", lambda, "and B =", b, "- Accuracy:", boost_accuracy, "\n")
  }
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 100 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 500 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 1000 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 100 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 500 - Accuracy: 0.9945839
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 1000 - Accuracy: 0.9975382
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 100 - Accuracy: 0.9975382
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 500 - Accuracy: 1
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 1000 - Accuracy: 1
# Find optimal combination
best_boost <- boost_results[which.max(boost_results$Accuracy), ]
cat("Best Boosting parameters: lambda =", best_boost$lambda, "and B =", best_boost$B, "\n")
## Best Boosting parameters: lambda = 0.1 and B = 500
# Visualize relationship with heatmap
library(ggplot2)
ggplot(boost_results, aes(x = factor(B), y = factor(lambda), fill = Accuracy)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(x = "Number of Trees (B)", y = "Learning Rate (lambda)", 
       title = "Boosting: Accuracy by lambda and B")

# Build final boosting model with optimal parameters
train_boost <- train_data
train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)

final_boost <- gbm(class_num ~ . - class, data = train_boost,
                  distribution = "bernoulli",
                  n.trees = best_boost$B,
                  interaction.depth = 1,
                  shrinkage = best_boost$lambda,
                  verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Variable importance
summary(final_boost, plotit = TRUE, n.trees = best_boost$B, cBars = 10)

##                                               var      rel.inf
## odor                                         odor 9.492480e+01
## spore.print.color               spore.print.color 4.263208e+00
## stalk.surface.below.ring stalk.surface.below.ring 2.411991e-01
## gill.size                               gill.size 2.389130e-01
## stalk.color.below.ring     stalk.color.below.ring 1.942312e-01
## population                             population 4.641570e-02
## cap.color                               cap.color 3.233962e-02
## stalk.surface.above.ring stalk.surface.above.ring 2.365999e-02
## ring.number                           ring.number 1.405065e-02
## cap.shape                               cap.shape 6.622187e-03
## habitat                                   habitat 6.403945e-03
## ring.type                               ring.type 5.463953e-03
## gill.color                             gill.color 1.727668e-03
## stalk.root                             stalk.root 8.790685e-04
## stalk.color.above.ring     stalk.color.above.ring 8.690701e-05
## cap.surface                           cap.surface 0.000000e+00
## bruises                                   bruises 0.000000e+00
## gill.attachment                   gill.attachment 0.000000e+00
## gill.spacing                         gill.spacing 0.000000e+00
## stalk.shape                           stalk.shape 0.000000e+00
## veil.type                               veil.type 0.000000e+00
## veil.color                             veil.color 0.000000e+00

Insights

  • The boosting results show that high accuracy (up to 100%) is achieved with the larger learning rate (λ = 0.1) and a sufficient number of trees (B ≥ 500), while lower learning rates require more trees to reach similar performance (illustrated by the sketch after this list).

  • The variable importance plot highlights “odor” as overwhelmingly the most influential feature in predicting mushroom edibility, with other variables contributing minimally.
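
To illustrate the lambda/B trade-off, one option (a sketch, refitting a single slow-learning model with the same setup as above; slow_boost and slow_probs are just illustrative names) is to score the model at several values of B, since gbm's predict() accepts a vector of n.trees:

# Fit one stump-based model with the smallest learning rate, then measure test
# accuracy at increasing numbers of trees to show it improving with B
slow_boost <- gbm(class_num ~ . - class, data = train_boost,
                  distribution = "bernoulli", n.trees = 1000,
                  interaction.depth = 1, shrinkage = 0.001, verbose = FALSE)
slow_probs <- predict(slow_boost, newdata = test_data,
                      n.trees = c(100, 500, 1000), type = "response")
apply(slow_probs, 2, function(p) mean(ifelse(p > 0.5, "p", "e") == test_data$class))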

f. Model Comparison and Best Model Selection

# Compare all models
all_results <- data.frame(
  Model = c("CART", "Bagging", "Random Forest", "Boosting"),
  Accuracy = c(tree_accuracy, 
              max(bagging_results$Accuracy),
              max(rf_results$Accuracy),
              max(boost_results$Accuracy))
)
# Sort by accuracy
all_results <- all_results[order(-all_results$Accuracy), ]
print(all_results)
##           Model  Accuracy
## 3 Random Forest 1.0000000
## 4      Boosting 1.0000000
## 2       Bagging 0.9995076
## 1          CART 0.9945839
# Identify best model
best_model <- all_results$Model[1]
best_accuracy <- all_results$Accuracy[1]

cat("The best model is", best_model, "with accuracy", round(best_accuracy, 4), "\n")
## The best model is Random Forest with accuracy 1
  • Because a fixed seed (1234) was used, these results should be reproducible; minor variations can still arise from differences in R or package versions across machines, which the sketch below makes easy to check.
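
For completeness, the session details can be recorded so any version-related differences are easy to diagnose (a minimal sketch):

# Record R and package versions used for these results
sessionInfo()
sapply(c("rpart", "randomForest", "gbm"), function(p) as.character(packageVersion(p)))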

2.

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
data(Auto)
# Define function to calculate difference in medians
diff_median_mpg <- function(data, indices) {
  sample_data <- data[indices, ]
  median_4cyl <- median(sample_data$mpg[sample_data$cylinders == 4])
  median_8cyl <- median(sample_data$mpg[sample_data$cylinders == 8])
  return(median_4cyl - median_8cyl)
}
# Set seed for reproducibility
set.seed(1234)  # for reproducibility
boot_result <- boot(data = Auto, statistic = diff_median_mpg, R = 1000)
print(boot_result)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = diff_median_mpg, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original   bias    std. error
## t1*     14.4 -0.30745    0.797002
# Extract standard error estimate
boot_result$t0    # Original estimate
## [1] 14.4
sd(boot_result$t) # Bootstrap estimated standard error
## [1] 0.797002

Interpretation:

The median mpg of 4-cylinder cars is about 14.4 mpg higher than that of 8-cylinder cars, with a bootstrap-estimated standard error of about 0.797.
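
A natural extension (an optional sketch) is to turn this into a confidence interval with boot.ci() applied to the same boot_result object:

# 95% normal and percentile bootstrap confidence intervals for the difference in medians
boot.ci(boot_result, type = c("norm", "perc"))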

3.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
data(cpus)
# Create folds using createFolds() from caret package
set.seed(1234)  # For reproducibility
folds <- createFolds(cpus$perf, k = 10, list = TRUE, returnTrain = FALSE)

# Initialize vector to store MSE values
mse_values <- numeric(10)
# Perform 10-fold cross validation
for(i in 1:10) {
    # Split data into training and test sets
    test_indices <- folds[[i]]
    train_data <- cpus[-test_indices, ]
    test_data <- cpus[test_indices, ]
    
    # Fit linear regression model
    model <- lm(perf ~ mmin + mmax, data = train_data)
    
    # Make predictions on test set
    predictions <- predict(model, newdata = test_data)
    
    # Calculate MSE for this fold
    mse_values[i] <- mean((test_data$perf - predictions)^2)
}
# Calculate average test MSE and standard error
avg_mse <- mean(mse_values)
se_mse <- sd(mse_values)/sqrt(10)
cat("10-Fold Cross-Validation Results:\n")
## 10-Fold Cross-Validation Results:
cat("Average Test MSE:", avg_mse, "\n")
## Average Test MSE: 6315.865
cat("Standard Error:", se_mse)
## Standard Error: 1815.464
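
As a cross-check (a sketch using caret, which is already loaded; cv_fit is an illustrative name), the same model can be assessed with train() and built-in 10-fold cross-validation; the squared RMSE should be broadly comparable to the average test MSE above, although the fold assignments differ:

# Same linear model assessed with caret's 10-fold cross-validation
set.seed(1234)
cv_fit <- train(perf ~ mmin + mmax, data = cpus, method = "lm",
                trControl = trainControl(method = "cv", number = 10))
cv_fit$results$RMSE^2  # squared cross-validated RMSE, comparable to the average MSE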