Question 1:

(a) Split Data:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mushrooms <- read.csv("/Users/rupeshswarnakar/Downloads/mushrooms.csv", stringsAsFactors = TRUE)

set.seed(1234)

# Randomly sample half the rows for training:
n <- nrow(mushrooms)
train_idx <- sample(seq_len(n), size = floor(0.5 * n))

# Creating train and test sets:
train <- mushrooms[train_idx, ]
test  <- mushrooms[-train_idx, ]

# Check sizes:
cat("Training set size:", nrow(train), "\n",
    "Test set size:    ", nrow(test), "\n")
## Training set size: 4062 
##  Test set size:     4062
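
As an extra sanity check on the random split (a small optional sketch), the class proportions in the two halves can be compared; because the split is random they should be close to each other:

# Class balance in each half (optional check, sketch):
prop.table(table(train$class))
prop.table(table(test$class))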

(b) Fitting a tree using CART:

library(rpart)
library(rpart.plot)

# Growing a full tree (cp=0 → no early stopping):
cart.full <- rpart(
  class ~ ., 
  data   = train, 
  method = "class", 
  control= rpart.control(cp = 0)
)

# Inspecting the CP table and picking the cp with minimum xerror:
printcp(cart.full)  
## 
## Classification tree:
## rpart(formula = class ~ ., data = train, method = "class", control = rpart.control(cp = 0))
## 
## Variables actually used in tree construction:
## [1] odor                   spore.print.color      stalk.color.below.ring
## [4] stalk.root            
## 
## Root node error: 1953/4062 = 0.4808
## 
## n= 4062 
## 
##          CP nsplit rel error    xerror      xstd
## 1 0.9703021      0 1.0000000 1.0000000 0.0163049
## 2 0.0163850      1 0.0296979 0.0296979 0.0038716
## 3 0.0061444      2 0.0133129 0.0133129 0.0026025
## 4 0.0023041      3 0.0071685 0.0071685 0.0019125
## 5 0.0000000      5 0.0025602 0.0025602 0.0011442
opt_cp <- cart.full$cptable[ which.min(cart.full$cptable[,"xerror"]), "CP" ]

# Pruning the tree at that cp:
cart.pruned <- prune(cart.full, cp = opt_cp)

# Plotting:
plot(cart.pruned, margin=0.1)
text(cart.pruned, use.n=TRUE, all=TRUE, pretty=0, cex=0.7)

# Predicting on the test set:
pred <- predict(cart.pruned, test, type = "class")

# Confusion matrix & accuracy:
conf_mat <- table(Predicted = pred, Actual = test$class)
print(conf_mat)
##          Actual
## Predicted    e    p
##         e 2099    3
##         p    0 1960
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
cat("Test‑set accuracy:", round(accuracy, 4), "\n")
## Test‑set accuracy: 0.9993
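
Since rpart.plot is already loaded, the pruned tree can also be drawn with rpart.plot for a cleaner layout. This is an optional sketch; the extra = 104 setting (predicted class, class probabilities, and node percentages) is just one reasonable choice:

# Optional alternative rendering of the pruned tree (sketch):
rpart.plot(cart.pruned, type = 2, extra = 104, fallen.leaves = TRUE,
           main = "Pruned CART Tree")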

(c) Bootstrap Aggregation:

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Number of predictors:
p <- ncol(train) - 1  

# Trying different numbers of bagged trees:
set.seed(1234)  
bag100 <- randomForest(class ~ ., data = train, mtry = p, ntree = 100, importance = TRUE)
bag200 <- randomForest(class ~ ., data = train, mtry = p, ntree = 200, importance = TRUE)
bag500 <- randomForest(class ~ ., data = train, mtry = p, ntree = 500, importance = TRUE)

# Computing test‐set accuracies:
pred100 <- predict(bag100, test)
pred200 <- predict(bag200, test)
pred500 <- predict(bag500, test)

acc100 <- mean(pred100 == test$class)
acc200 <- mean(pred200 == test$class)
acc500 <- mean(pred500 == test$class)

cat("Bagging Accuracy (100 trees):", round(acc100, 4), "\n")
## Bagging Accuracy (100 trees): 0.9995
cat("Bagging Accuracy (200 trees):", round(acc200, 4), "\n")
## Bagging Accuracy (200 trees): 0.9995
cat("Bagging Accuracy (500 trees):", round(acc500, 4), "\n")
## Bagging Accuracy (500 trees): 0.9995
# Variable importance for the “best” bagged model:
bestBag <- if (acc200 >= acc500) bag200 else bag500

varImpPlot(bestBag, type = 1, main = "Variable Importance (Mean Decrease in Accuracy)")

Since accuracy already reaches 99.95% at 100 trees and does not improve further, 100 trees is sufficient. Beyond that, adding more trees simply increases training time without any measurable gain in predictive performance.
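
One way to corroborate this claim is to plot the out-of-bag error against the number of trees. The sketch below assumes the bag500 fit above is still in memory and uses the running OOB error stored in its err.rate matrix:

# OOB error as a function of the number of bagged trees (sketch):
plot(bag500$err.rate[, "OOB"], type = "l",
     xlab = "Number of trees", ylab = "OOB error rate",
     main = "Bagging: OOB Error vs. Number of Trees")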

The variable‑importance plot for the chosen bagged model confirms that odor is by far the most critical feature: randomly permuting it causes the largest drop in accuracy. The next most influential predictors are stalk.color.below.ring, spore.print.color, and stalk.surface.above.ring, highlighting the importance of stem and spore characteristics in cases where odor alone is ambiguous. Mid‑ranked variables such as habitat and gill.size contribute moderately, while attributes such as ring.number, ring.type, and veil.color have negligible impact and could potentially be omitted with minimal loss in model performance.
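
The numbers behind the plot can also be printed directly. The sketch below sorts the permutation importance (type = 1, mean decrease in accuracy) of the chosen bagged model:

# Sorted permutation importance for the chosen bagged model (sketch):
imp <- importance(bestBag, type = 1)
imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE]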

(d) Using Random Forest:

# Number of predictors:
p <- ncol(train) - 1  

# Candidate values:
ntrees <- c(100, 200, 500)
mtrys   <- c(floor(sqrt(p)), floor(p/3), floor(p/2))

# Grid search:
results <- expand.grid(ntree = ntrees, mtry = mtrys, Accuracy = NA_real_)
# Looping over the grid rows keeps each stored accuracy aligned with its (ntree, mtry) pair:
for (i in seq_len(nrow(results))) {
  B <- results$ntree[i]   # number of trees for this grid row
  m <- results$mtry[i]    # mtry for this grid row
  set.seed(1234)
  rf   <- randomForest(class ~ ., data = train,
                       ntree = B, mtry = m, importance = TRUE)
  pred <- predict(rf, test)
  results$Accuracy[i] <- mean(pred == test$class)
}

print(results)
##   ntree mtry Accuracy
## 1   100    4        1
## 2   200    4        1
## 3   500    4        1
## 4   100    7        1
## 5   200    7        1
## 6   500    7        1
## 7   100   11        1
## 8   200   11        1
## 9   500   11        1
# Picking best row:
best   <- results[which.max(results$Accuracy), ]
cat("Best combination → ntree =", best$ntree, 
    ", mtry =", best$mtry, 
    "with accuracy =", round(best$Accuracy,4), "\n")
## Best combination → ntree = 100 , mtry = 4 with accuracy = 1
# Refitting best model for importance plot:
set.seed(1234)
best_rf <- randomForest(class ~ ., data = train,
                        ntree = best$ntree, mtry = best$mtry,
                        importance = TRUE)

# Variable importance:
varImpPlot(best_rf, type = 1, 
           main = paste0("VarImp — ntree=",best$ntree,
                         ", mtry=",best$mtry))

The variable‐importance plot for the best model (ntree = 100, mtry = 4) underscores the dominant role of odor, which tops the chart by a wide margin: permuting its values causes the largest drop in predictive accuracy. Close behind are population, gill.color, and spore.print.color, suggesting that both ecological context and spore characteristics carry strong signals for edibility. Mid‑ranked features like stalk.root, habitat, and stalk.color.below.ring also contribute meaningfully, whereas attributes such as veil.type, gill.attachment, and cap.surface have negligible impact and could be deprioritized in a streamlined model. These findings mirror ecological intuition—olfactory cues are paramount, supplemented by visual and habitat clues—validating both the dataset’s structure and the model’s variable‐selection behavior.
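
As a training-data-only complement to the perfect test accuracy, the out-of-bag confusion matrix stored in the refit model can be inspected (a short sketch):

# OOB confusion matrix of the best random forest (sketch):
best_rf$confusion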

(e) Using boosting:

library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Defining grid of shrinkage (λ) and number of trees (B):
lambdas <- c(0.001, 0.01, 0.1)
ntrees  <- c(100, 1000, 5000)

# Preparing results storage:
results <- expand.grid(
  shrinkage = lambdas,
  n.trees   = ntrees,
  Accuracy  = NA_real_
)

# Recoding class → numeric 0/1:
train$class01 <- ifelse(train$class == "p", 1, 0)
test$class01  <- ifelse(test$class  == "p", 1, 0)

# Grid search:
set.seed(1234)
for (i in seq_len(nrow(results))) {
  lam <- results$shrinkage[i]
  B   <- results$n.trees[i]
  
  fit <- gbm(
    formula          = class01 ~ . - class,   # use numeric response, drop factor
    distribution     = "bernoulli",
    data             = train,
    n.trees          = B,
    interaction.depth= 1,      # stumps
    shrinkage        = lam,
    verbose          = FALSE
  )
  
  # Predicting probabilities on test set:
  prob <- predict(fit, newdata = test, n.trees = B, type = "response")
  pred <- ifelse(prob > 0.5, 1, 0)
  
  # Computing accuracy against numeric class01:
  results$Accuracy[i] <- mean(pred == test$class01)
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
print(results)
##   shrinkage n.trees  Accuracy
## 1     0.001     100 0.9847366
## 2     0.010     100 0.9847366
## 3     0.100     100 0.9975382
## 4     0.001    1000 0.9847366
## 5     0.010    1000 0.9975382
## 6     0.100    1000 1.0000000
## 7     0.001    5000 0.9945839
## 8     0.010    5000 1.0000000
## 9     0.100    5000 1.0000000
# Identifying best combination:
best_row <- results[which.max(results$Accuracy), ]
cat("Best → shrinkage =", best_row$shrinkage,
    ", n.trees =",   best_row$n.trees,
    "with accuracy =", round(best_row$Accuracy,4), "\n")
## Best → shrinkage = 0.1 , n.trees = 1000 with accuracy = 1
# Refitting best model & plot variable importance:
set.seed(1234)
best_gbm <- gbm(
  formula          = class01 ~ . - class,
  distribution     = "bernoulli",
  data             = train,
  n.trees          = best_row$n.trees,
  interaction.depth= 1,
  shrinkage        = best_row$shrinkage,
  verbose          = FALSE
)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Variable importance:
summary(
  best_gbm, 
  n.trees = best_row$n.trees,
  main    = paste0("Variable Importance — λ=", best_row$shrinkage,
                   ", B=", best_row$n.trees)
)

##                                               var      rel.inf
## odor                                         odor 9.483905e+01
## spore.print.color               spore.print.color 4.050948e+00
## stalk.color.below.ring     stalk.color.below.ring 6.154127e-01
## stalk.surface.below.ring stalk.surface.below.ring 2.081941e-01
## gill.size                               gill.size 1.517695e-01
## population                             population 4.860517e-02
## stalk.surface.above.ring stalk.surface.above.ring 3.023489e-02
## cap.color                               cap.color 2.558775e-02
## ring.number                           ring.number 1.017192e-02
## cap.shape                               cap.shape 6.973640e-03
## habitat                                   habitat 5.786727e-03
## ring.type                               ring.type 4.087464e-03
## gill.color                             gill.color 2.190928e-03
## stalk.root                             stalk.root 7.457573e-04
## stalk.color.above.ring     stalk.color.above.ring 2.387165e-04
## veil.color                             veil.color 1.178572e-06
## cap.surface                           cap.surface 0.000000e+00
## bruises                                   bruises 0.000000e+00
## gill.attachment                   gill.attachment 0.000000e+00
## gill.spacing                         gill.spacing 0.000000e+00
## stalk.shape                           stalk.shape 0.000000e+00
## veil.type                               veil.type 0.000000e+00

The boosting results reveal a clear inverse trade‑off between the shrinkage (λ) and the number of trees (B). With the largest learning rate, λ = 0.1, accuracy climbs very quickly: 100 trees already give 99.75% accuracy, and 1,000 trees achieve a perfect 100%. By contrast, smaller λ values (0.01 and especially 0.001) require many more trees to reach the same performance: λ = 0.01 reaches 100% only at 5,000 trees (99.75% at 1,000), while λ = 0.001 never reaches perfection even at 5,000 trees (99.46%). This pattern is typical of gradient boosting: a smaller step size (λ) makes learning more stable but slower, so it must be compensated for with more trees.
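
The trade‑off can be made visible for a single shrinkage value. The sketch below refits the λ = 0.001 model once with 5,000 stumps and scores the test set at a grid of tree counts (predict() for gbm accepts a vector of n.trees values); the tree grid and plot settings are illustrative choices, not part of the analysis above:

# Test accuracy of the shrinkage = 0.001 model as trees are added (sketch):
set.seed(1234)
slow_gbm <- gbm(class01 ~ . - class, distribution = "bernoulli", data = train,
                n.trees = 5000, interaction.depth = 1, shrinkage = 0.001,
                verbose = FALSE)
tree_grid <- seq(100, 5000, by = 100)
prob_mat  <- predict(slow_gbm, newdata = test, n.trees = tree_grid, type = "response")
acc_curve <- apply(prob_mat > 0.5, 2, function(pred) mean(pred == test$class01))
plot(tree_grid, acc_curve, type = "l",
     xlab = "Number of trees", ylab = "Test accuracy",
     main = "Boosting with shrinkage = 0.001")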

From the nine (λ, B) combinations evaluated, λ = 0.1 with B = 1,000 gives a perfect test‑set accuracy of 1.0000 using the fewest trees. Although λ = 0.01 with 5,000 trees and λ = 0.1 with 5,000 trees also achieve 100%, they require far more trees and are therefore less efficient. Thus λ = 0.1 with 1,000 trees is the best practical choice.

The variable‑importance summary for this final boosted model confirms that odor is overwhelmingly the dominant predictor, accounting for roughly 94.8% of the total relative influence. The next most influential feature is spore.print.color (about 4.1% relative influence), followed by stalk.color.below.ring (0.6%) and stalk.surface.below.ring (0.2%). All other attributes, including cap.shape, ring.number, and veil.type, contribute virtually zero influence. This strongly reinforces that odor is the single most critical cue for distinguishing poisonous from edible mushrooms, with spore and stem characteristics providing only secondary refinements.

(f) Best Algorithm:

Across all methods, random forest is the best learning algorithm. While both the random forest (ntree = 100, mtry = 4) and the gradient‑boosted model (λ = 0.1, B = 1,000) achieved perfect 100% accuracy on the held‑out test set, the random forest reached perfection with far fewer trees and without the need to tune a learning rate. By contrast, bagging peaked at 99.95% and the single CART at 99.93%. Hence, random forest (100 trees, mtry = 4) is the simplest model that achieves perfect accuracy.
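
For reference, the headline test‑set accuracies can be collected in one small table. This sketch assumes the objects accuracy (CART), acc100/acc200/acc500 (bagging), best (random forest grid), and best_row (boosting grid) from the earlier chunks are still in the workspace:

# Summary of test-set accuracies across methods (sketch):
data.frame(
  Method   = c("Pruned CART", "Bagging (100 trees)", "Bagging (200 trees)",
               "Bagging (500 trees)", "Random forest (best)", "Boosting (best)"),
  Accuracy = c(accuracy, acc100, acc200, acc500, best$Accuracy, best_row$Accuracy)
)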

Question 2:

library(ISLR)   
library(boot)   

data(Auto)
df <- subset(Auto, cylinders %in% c(4, 8))

# Define bootstrap statistic: 
med_diff <- function(data, i) {
  d  <- data[i, ]
  m4 <- median(d$mpg[d$cylinders == 4])
  m8 <- median(d$mpg[d$cylinders == 8])
  return(m4 - m8)
}

# Running bootstrap (R = 1000 resamples):
set.seed(1234)
boot.out <- boot(data = df, statistic = med_diff, R = 1000)

# Extracting bootstrap standard error:
boot_se <- sd(boot.out$t)
boot_se
## [1] 0.7817146

So the bootstrap estimate of the standard error of the difference in median mpg between 4‑cylinder and 8‑cylinder cars is approximately 0.78 mpg.
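
For additional context, the observed difference in medians and a 95% percentile bootstrap interval can be extracted from the same boot object (a short sketch using boot.ci):

# Observed median difference and 95% percentile bootstrap CI (sketch):
boot.out$t0
boot.ci(boot.out, type = "perc")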

Question 3:

library(MASS)    
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)   
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
# Loading the cpus data and inspecting names:
data(cpus, package="MASS")
names(cpus)
## [1] "name"    "syct"    "mmin"    "mmax"    "cach"    "chmin"   "chmax"  
## [8] "perf"    "estperf"
# CV function using the correct names:
cv_lm_perf <- function(data, formula, K = 10, seed = 1234) {
  resp <- all.vars(formula)[1]
  if (!(resp %in% names(data))) {
    stop("Response ‘", resp, "’ not found; available: ", paste(names(data), collapse=", "))
  }
  
  set.seed(seed)
  folds <- createFolds(y = data[[resp]], k = K, list = TRUE)
  if (length(folds) != K) {
    stop("Expected ", K, " folds but got ", length(folds))
  }
  
  rmse_vals <- numeric(K)
  for (j in seq_len(K)) {
    test_idx   <- folds[[j]]
    train_set  <- data[-test_idx, ]
    test_set   <- data[ test_idx, ]
    fit        <- lm(formula, data = train_set)
    preds      <- predict(fit, newdata = test_set)
    rmse_vals[j] <- sqrt(mean((preds - test_set[[resp]])^2))
  }
  
  names(rmse_vals) <- paste0("Fold", seq_len(K))
  list(
    fold_RMSE = rmse_vals,
    mean_RMSE = mean(rmse_vals),
    sd_RMSE   = sd(rmse_vals)
  )
}

# Running 10‑fold CV on perf ~ mmin + mmax:
results <- cv_lm_perf(
  data    = cpus,
  formula = perf ~ mmin + mmax,
  K       = 10,
  seed    = 1234
)

# Showing the per‑fold and average RMSE:
print(results$fold_RMSE)
##     Fold1     Fold2     Fold3     Fold4     Fold5     Fold6     Fold7     Fold8 
##  54.06447  31.71710 143.64572  58.46880  88.62445  71.28984  95.93295  52.40910 
##     Fold9    Fold10 
##  89.20401  48.30400
cat("Average 10‑fold RMSE:", round(results$mean_RMSE, 2),
    " (±", round(results$sd_RMSE, 2), ")\n")
## Average 10‑fold RMSE: 73.37  (± 32.2 )

The 10‑fold cross‑validation on the cpus data produced substantial variability in prediction error across folds. Fold RMSEs ranged from lows of about 31.7 (Fold 2) and 48.3 (Fold 10) up to a high of about 143.6 (Fold 3), with most folds falling between roughly 50 and 100. This spread yields an average RMSE of 73.37 with a standard deviation of 32.2.

An average RMSE of 73.4 means that, on average, our simple linear model (using only minimum and maximum main memory) is off by around 73 “perf” units on unseen data. The very large error in Fold 3 suggests that this particular fold contained extreme or atypical observations, perhaps performance outliers that the two‑predictor model cannot capture. The high standard deviation (nearly half the mean) indicates that the model’s accuracy is quite sensitive to which observations end up in the test set.
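
A quick way to probe the outlier explanation (a sketch) is to look at the distribution of the response; if perf has a long right tail, a few very fast machines will dominate the squared error of whichever fold they land in:

# Distribution of the response (sketch):
summary(cpus$perf)
hist(cpus$perf, breaks = 30, main = "Distribution of perf", xlab = "perf")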