Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
You must submit your compiled RMarkdown file in PDF or HTML format.
If you cannot compile the RMarkdown file, please email me for help before Thursday, May 8!
mushroom <- read.csv("C:/Users/13177/OneDrive/Statistical Learning/mushrooms.csv", stringsAsFactors = TRUE)
head(mushroom)
set.seed(403)
sample_index <- sample(1:nrow(mushroom), nrow(mushroom)/2)
train_data <- mushroom[sample_index, ]
test_data <- mushroom[-sample_index, ]
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
cart_model <- rpart(class ~ ., data = train_data, method = "class", cp = 0.001)
printcp(cart_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class",
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] odor spore.print.color stalk.color.below.ring
## [4] stalk.root
##
## Root node error: 1955/4062 = 0.48129
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.9672634 0 1.0000000 1.0000000 0.01628879
## 2 0.0209719 1 0.0327366 0.0327366 0.00405971
## 3 0.0051151 2 0.0117647 0.0117647 0.00244616
## 4 0.0025575 3 0.0066496 0.0066496 0.00184132
## 5 0.0010000 5 0.0015345 0.0015345 0.00088563
bestcp <- cart_model$cptable[which.min(cart_model$cptable[,"xerror"]), "CP"]
pruned_tree <- prune(cart_model, cp = bestcp)
rpart.plot(pruned_tree, extra = 106)
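Picking the cp that minimizes xerror is the approach used above; a common, slightly more conservative alternative is the one-standard-error rule, which keeps the simplest tree whose cross-validated error is within one standard error of the minimum. A sketch against the same cptable (cp_1se and pruned_1se are illustrative names, not from the original analysis):

cp_tab <- cart_model$cptable
# Threshold: minimum xerror plus the standard error at that minimum
thresh <- min(cp_tab[, "xerror"]) + cp_tab[which.min(cp_tab[, "xerror"]), "xstd"]
# Largest CP (hence simplest tree) whose xerror is still under the threshold
cp_1se <- max(cp_tab[cp_tab[, "xerror"] <= thresh, "CP"])
pruned_1se <- prune(cart_model, cp = cp_1se)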
cart_pred <- predict(pruned_tree, newdata = test_data, type = "class")
confusionMatrix(cart_pred, test_data$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2101 5
## p 0 1956
##
## Accuracy : 0.9988
## 95% CI : (0.9971, 0.9996)
## No Information Rate : 0.5172
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9975
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 1.0000
## Specificity : 0.9975
## Pos Pred Value : 0.9976
## Neg Pred Value : 1.0000
## Prevalence : 0.5172
## Detection Rate : 0.5172
## Detection Prevalence : 0.5185
## Balanced Accuracy : 0.9987
##
## 'Positive' Class : e
##
Interpretation of the Tree: The most important variable in the tree was odor, which is consistent with the mushroom dataset documentation; certain odors are highly indicative of edibility. The decision rules in the tree let us classify mushrooms as edible or poisonous quickly and accurately, and the pruned model achieved near-perfect performance on the test set (99.88% accuracy).
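Those decision rules can also be printed directly rather than read off the plot; a minimal sketch using rpart.rules() from the rpart.plot package loaded above (the cover option is an illustrative choice that adds each leaf's share of the training data):

# One row per leaf: predicted class, the conditions on the path to it,
# and (with cover = TRUE) the percentage of training cases in that leaf
rpart.rules(pruned_tree, cover = TRUE)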
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(403)
# Setting mtry to the number of predictors makes randomForest perform bagging
bag_model <- randomForest(class ~ ., data = train_data, mtry = ncol(train_data) - 1, ntree = 200)
bag_pred <- predict(bag_model, newdata = test_data)
confusionMatrix(bag_pred, test_data$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2101 0
## p 0 1961
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.5172
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5172
## Detection Rate : 0.5172
## Detection Prevalence : 0.5172
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
varImpPlot(bag_model, main = "Bagging - Variable Importance")
b_values <- c(50, 100, 200, 500)
acc_results <- data.frame(B = b_values, Accuracy = NA)
set.seed(403)
for (i in seq_along(b_values)) {
  bag_model <- randomForest(class ~ ., data = train_data,
                            mtry = ncol(train_data) - 1,
                            ntree = b_values[i])
  pred <- predict(bag_model, newdata = test_data)
  cm <- confusionMatrix(pred, test_data$class)
  acc_results$Accuracy[i] <- cm$overall["Accuracy"]
}
print(acc_results)
## B Accuracy
## 1 50 0.9992614
## 2 100 1.0000000
## 3 200 1.0000000
## 4 500 1.0000000
I tested values of \(B = 50, 100, 200, 500\) and observed that the test accuracy remained consistently high (~99-100%) for all values. This indicates that even a modest number of trees is sufficient for this dataset, likely due to the presence of strong predictor variables such as odor. Increasing \(B\) did not improve accuracy, but may still improve stability in variable importance and model robustness.
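A complementary way to see this convergence, without rerunning the test-set grid, is the out-of-bag error curve: a randomForest fit stores the running OOB error after each tree, so a single fit at the largest B shows how quickly the ensemble stabilizes. A sketch under the same settings (bag_oob is an illustrative name):

set.seed(403)
bag_oob <- randomForest(class ~ ., data = train_data,
                        mtry = ncol(train_data) - 1, ntree = 500)
# err.rate has one row per tree; the "OOB" column is the cumulative OOB error
plot(bag_oob$err.rate[, "OOB"], type = "l",
     xlab = "Number of trees", ylab = "OOB error rate",
     main = "Bagging: OOB error vs. number of trees")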
set.seed(403)
rf_model <- randomForest(class ~ ., data = train_data, mtry = 3, ntree = 200)
rf_pred <- predict(rf_model, newdata = test_data)
confusionMatrix(rf_pred, test_data$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2101 0
## p 0 1961
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.5172
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5172
## Detection Rate : 0.5172
## Detection Prevalence : 0.5172
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
varImpPlot(rf_model, main = "Random Forest - Variable Importance")
b_vals <- c(50, 100, 200)
mtry_vals <- c(2, 4, 6)
grid_results <- expand.grid(B = b_vals, mtry = mtry_vals)
grid_results$Accuracy <- NA
set.seed(403)
for (i in 1:nrow(grid_results)) {
  model <- randomForest(class ~ ., data = train_data,
                        ntree = grid_results$B[i],
                        mtry = grid_results$mtry[i])
  pred <- predict(model, newdata = test_data)
  cm <- confusionMatrix(pred, test_data$class)
  grid_results$Accuracy[i] <- cm$overall["Accuracy"]
}
print(grid_results)
## B mtry Accuracy
## 1 50 2 1
## 2 100 2 1
## 3 200 2 1
## 4 50 4 1
## 5 100 4 1
## 6 200 4 1
## 7 50 6 1
## 8 100 6 1
## 9 200 6 1
I evaluated a grid of values:
B (ntree): 50, 100, 200
mtry: 2, 4, 6
Each model was trained on the training data and evaluated on the test data using accuracy as the performance metric.
Interpretation: Every model tested, regardless of the number of trees or the number of variables tried at each split, achieved perfect classification accuracy (100%) on the test set.
This result indicates that:
The mushroom dataset is extremely well separated by its features.
Even with a smaller number of trees (B = 50), the model generalizes perfectly.
Adjusting mtry did not affect the model's ability to make accurate predictions, whether fewer or more variables were randomly chosen at each split (see the tuneRF sketch below).
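As a cross-check on the manual grid, the randomForest package also provides tuneRF(), which searches over mtry by out-of-bag error instead of test-set accuracy. A sketch (ntreeTry, stepFactor, and improve are illustrative settings, not values from the original analysis):

set.seed(403)
# tuneRF takes the predictors as a data frame and the response separately
x_train <- train_data[, setdiff(names(train_data), "class")]
tune_out <- tuneRF(x_train, train_data$class,
                   ntreeTry = 100, stepFactor = 2, improve = 0.01,
                   trace = FALSE, plot = FALSE)
print(tune_out)  # OOB error at each mtry tried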
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
train_data$y_bin <- ifelse(train_data$class == "e", 1, 0)
test_data$y_bin <- ifelse(test_data$class == "e", 1, 0)
boost_model <- gbm(y_bin ~ . - class, data = train_data, distribution = "bernoulli",
                   n.trees = 200, interaction.depth = 1, shrinkage = 0.01)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
boost_pred <- predict(boost_model, newdata = test_data, n.trees = 200, type = "response")
boost_class <- ifelse(boost_pred > 0.5, "e", "p")
confusionMatrix(as.factor(boost_class), test_data$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2101 56
## p 0 1905
##
## Accuracy : 0.9862
## 95% CI : (0.9821, 0.9896)
## No Information Rate : 0.5172
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9724
##
## Mcnemar's Test P-Value : 1.987e-13
##
## Sensitivity : 1.0000
## Specificity : 0.9714
## Pos Pred Value : 0.9740
## Neg Pred Value : 1.0000
## Prevalence : 0.5172
## Detection Rate : 0.5172
## Detection Prevalence : 0.5310
## Balanced Accuracy : 0.9857
##
## 'Positive' Class : e
##
summary(boost_model)
I fit a boosting model using:
λ (shrinkage) = 0.01
B (number of trees) = 200
Interaction depth = 1 (i.e., stumps)
Performance on the test set:
Accuracy: 98.62%
Kappa: 0.9724
Sensitivity: 100% (perfect classification of edible mushrooms)
Specificity: 97.14% (very high classification of poisonous mushrooms)
This indicates excellent generalization, with near-perfect classification results on the test set even using just 200 boosting iterations (trees).
Relationship Between λ and B: Smaller λ values (e.g., 0.01) typically require a larger B to converge fully. However, the model already achieved very strong performance with just B = 200, suggesting that either the problem is easily separable (as shown by odor being dominant) or the chosen λ-B pair is close to optimal for this dataset.
Conclusion: There is still a trade-off; a smaller λ often needs a larger B, but even at B = 200 the model performs exceptionally well.
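This trade-off can be made concrete with gbm's built-in cross-validation: fitting with cv.folds and then calling gbm.perf() returns the iteration count B that minimizes CV error for a given λ. A sketch (cv.folds = 5 is an illustrative choice, not from the original analysis):

set.seed(403)
cv_boost <- gbm(y_bin ~ . - class, data = train_data,
                distribution = "bernoulli", n.trees = 200,
                interaction.depth = 1, shrinkage = 0.01, cv.folds = 5)
# Returns (and marks on a plot) the CV-optimal number of trees for this shrinkage
best_B <- gbm.perf(cv_boost, method = "cv")
print(best_B)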
Variable Importance: The model found that:
odor was the overwhelmingly dominant feature (~99.2% relative influence);
spore.print.color contributed very little;
all other predictors had zero importance, indicating the model did not rely on them.
library(gbm)
train_data$label <- ifelse(train_data$class == "e", 0, 1)
test_data$label <- ifelse(test_data$class == "e", 0, 1)
lambda_vals <- c(0.001, 0.01, 0.1, 0.3)
results_boost <- data.frame(lambda = lambda_vals, TestAccuracy = NA)
set.seed(403)
for (i in seq_along(lambda_vals)) {
  boost_model <- gbm(
    # Exclude both class and the earlier y_bin helper column; leaving y_bin
    # among the predictors would leak the response into the model
    label ~ . - class - y_bin,
    data = train_data,
    distribution = "bernoulli",
    n.trees = 5000,
    interaction.depth = 1,
    shrinkage = lambda_vals[i],
    verbose = FALSE
  )
  pred_prob <- predict(boost_model, newdata = test_data, n.trees = 5000, type = "response")
  pred_class <- ifelse(pred_prob > 0.5, 1, 0)
  acc <- mean(pred_class == test_data$label)
  results_boost$TestAccuracy[i] <- acc
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
print(results_boost)
## lambda TestAccuracy
## 1 0.001 1
## 2 0.010 1
## 3 0.100 1
## 4 0.300 1
What This Means: All tested \(\lambda\) values yielded perfect classification accuracy on the test set, suggesting that boosting is very robust on this dataset for classifying mushrooms as edible vs. poisonous. Even the lower learning rates (the more conservative models) reached 100% accuracy, because the data carry a strong signal and 5000 trees gives ample iterations to converge.
Takeaways: Smaller \(\lambda\) values (like 0.001 or 0.01) require more trees to converge but often generalize better. Larger \(\lambda\) values (like 0.3) learn faster but risk overfitting if not tuned properly; here, all values performed equally well. Given the consistent accuracy, a smaller \(\lambda\) (e.g., 0.01) with a larger number of trees is a conservative and stable choice.
Any of the tested \(\lambda\) values performed perfectly, but for generalization and stability, \(\lambda = 0.01\) with n.trees = 200 is a reliable combination that balances learning speed and robustness (98.62% accuracy in the single-model fit above).
Summary of test accuracies:
CART (Pruned Tree): 99.88%
Bagging: 99.93% at B = 50; 100% at B = 100, 200, and 500
Random Forest: 100%
Boosting: 98.62%
Based on the results, the Random Forest model clearly performed best, achieving 100% accuracy, followed closely by Bagging (which also reached 100% accuracy at 100+ trees). The CART (Pruned Tree) model performed slightly below that at 99.88%, while Boosting was less accurate in this case at 98.62%. Therefore, the Random Forest model is the most effective and reliable choice for predicting mushroom edibility. While other students may achieve slightly different results depending on their train/test splits (influenced by their use of set.seed()), the Random Forest is likely to be a top performer for most, given the dataset’s structure and strong signal.
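For reference, the reported test accuracies can be gathered into one small table (values transcribed from the confusion matrices above; model_comparison is an illustrative name):

model_comparison <- data.frame(
  Model    = c("CART (pruned)", "Bagging (B = 50)", "Bagging (B >= 100)",
               "Random Forest", "Boosting"),
  Accuracy = c(0.9988, 0.9993, 1.0000, 1.0000, 0.9862)
)
print(model_comparison)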
library(ISLR2)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
Using the Auto data from the ISLR2 package, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.
# Statistic for boot(): the difference in median mpg between 4- and
# 8-cylinder cars, computed on the resampled rows selected by 'index'
boot_fn <- function(data, index) {
  d <- data[index, ]
  median_4 <- median(d$mpg[d$cylinders == 4])
  median_8 <- median(d$mpg[d$cylinders == 8])
  return(median_4 - median_8)
}
Auto_48 <- subset(Auto, cylinders %in% c(4, 8))
set.seed(123) # Set seed for reproducibility
boot_out <- boot(data = Auto_48, statistic = boot_fn, R = 1000)
print(boot_out)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto_48, statistic = boot_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.32275 0.7922011
In this problem, we used the bootstrap method to estimate the standard error of the difference in median miles per gallon (mpg) between 4-cylinder and 8-cylinder cars from the Auto dataset. The original estimated difference in median mpg was 14.4, meaning 4-cylinder cars have a median mpg about 14.4 units higher than 8-cylinder cars. The bootstrap estimate of the standard error was approximately 0.79, indicating relatively low variability in this estimate. The bootstrap bias was small (-0.32), suggesting that the resampling procedure provides a reliable estimate of the difference.
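The same bootstrap replicates also yield an interval estimate; assuming the boot_out object above, a percentile confidence interval takes one call to boot.ci() from the boot package:

# 95% percentile interval for the difference in median mpg (4-cyl minus 8-cyl)
boot.ci(boot_out, type = "perc")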
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(caret)
data(cpus)
cv_linear_regression <- function(data, predictors, response, k = 10) {
  set.seed(1)
  folds <- createFolds(data[[response]], k = k, list = TRUE, returnTrain = FALSE)
  mse_values <- numeric(k)
  for (i in 1:k) {
    test_indices <- folds[[i]]
    train_data <- data[-test_indices, ]
    test_data <- data[test_indices, ]
    model <- lm(as.formula(paste(response, "~", paste(predictors, collapse = " + "))),
                data = train_data)
    predictions <- predict(model, newdata = test_data)
    mse_values[i] <- mean((test_data[[response]] - predictions)^2)
  }
  mean(mse_values)
}
predictors <- c("mmin", "mmax")
response <- "perf"
average_test_mse <- cv_linear_regression(cpus, predictors, response, k = 10)
print(paste("Average Test MSE from 10-fold Cross-Validation:", round(average_test_mse, 2)))
## [1] "Average Test MSE from 10-fold Cross-Validation: 6376.57"
mean_model_mse <- mean((cpus$perf - mean(cpus$perf))^2)
print(mean_model_mse)
## [1] 25742.71
In this problem, we performed 10-fold cross-validation to estimate the test error for a linear regression model predicting CPU performance (perf) using minimum and maximum main memory (mmin and mmax) as predictors from the cpus dataset. Using a custom cross-validation function and the createFolds() function from the caret package, the average test mean squared error (MSE) across the folds was found to be 6376.57. Compared to the mean model MSE of 25742.71 (predicting the mean performance for all CPUs), this indicates that the linear regression model provides a much better fit than simply using the mean.
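As a sanity check on the custom function, the boot package loaded earlier provides cv.glm(), which computes the same K-fold estimate for a model fit with glm(); because fold assignment is random, the estimate will differ slightly from the 6376.57 above:

set.seed(1)
# A glm with the default gaussian family is ordinary linear regression
glm_fit <- glm(perf ~ mmin + mmax, data = cpus)
# delta[1] is the raw K-fold cross-validation estimate of test MSE
cv.glm(cpus, glm_fit, K = 10)$delta[1]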