Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
Must submit compiled RMarkdown file in PDF/HTML format.
If you cannot compile the RMarkdown file, please send an email to me for help before Thursday, May 8!
mushrooms <- read.csv("C:\\Users\\gajaw\\Downloads\\mushrooms.csv", stringsAsFactors = TRUE)
set.seed(1234)
train_indices <- sample(1:nrow(mushrooms), size = 0.5 * nrow(mushrooms))
train_data <- mushrooms[train_indices, ]
test_data <- mushrooms[-train_indices, ]
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
cart_model <- rpart(class ~ ., data = train_data, method = "class", cp = 0.01, xval = 10)
printcp(cart_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class",
## cp = 0.01, xval = 10)
##
## Variables actually used in tree construction:
## [1] odor spore.print.color
##
## Root node error: 1953/4062 = 0.4808
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.970302 0 1.000000 1.000000 0.0163049
## 2 0.016385 1 0.029698 0.029698 0.0038716
## 3 0.010000 2 0.013313 0.013313 0.0026025
plotcp(cart_model)
best_cp <- cart_model$cptable[which.min(cart_model$cptable[,"xerror"]), "CP"]
pruned_cart <- prune(cart_model, cp = best_cp)
rpart.plot(pruned_cart, extra = 104, fallen.leaves = TRUE)
pred_cart <- predict(pruned_cart, test_data, type = "class")
confusion_matrix <- table(Predicted = pred_cart, Actual = test_data$class)
print(confusion_matrix)
## Actual
## Predicted e p
## e 2099 22
## p 0 1941
accuracy_cart <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy of CART:", accuracy_cart)
## Accuracy of CART: 0.9945839
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1234)
bag_model_200 <- randomForest(class ~ ., data = train_data, mtry = ncol(train_data) - 1, ntree = 200, importance = TRUE)
bag_pred <- predict(bag_model_200, test_data)
conf_matrix_bag <- table(Predicted = bag_pred, Actual = test_data$class)
accuracy_bag <- sum(diag(conf_matrix_bag)) / sum(conf_matrix_bag)
cat("Bagging Accuracy (B = 200):", accuracy_bag, "\n")
## Bagging Accuracy (B = 200): 0.9995076
varImpPlot(bag_model_200, main = "Bagging Variable Importance (B = 200)")
for (B in c(50, 100, 200, 500)) {
set.seed(1234)
model <- randomForest(class ~ ., data = train_data, mtry = ncol(train_data) - 1, ntree = B)
pred <- predict(model, test_data)
acc <- mean(pred == test_data$class)
cat("Accuracy for B =", B, "is", round(acc, 4), "\n")
}
## Accuracy for B = 50 is 0.9995
## Accuracy for B = 100 is 0.9995
## Accuracy for B = 200 is 0.9995
## Accuracy for B = 500 is 0.9995
odor
, gill-color
, etc.).library(randomForest)
# Set up parameter combinations
mtry_values <- c(2, 4, 6, 8)
ntree_values <- c(100, 200, 500)
# Track best model
best_accuracy <- 0
best_model <- NULL
# Loop through combinations of mtry and ntree
for (m in mtry_values) {
for (b in ntree_values) {
set.seed(1234)
rf_model <- randomForest(class ~ ., data = train_data, mtry = m, ntree = b, importance = TRUE)
preds <- predict(rf_model, test_data)
acc <- mean(preds == test_data$class)
cat("mtry =", m, ", ntree =", b, ", Accuracy =", round(acc, 4), "\n")
if (acc > best_accuracy) {
best_accuracy <- acc
best_model <- rf_model
}
}
}
## mtry = 2 , ntree = 100 , Accuracy = 1
## mtry = 2 , ntree = 200 , Accuracy = 1
## mtry = 2 , ntree = 500 , Accuracy = 1
## mtry = 4 , ntree = 100 , Accuracy = 1
## mtry = 4 , ntree = 200 , Accuracy = 1
## mtry = 4 , ntree = 500 , Accuracy = 1
## mtry = 6 , ntree = 100 , Accuracy = 1
## mtry = 6 , ntree = 200 , Accuracy = 1
## mtry = 6 , ntree = 500 , Accuracy = 1
## mtry = 8 , ntree = 100 , Accuracy = 1
## mtry = 8 , ntree = 200 , Accuracy = 1
## mtry = 8 , ntree = 500 , Accuracy = 1
# Plot the importance from the best model
varImpPlot(best_model, main = "Variable Importance from Best Random Forest")
Since all combinations yielded perfect accuracy, the model is robust
to tuning changes. For computational efficiency, a random forest with
mtry = 4 and ntree = 100 can be considered optimal. The variable
importance plot confirms that features like odor
,
gill-size
, and spore-print-color
are the most
influential.
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
train_data$label <- ifelse(train_data$class == "p", 1, 0)
test_data$label <- ifelse(test_data$class == "p", 1, 0)
train_data_boost <- subset(train_data, select = -class)
test_data_boost <- subset(test_data, select = -class)
lambda_vals <- c(0.001, 0.01, 0.1)
tree_vals <- c(100, 200, 500)
best_accuracy <- 0
best_model <- NULL
for (lambda in lambda_vals) {
for (trees in tree_vals) {
set.seed(1234)
boost_model <- gbm(label ~ ., data = train_data_boost, distribution = "bernoulli",
n.trees = trees, shrinkage = lambda, interaction.depth = 1, verbose = FALSE)
# Assuming boost_model, pred_prob, and pred_class already created
pred_prob <- predict(boost_model, newdata = test_data_boost, n.trees = B, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
accuracy_boost <- mean(pred_class == test_data$label)
acc <- mean(pred_class == test_data$label)
cat("lambda =", lambda, ", trees =", trees, ", Accuracy =", round(acc, 4), "\n")
if (acc > best_accuracy) {
best_accuracy <- acc
best_model <- boost_model
}
}
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 100.
## lambda = 0.001 , trees = 100 , Accuracy = 0.9847
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 200.
## lambda = 0.001 , trees = 200 , Accuracy = 0.9847
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## lambda = 0.001 , trees = 500 , Accuracy = 0.9847
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 100.
## lambda = 0.01 , trees = 100 , Accuracy = 0.9847
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 200.
## lambda = 0.01 , trees = 200 , Accuracy = 0.9847
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## lambda = 0.01 , trees = 500 , Accuracy = 0.9946
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 100.
## lambda = 0.1 , trees = 100 , Accuracy = 0.9975
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in predict.gbm(boost_model, newdata = test_data_boost, n.trees = B, :
## Number of trees not specified or exceeded number fit so far. Using 200.
## lambda = 0.1 , trees = 200 , Accuracy = 0.9993
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## lambda = 0.1 , trees = 500 , Accuracy = 1
summary(best_model, main = "Variable Importance - Best Boosting Model")
The best boosting model achieved perfect or near-perfect accuracy
with a learning rate (λ
) of 0.1 and number of trees
(B
) of 100 or more. Boosting, like Random Forests, performs
exceptionally well on this dataset. A lower λ (e.g., 0.001) required
many more trees to achieve similar results, highlighting the trade-off
between learning rate and model complexity.
cat("CART Accuracy:", accuracy_cart, "\n")
## CART Accuracy: 0.9945839
cat("Bagging Accuracy:", accuracy_bag, "\n")
## Bagging Accuracy: 0.9995076
cat("Random Forest Accuracy:", best_accuracy, "\n")
## Random Forest Accuracy: 1
cat("Boosting Accuracy:", accuracy_boost, "\n")
## Boosting Accuracy: 1
Random Forest and Boosting both achieved perfect accuracy (1.0) on the test set.
However, Random Forest required less tuning effort and performed
consistently well across all combinations of mtry
and
ntree
.
Therefore, Random Forest is selected as the best statistical learning algorithm for this dataset.
The mushroom dataset has very strong and clear predictive features, particularly:
odor
spore-print-color
gill-size
These variables make classification between edible and poisonous mushrooms relatively easy for tree-based models.
if all of them correctly used the seed they should obtain very similar if not identical results, and arrive at Random Forest or Boosting as the best-performing models.They should obtain very similar if not identical results, and arrive at Random Forest or Boosting as the best-performing models.
# Load libraries
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
# Load the Auto dataset
data(Auto)
# Define a function to compute the difference in median mpg between 4 and 8 cylinders
median_diff <- function(data, indices) {
sample_data <- data[indices, ]
median_4 <- median(sample_data$mpg[sample_data$cylinders == 4])
median_8 <- median(sample_data$mpg[sample_data$cylinders == 8])
return(median_4 - median_8)
}
# Run the bootstrap with 1000 replications
set.seed(1234)
boot_result <- boot(data = Auto, statistic = median_diff, R = 1000)
# Display the standard error of the difference in median mpg
cat("Bootstrap Estimate of Std. Error:", sd(boot_result$t), "\n")
## Bootstrap Estimate of Std. Error: 0.797002
# Print full bootstrap result (includes bias, SE, etc.)
print(boot_result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = median_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
Using the boot()
function with 1000 replications, we
estimated the standard error of the difference in median MPG between
4-cylinder and 8-cylinder cars in the Auto
dataset.
The original difference in medians is: 14.4 mpg
The bootstrap standard error is: 0.797 mpg
The bootstrap bias estimate is: -0.307 mpg
The estimated standard error (0.797) quantifies the variability in the median MPG difference that might arise due to random sampling. This approach is necessary because standard formulas do not exist for differences in medians. The relatively small standard error suggests that the median MPG difference is stable and statistically reliable.
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
data("cpus")
str(cpus)
## 'data.frame': 209 obs. of 9 variables:
## $ name : Factor w/ 209 levels "ADVISOR 32/60",..: 1 3 2 4 5 6 8 9 10 7 ...
## $ syct : int 125 29 29 29 29 26 23 23 23 23 ...
## $ mmin : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ mmax : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ cach : int 256 32 32 32 32 64 64 64 64 128 ...
## $ chmin : int 16 8 8 8 8 8 16 16 16 32 ...
## $ chmax : int 128 32 32 32 16 32 32 32 32 64 ...
## $ perf : int 198 269 220 172 132 318 367 489 636 1144 ...
## $ estperf: int 199 253 253 253 132 290 381 381 749 1238 ...
cpus <- na.omit(cpus)
set.seed(1234)
folds <- createFolds(cpus$perf, k = 10, list = TRUE, returnTrain = FALSE)
cv_mse <- function(data, folds) {
mse_values <- numeric(length(folds))
for (i in seq_along(folds)) {
test_indices <- folds[[i]]
test_data <- data[test_indices, ]
train_data <- data[-test_indices, ]
model <- lm(perf ~ chmin + chmax, data = train_data)
predictions <- predict(model, newdata = test_data)
mse_values[i] <- mean((test_data$perf - predictions)^2)
}
return(mean(mse_values))
}
test_mse <- cv_mse(cpus, folds)
cat("Estimated Test MSE from 10-fold CV:", round(test_mse, 4), "\n")
## Estimated Test MSE from 10-fold CV: 16409.04
Using the cpus
dataset from the MASS
package, we performed a 10-fold cross-validation to evaluate a linear
regression model predicting CPU performance (perf
) based on
two predictors:
chmin
(minimum main memory)
chmax
(maximum main memory)
The createFolds()
function from the caret
package was used to divide the data into 10 folds. A separate model was
trained on each combination of 9 folds, and tested on the remaining
fold. The Mean Squared Error (MSE) was computed for each fold, and then
averaged.
Estimated Test MSE from 10-fold CV: 16409.04
This value represents the average squared error the model would be
expected to make when predicting new, unseen CPU performance data using
chmin
and chmax
as predictors.