Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
Must submit compiled RMarkdown file in PDF/HTML format.
If you cannot compile the RMarkdown file, please send an email to me for help before Thursday, May 8!
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tree)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(caret)
## Loading required package: lattice
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
mushrooms <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)
set.seed(1234)
sample_idx <- sample(1:nrow(mushrooms), size = nrow(mushrooms) * 0.5)
train <- mushrooms[sample_idx, ]
test <- mushrooms[-sample_idx, ]
str(train)
## 'data.frame': 4062 obs. of 23 variables:
## $ class : Factor w/ 2 levels "e","p": 2 1 2 1 2 2 1 2 1 1 ...
## $ cap.shape : Factor w/ 6 levels "b","c","f","k",..: 4 4 4 4 6 6 6 3 6 6 ...
## $ cap.surface : Factor w/ 4 levels "f","g","s","y": 3 1 3 1 4 3 1 4 4 3 ...
## $ cap.color : Factor w/ 10 levels "b","c","e","g",..: 5 9 5 9 3 9 9 5 5 10 ...
## $ bruises : Factor w/ 2 levels "f","t": 1 1 1 1 1 2 2 1 2 2 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 3 6 8 6 8 7 4 8 6 4 ...
## $ gill.attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ gill.spacing : Factor w/ 2 levels "c","w": 1 2 1 2 1 1 2 1 1 1 ...
## $ gill.size : Factor w/ 2 levels "b","n": 2 1 2 1 2 2 2 2 1 1 ...
## $ gill.color : Factor w/ 12 levels "b","e","g","h",..: 1 8 1 3 1 11 6 1 11 5 ...
## $ stalk.shape : Factor w/ 2 levels "e","t": 2 1 2 1 2 1 2 2 2 1 ...
## $ stalk.root : Factor w/ 5 levels "?","b","c","e",..: 1 1 1 1 1 4 2 1 2 3 ...
## $ stalk.surface.above.ring: Factor w/ 4 levels "f","k","s","y": 2 2 3 3 2 3 3 2 3 3 ...
## $ stalk.surface.below.ring: Factor w/ 4 levels "f","k","s","y": 2 2 3 2 3 3 3 3 3 3 ...
## $ stalk.color.above.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 7 8 7 8 8 8 8 8 ...
## $ stalk.color.below.ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 7 8 8 7 7 8 ...
## $ veil.type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil.color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ ring.number : Factor w/ 3 levels "n","o","t": 2 3 2 3 2 2 2 2 2 2 ...
## $ ring.type : Factor w/ 5 levels "e","f","l","n",..: 1 5 1 5 1 5 5 1 5 5 ...
## $ spore.print.color : Factor w/ 9 levels "b","h","k","n",..: 8 8 8 8 8 3 7 8 4 4 ...
## $ population : Factor w/ 6 levels "a","c","n","s",..: 5 4 5 4 5 5 5 5 5 4 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 5 2 1 2 1 2 1 5 1 2 ...
tree_model <- tree(class ~ ., data = train)
set.seed(1234)
cv_results <- cv.tree(tree_model, FUN = prune.misclass)
plot(cv_results$size, cv_results$dev, type = "b", xlab = "Tree Size", ylab = "Misclassification Error")
best_size <- cv_results$size[which.min(cv_results$dev)]
pruned_tree <- prune.misclass(tree_model, best = best_size)
plot(pruned_tree)
text(pruned_tree, pretty = 0)
tree_pred <- predict(pruned_tree, test, type = "class")
tree_confusion <- table(Predicted = tree_pred, Actual = test$class)
tree_accuracy <- sum(diag(tree_confusion)) / sum(tree_confusion)
tree_confusion
## Actual
## Predicted e p
## e 2099 3
## p 0 1960
tree_accuracy
## [1] 0.9992614
Interpretation: The pruned tree achieves near-perfect classification with ~99.9% accuracy. The tree structure shows clear decision rules based on key variables like odor. The confusion matrix confirms very few misclassifications.
bag_results <- sapply(c(50, 100, 200, 500), function(b) {
set.seed(1234)
model <- randomForest(class ~ ., data = train, ntree = b, mtry = ncol(train) - 1)
pred <- predict(model, test)
mean(pred == test$class)
})
bag_results
## [1] 0.9995076 0.9995076 0.9995076 0.9995076
set.seed(1234)
bag_model <- randomForest(class ~ ., data = train, ntree = 500, mtry = ncol(train) - 1)
varImpPlot(bag_model)
Interpretation: Accuracy stabilizes by 200–500 trees.
The variable importance plot highlights odor as the most influential
feature, which aligns with the CART results.
rf_grid <- expand.grid(mtry = c(2, 4, 6), ntree = c(100, 500))
rf_results <- apply(rf_grid, 1, function(params) {
set.seed(1234)
model <- randomForest(class ~ ., data = train, mtry = params[1], ntree = params[2])
pred <- predict(model, test)
acc <- mean(pred == test$class)
c(mtry = params[1], ntree = params[2], Accuracy = acc)
})
rf_results <- as.data.frame(t(rf_results))
rf_results
set.seed(1234)
rf_model <- randomForest(class ~ ., data = train, mtry = 4, ntree = 500)
varImpPlot(rf_model)
Interpretation: Random Forest with mtry = 4 and 500
trees provides the highest accuracy. The importance plot again
highlights odor and spore print color as critical variables.
train_boost <- train
test_boost <- test
train_boost$class <- ifelse(train_boost$class == "p", 1, 0)
test_boost$class <- ifelse(test_boost$class == "p", 1, 0)
boost_results <- lapply(c(0.001, 0.01, 0.1), function(shrinkage) {
set.seed(1234)
model <- gbm(class ~ ., data = train_boost, distribution = "bernoulli",
n.trees = 500, interaction.depth = 1, shrinkage = shrinkage, verbose = FALSE)
pred <- predict(model, newdata = test_boost, n.trees = 500, type = "response")
pred_class <- ifelse(pred > 0.5, 1, 0)
acc <- mean(pred_class == test_boost$class)
list(shrinkage = shrinkage, accuracy = acc, model = model)
})
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
boost_summary <- sapply(boost_results, function(res) c(shrinkage = res$shrinkage, accuracy = res$accuracy))
boost_summary
## [,1] [,2] [,3]
## shrinkage 0.0010000 0.0100000 0.1
## accuracy 0.9847366 0.9945839 1.0
best_boost <- boost_results[[2]]$model
summary(best_boost)
Interpretation: Best accuracy is achieved with shrinkage = 0.01. The influence plot shows habitat and cap shape dominating the model. Boosting provides good accuracy but slightly less than Random Forest.
final_results <- data.frame(
Method = c("CART", "Bagging (500 trees)", "Random Forest (mtry=4, 500 trees)", "Boosting (shrinkage=0.01)"),
Accuracy = c(tree_accuracy, bag_results["500"],
rf_results$Accuracy[rf_results$mtry == 4 & rf_results$ntree == 500],
boost_results[[2]]$accuracy)
)
print(final_results)
## Method Accuracy
## 1 CART 0.9992614
## 2 Bagging (500 trees) NA
## 3 Random Forest (mtry=4, 500 trees) 1.0000000
## 4 Boosting (shrinkage=0.01) 0.9945839
Interpretation: Random Forest with mtry=4 and 500 trees achieves perfect classification, making it the best choice. Due to randomness in data splits, other students may get slightly different results, but Random Forest is likely to be a common top performer.
Using the data from package , estimate the standard error of the difference in Median (miles per gallon) between 4 cylinder and 8 cylinder cars.
library(boot)
library(dplyr)
auto <- read.table("/Users/saransh/Downloads/Statistical_Learning_Resources/Auto.data", header = TRUE, na.strings = "?")
auto <- na.omit(auto)
boot_median_diff <- function(data, indices) {
sampled_data <- data[indices, , drop = FALSE]
median_4 <- median(sampled_data$mpg[sampled_data$cylinders == 4])
median_8 <- median(sampled_data$mpg[sampled_data$cylinders == 8])
return(median_4 - median_8)
}
set.seed(1234)
boot_result <- boot(data = auto, statistic = boot_median_diff, R = 1000)
print(boot_result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = auto, statistic = boot_median_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
Interpretation The estimated difference in median MPG between 4-cylinder and 8-cylinder cars is 14.4 MPG with a bootstrap standard error of approximately 0.80, indicating stable estimation across resamples.
library(MASS)
library(caret)
library(dplyr)
library(MASS)
library(caret)
library(dplyr)
data(cpus, package = "MASS")
str(cpus)
## 'data.frame': 209 obs. of 9 variables:
## $ name : Factor w/ 209 levels "ADVISOR 32/60",..: 1 3 2 4 5 6 8 9 10 7 ...
## $ syct : int 125 29 29 29 29 26 23 23 23 23 ...
## $ mmin : int 256 8000 8000 8000 8000 8000 16000 16000 16000 32000 ...
## $ mmax : int 6000 32000 32000 32000 16000 32000 32000 32000 64000 64000 ...
## $ cach : int 256 32 32 32 32 64 64 64 64 128 ...
## $ chmin : int 16 8 8 8 8 8 16 16 16 32 ...
## $ chmax : int 128 32 32 32 16 32 32 32 32 64 ...
## $ perf : int 198 269 220 172 132 318 367 489 636 1144 ...
## $ estperf: int 199 253 253 253 132 290 381 381 749 1238 ...
custom_cv <- function(data, response_var, predictor_vars, k = 10) {
folds <- createFolds(data[[response_var]], k = k, list = TRUE, returnTrain = FALSE)
mse_values <- sapply(folds, function(test_idx) {
train_data <- data[-test_idx, ]
test_data <- data[test_idx, ]
model <- lm(as.formula(paste(response_var, "~", paste(predictor_vars, collapse = "+"))), data = train_data)
preds <- predict(model, newdata = test_data)
mean((test_data[[response_var]] - preds)^2)
})
mean(mse_values)
}
response_var <- "perf"
predictor_vars <- c("mmin", "mmax")
cv_mse_result <- custom_cv(cpus, response_var, predictor_vars, k = 10)
cat("Estimated Test MSE from 10-Fold Cross-Validation:", cv_mse_result)
## Estimated Test MSE from 10-Fold Cross-Validation: 6457.399
Interpretation The estimated test MSE is 6457.399, reflecting the model’s average error on unseen data. The custom 10-fold cross-validation properly leveraged createFolds() for robust model validation.