Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
Must submit compiled RMarkdown file in PDF/HTML format.
If you cannot compile the RMarkdown file, please send an email to me for help before Thursday, May 8!
# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Load data
mushrooms <- read.csv("mushrooms.csv")
# Set seed and split 50/50
set.seed(1234)
train_index <- sample(1:nrow(mushrooms), nrow(mushrooms)/2)
train <- mushrooms[train_index, ]
test <- mushrooms[-train_index, ]
library(rpart)
library(rpart.plot)
# Train full tree
tree_model <- rpart(class ~ ., data = train, method = "class", cp = 0.001)
# Use cross-validation to find optimal cp
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train, method = "class", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] odor spore.print.color stalk.color.below.ring
## [4] stalk.root
##
## Root node error: 1953/4062 = 0.4808
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.9703021 0 1.0000000 1.0000000 0.0163049
## 2 0.0163850 1 0.0296979 0.0296979 0.0038716
## 3 0.0061444 2 0.0133129 0.0133129 0.0026025
## 4 0.0023041 3 0.0071685 0.0071685 0.0019125
## 5 0.0010000 5 0.0025602 0.0025602 0.0011442
best_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]
# Prune tree
pruned_tree <- prune(tree_model, cp = best_cp)
# Visualize
rpart.plot(pruned_tree, type = 3, extra = 104)
# Predict on test data
tree_pred <- predict(pruned_tree, newdata = test, type = "class")
tree_accuracy <- mean(tree_pred == test$class)
tree_accuracy
## [1] 0.9992614
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Ensure the target is a factor
train$class <- as.factor(train$class)
test$class <- as.factor(test$class)
# Bagging model (mtry = all predictors)
bag_model <- randomForest(class ~ ., data = train, mtry = ncol(train) - 1, ntree = 100, importance = TRUE)
# Predict and evaluate
bag_pred <- predict(bag_model, newdata = test)
bag_accuracy <- mean(bag_pred == test$class)
cat("Bagging Accuracy (100 trees):", bag_accuracy, "\\n")
## Bagging Accuracy (100 trees): 1 \n
# Try different numbers of trees
for (b in c(50, 100, 200, 500)) {
model <- randomForest(class ~ ., data = train, mtry = ncol(train) - 1, ntree = b)
acc <- mean(predict(model, test) == test$class)
cat("Bagging - Trees:", b, "- Accuracy:", acc, "\\n")
}
## Bagging - Trees: 50 - Accuracy: 1 \nBagging - Trees: 100 - Accuracy: 1 \nBagging - Trees: 200 - Accuracy: 1 \nBagging - Trees: 500 - Accuracy: 1 \n
# Variable importance plot
varImpPlot(bag_model)
# Try different mtry values
results <- data.frame()
for (m in c(2, 4, 6, 8)) {
rf_model <- randomForest(class ~ ., data = train, mtry = m, ntree = 200)
acc <- mean(predict(rf_model, test) == test$class)
results <- rbind(results, data.frame(mtry = m, accuracy = acc))
}
print(results)
## mtry accuracy
## 1 2 1
## 2 4 1
## 3 6 1
## 4 8 1
# Final model
final_rf <- randomForest(class ~ ., data = train, mtry = 4, ntree = 200, importance = TRUE)
varImpPlot(final_rf)
# Load libraries
library(tidyverse)
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Load and fix data
mushrooms <- read.csv("mushrooms.csv")
# Convert character columns to factors BEFORE splitting
mushrooms[] <- lapply(mushrooms, function(x) if(is.character(x)) as.factor(x) else x)
# Split 50/50 with seed
set.seed(1234)
train_index <- sample(1:nrow(mushrooms), nrow(mushrooms)/2)
train <- mushrooms[train_index, ]
test <- mushrooms[-train_index, ]
# Convert the response 'class' to numeric binary (poisonous = 1, edible = 0)
train$class <- ifelse(train$class == "p", 1, 0)
test$class <- ifelse(test$class == "p", 1, 0)
# Try different shrinkage (learning rates)
for (lr in c(0.001, 0.01, 0.1)) {
boost_model <- gbm(class ~ .,
data = train,
distribution = "bernoulli",
n.trees = 500,
interaction.depth = 1,
shrinkage = lr,
verbose = FALSE)
# Predict
pred_prob <- predict(boost_model, newdata = test, n.trees = 500, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
# Accuracy
cat("Learning rate:", lr, "- Accuracy:", mean(pred_class == test$class), "\n")
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Learning rate: 0.001 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Learning rate: 0.01 - Accuracy: 0.9945839
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Learning rate: 0.1 - Accuracy: 1
# Fit final boosting model
final_boost <- gbm(class ~ .,
data = train,
distribution = "bernoulli",
n.trees = 500,
interaction.depth = 1,
shrinkage = 0.1,
verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Variable importance plot
summary(final_boost)
Using the data from package , estimate the standard error of the difference in Median (miles per gallon) between 4 cylinder and 8 cylinder cars.
# Load libraries
library(ISLR)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
# Load the data
data(Auto)
# Define a bootstrap function
boot_median_diff <- function(data, indices) {
d <- data[indices, ] # resample data
median_4cyl <- median(d$mpg[d$cylinders == 4])
median_8cyl <- median(d$mpg[d$cylinders == 8])
return(median_4cyl - median_8cyl)
}
# Apply bootstrapping
set.seed(1234)
boot_results <- boot(data = Auto, statistic = boot_median_diff, R = 1000)
# View the bootstrap standard error
print(boot_results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot_median_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
cat("Bootstrap estimated standard error:", sd(boot_results$t), "\n")
## Bootstrap estimated standard error: 0.797002
# Load libraries
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
# Load the data
data(cpus)
# Define the custom 10-fold cross-validation function
my_cv_function <- function(data, response, predictors, k = 10) {
# Create folds
set.seed(1234) # for reproducibility
folds <- createFolds(data[[response]], k = k, list = TRUE, returnTrain = FALSE)
# Initialize vector to store test errors
test_errors <- numeric(length = k)
# Loop over each fold
for (i in 1:k) {
test_idx <- folds[[i]]
train_data <- data[-test_idx, ]
test_data <- data[test_idx, ]
# Fit linear model on training data
formula <- as.formula(
paste(response, "~", paste(predictors, collapse = " + "))
)
model <- lm(formula, data = train_data)
# Predict on test data
predictions <- predict(model, newdata = test_data)
# Calculate mean squared error (MSE) on test data
mse <- mean((predictions - test_data[[response]])^2)
# Store the error
test_errors[i] <- mse
}
# Return average test error
return(mean(test_errors))
}
# Use the function
# Predicting perf using mmin and mmax
cv_error <- my_cv_function(cpus, response = "perf", predictors = c("mmin", "mmax"), k = 10)
# Print result
cat("Estimated test error (10-fold CV):", cv_error, "\n")
## Estimated test error (10-fold CV): 6315.865