Do not work in groups!
Do not discuss the exam with other students, or share any ideas or thoughts regarding the exam. This is academic misconduct, and will not be tolerated. Your presented solution must be your own work!
You must submit a compiled RMarkdown file in PDF or HTML format.
If you cannot compile your RMarkdown file, please email me for help before Thursday, May 8!
library(TeachingDemos)
## Warning: package 'TeachingDemos' was built under R version 4.3.3
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mushroom <- read.csv("C:\\Users\\brian\\OneDrive\\Documents\\Stat Learning\\mushrooms.csv", stringsAsFactors = TRUE)
char2seed("Bri")
train_test <- sample(1:nrow(mushroom), nrow(mushroom) / 2)
train_data <- mushroom[train_test, ]
test_data <- mushroom[-train_test, ]
cat("Training set size:", nrow(train_data), "\n")
## Training set size: 4062
cat("Test set size:", nrow(test_data), "\n")
## Test set size: 4062
tree_model <- rpart(class ~ .,
                    data = train_data,
                    method = "class",
                    cp = 0.001)
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class",
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] odor spore.print.color stalk.color.below.ring
## [4] stalk.root
##
## Root node error: 1944/4062 = 0.47858
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.9696502 0 1.0000000 1.0000000 0.0163774
## 2 0.0200617 1 0.0303498 0.0303498 0.0039224
## 3 0.0046296 2 0.0102881 0.0102881 0.0022948
## 4 0.0018004 3 0.0056584 0.0056584 0.0017038
## 5 0.0010000 5 0.0020576 0.0036008 0.0013598
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(tree_model, cp = best_cp)
pred_class <- predict(pruned_tree,
                      newdata = test_data,
                      type = "class")
conf_matrix <- confusionMatrix(pred_class, test_data$class)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2090 4
## p 0 1968
##
## Accuracy : 0.999
## 95% CI : (0.9975, 0.9997)
## No Information Rate : 0.5145
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.998
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.9980
## Pos Pred Value : 0.9981
## Neg Pred Value : 1.0000
## Prevalence : 0.5145
## Detection Rate : 0.5145
## Detection Prevalence : 0.5155
## Balanced Accuracy : 0.9990
##
## 'Positive' Class : e
##
rpart.plot(pruned_tree,
           type = 2,
           extra = 104,
           fallen.leaves = TRUE,
           box.palette = "BuGn",
           main = "Pruned Classification Tree - Mushroom Dataset")
##The root node splits on odor, the single most informative predictor of whether a mushroom is edible or poisonous. The remaining splits use spore print color, stalk color below ring, and stalk root. Each internal node applies a decision rule on one of these variables (for odor, whether the observed level falls in a particular set of categories), and each terminal node reports the predicted class (edible 'e' or poisonous 'p') together with the class probabilities and the percentage of training observations in that leaf.
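##As an optional check (a sketch only, reusing the pruned_tree object fitted above and rpart.plot's rpart.rules() function), the same splits can be printed as plain-text rules rather than read off the plot:
rpart.rules(pruned_tree, cover = TRUE)
##Each row is one terminal node: the conditions leading to it and, with cover = TRUE, the percentage of training observations that fall in that leaf.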
train_data$class <- as.factor(train_data$class)
test_data$class <- as.factor(test_data$class)
tree_counts <- c(10, 50, 100, 200, 500)
results <- data.frame(B = tree_counts, Accuracy = NA)
for (i in seq_along(tree_counts)) {
  char2seed("Bri")
  B <- tree_counts[i]
  model_bag <- randomForest(class ~ .,
                            data = train_data,
                            ntree = B,
                            mtry = ncol(train_data) - 1,
                            importance = TRUE)
  preds <- predict(model_bag, newdata = test_data)
  acc <- confusionMatrix(preds, test_data$class)$overall['Accuracy']
  results$Accuracy[i] <- acc
  cat(sprintf("B = %d → Accuracy: %.4f\n", B, acc))
}
## B = 10 → Accuracy: 1.0000
## B = 50 → Accuracy: 1.0000
## B = 100 → Accuracy: 1.0000
## B = 200 → Accuracy: 1.0000
## B = 500 → Accuracy: 1.0000
best_B <- results$B[which.max(results$Accuracy)]
cat("Best number of trees (B):", best_B, "\n")
## Best number of trees (B): 10
final_bagged_model <- randomForest(class ~ .,
                                   data = train_data,
                                   ntree = best_B,
                                   mtry = ncol(train_data) - 1,
                                   importance = TRUE)
varImpPlot(final_bagged_model, main = paste("Variable Importance - Bagged Trees (B =", best_B, ")"))
##Test accuracy is already 1.0000 with B = 10 trees, so adding more trees provides no further predictive gain and 10 trees is sufficient here. The variable importance plot shows that odor, spore print color, and stalk color below ring contribute most to the classification; the same ranking can be read numerically, as shown below. MeanDecreaseGini measures how much each variable improves node purity across the bagged trees, and odor is clearly the dominant variable in the splits.
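##A small sketch (not part of the graded answer) of reading the ranking off the importance matrix of the final_bagged_model object created above, sorted by MeanDecreaseGini:
imp_bag <- importance(final_bagged_model)
head(imp_bag[order(imp_bag[, "MeanDecreaseGini"], decreasing = TRUE), ], 5)
##The first rows list the predictors that contribute the largest node-purity gains.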
mtry_values <- c(2, 3, 4, 5)
ntree_values <- c(50, 100, 200, 500)
results <- expand.grid(mtry = mtry_values, ntree = ntree_values)
results$Accuracy <- NA
for (i in 1:nrow(results)) {
  m <- results$mtry[i]
  b <- results$ntree[i]
  char2seed("Bri")
  rf_model <- randomForest(class ~ .,
                           data = train_data,
                           mtry = m,
                           ntree = b,
                           importance = TRUE)
  preds <- predict(rf_model, newdata = test_data)
  acc <- confusionMatrix(preds, test_data$class)$overall['Accuracy']
  results$Accuracy[i] <- acc
  cat(sprintf("mtry = %d, ntree = %d → Accuracy: %.4f\n", m, b, acc))
}
## mtry = 2, ntree = 50 → Accuracy: 1.0000
## mtry = 3, ntree = 50 → Accuracy: 1.0000
## mtry = 4, ntree = 50 → Accuracy: 1.0000
## mtry = 5, ntree = 50 → Accuracy: 1.0000
## mtry = 2, ntree = 100 → Accuracy: 1.0000
## mtry = 3, ntree = 100 → Accuracy: 1.0000
## mtry = 4, ntree = 100 → Accuracy: 1.0000
## mtry = 5, ntree = 100 → Accuracy: 1.0000
## mtry = 2, ntree = 200 → Accuracy: 1.0000
## mtry = 3, ntree = 200 → Accuracy: 1.0000
## mtry = 4, ntree = 200 → Accuracy: 1.0000
## mtry = 5, ntree = 200 → Accuracy: 1.0000
## mtry = 2, ntree = 500 → Accuracy: 1.0000
## mtry = 3, ntree = 500 → Accuracy: 1.0000
## mtry = 4, ntree = 500 → Accuracy: 1.0000
## mtry = 5, ntree = 500 → Accuracy: 1.0000
best <- results[which.max(results$Accuracy), ]
cat("Best combination: mtry =", best$mtry, ", ntree =", best$ntree, "\n")
## Best combination: mtry = 2 , ntree = 50
final_rf <- randomForest(class ~ .,
                         data = train_data,
                         mtry = best$mtry,
                         ntree = best$ntree,
                         importance = TRUE)
preds_final <- predict(final_rf, newdata = test_data)
conf_final <- confusionMatrix(preds_final, test_data$class)
print(conf_final)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2090 0
## p 0 1972
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.5145
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5145
## Detection Rate : 0.5145
## Detection Prevalence : 0.5145
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
varImpPlot(final_rf, main = "Variable Importance - Random Forest")
importance(final_rf)[order(importance(final_rf)[,1], decreasing = TRUE), ][1:5, ]
## e p MeanDecreaseAccuracy MeanDecreaseGini
## odor 6.428389 6.612640 6.893163 582.16104
## stalk.root 5.215246 3.884339 5.400034 63.42953
## gill.size 5.054424 4.290423 4.938693 137.69141
## habitat 4.765217 4.159139 4.989728 83.28506
## spore.print.color 4.681299 4.991718 5.601151 243.18425
##Ranked by MeanDecreaseGini, odor is the most important feature, followed by spore.print.color and gill.size (the table above is sorted by the class-specific accuracy decrease for 'e', which is why stalk.root appears second there). A high MeanDecreaseGini indicates that a variable contributes heavily to the purity gains at tree splits. Because every (mtry, ntree) combination already reaches an accuracy of 1.00, adding trees or candidate variables per split does not improve performance, and the cheapest setting selected by the grid search, mtry = 2 with ntree = 50, is used for the final model. The sketch below checks the same conclusion against the out-of-bag error.
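##To back up the claim that extra trees do not help, the out-of-bag error stored in the fitted forest can be inspected directly (a sketch using the final_rf object from above):
plot(final_rf, main = "OOB Error vs. Number of Trees - Random Forest")
tail(final_rf$err.rate, 1)
##err.rate holds the cumulative OOB and per-class error after each tree, so a flat curve confirms that accuracy saturates almost immediately.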
true_labels <- test_data$class
train_gbm <- train_data[, !(names(train_data) %in% "veil.type")]
test_gbm <- test_data[, !(names(test_data) %in% "veil.type")]
train_gbm$class <- ifelse(train_gbm$class == "p", 1, 0)
test_gbm$class <- ifelse(test_gbm$class == "p", 1, 0)
lambda_values <- c(0.001, 0.01, 0.1)
B_values <- c(50, 100, 200, 500)
results <- expand.grid(lambda = lambda_values, B = B_values)
results$Accuracy <- NA
for (i in 1:nrow(results)) {
  lambda <- results$lambda[i]
  B <- results$B[i]
  char2seed("Bri")
  gbm_model <- gbm(class ~ .,
                   data = train_gbm,
                   distribution = "bernoulli",
                   n.trees = B,
                   shrinkage = lambda,
                   interaction.depth = 1,
                   n.cores = 1,
                   verbose = FALSE)
  probs <- predict(gbm_model, newdata = test_gbm, n.trees = B, type = "response")
  preds <- ifelse(probs > 0.5, "p", "e")
  preds <- factor(preds, levels = c("e", "p"))
  true <- factor(true_labels, levels = c("e", "p"))
  acc <- confusionMatrix(preds, true)$overall["Accuracy"]
  results$Accuracy[i] <- acc
  cat(sprintf("λ = %.4f, B = %d → Accuracy: %.4f\n", lambda, B, acc))
}
## λ = 0.0010, B = 50 → Accuracy: 0.9850
## λ = 0.0100, B = 50 → Accuracy: 0.9850
## λ = 0.1000, B = 50 → Accuracy: 0.9931
## λ = 0.0010, B = 100 → Accuracy: 0.9850
## λ = 0.0100, B = 100 → Accuracy: 0.9850
## λ = 0.1000, B = 100 → Accuracy: 0.9951
## λ = 0.0010, B = 200 → Accuracy: 0.9850
## λ = 0.0100, B = 200 → Accuracy: 0.9850
## λ = 0.1000, B = 200 → Accuracy: 0.9990
## λ = 0.0010, B = 500 → Accuracy: 0.9850
## λ = 0.0100, B = 500 → Accuracy: 0.9931
## λ = 0.1000, B = 500 → Accuracy: 1.0000
best <- results[which.max(results$Accuracy), ]
cat("Best combination → λ =", best$lambda, ", B =", best$B, "\n")
## Best combination → λ = 0.1 , B = 500
final_gbm <- gbm(class ~ .,
                 data = train_gbm,
                 distribution = "bernoulli",
                 n.trees = best$B,
                 shrinkage = best$lambda,
                 interaction.depth = 1,
                 n.cores = 1,
                 verbose = FALSE)
final_probs <- predict(final_gbm, newdata = test_gbm, n.trees = best$B, type = "response")
final_preds <- ifelse(final_probs > 0.5, "p", "e")
final_preds <- factor(final_preds, levels = c("e", "p"))
true_final <- factor(true_labels, levels = c("e", "p"))
conf_matrix <- confusionMatrix(final_preds, true_final)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction e p
## e 2090 0
## p 0 1972
##
## Accuracy : 1
## 95% CI : (0.9991, 1)
## No Information Rate : 0.5145
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5145
## Detection Rate : 0.5145
## Detection Prevalence : 0.5145
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : e
##
summary(final_gbm, n.trees = best$B, main = "Variable Importance - Boosting")
##The optimal values are λ = 0.1 and B = 500, which classify the test data perfectly. For a gbm fit, summary() reports the relative influence of each variable: odor is by far the largest contributor, followed by spore print color, and the remaining variables have very little influence (the sketch below shows the same scores as a data frame).
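##The relative influence scores can also be captured as a data frame instead of the bar plot, which makes the drop-off after the top two variables explicit (a sketch reusing the final_gbm fit above):
rel_inf <- summary(final_gbm, n.trees = best$B, plotit = FALSE)
head(rel_inf, 5)
##summary() on a gbm object returns the variables ordered by relative influence; plotit = FALSE suppresses the plot.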
Using the Auto data from the ISLR2 package, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.
data("Auto")
data_4cyl <- filter(Auto, cylinders == 4)
data_8cyl <- filter(Auto, cylinders == 8)
median_4cyl <- median(data_4cyl$mpg, na.rm = TRUE)
median_8cyl <- median(data_8cyl$mpg, na.rm = TRUE)
observed_diff <- median_4cyl - median_8cyl
cat("Observed difference in medians:", observed_diff, "\n")
## Observed difference in medians: 14.4
bootstrap_diff <- function(data, indices) {
  sample_data <- data[indices, ]
  median_4cyl_boot <- median(sample_data$mpg[sample_data$cylinders == 4], na.rm = TRUE)
  median_8cyl_boot <- median(sample_data$mpg[sample_data$cylinders == 8], na.rm = TRUE)
  return(median_4cyl_boot - median_8cyl_boot)
}
combined_data <- Auto %>% filter(cylinders %in% c(4, 8))
char2seed("Bri")
bootstrap_results <- boot(data = combined_data,
                          statistic = bootstrap_diff,
                          R = 1000)
bootstrap_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = combined_data, statistic = bootstrap_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.2831 0.8057317
cat("Standard error of the difference in medians:", bootstrap_results$t0, "\n")
## Standard error of the difference in medians: 14.4
cat("Bootstrap estimated standard error:", sd(bootstrap_results$t), "\n")
## Bootstrap estimated standard error: 0.8057317
##The estimated median mpg difference between 4- and 8-cylinder cars is 14.4 mpg. The bootstrap standard error of this estimate is approximately 0.806 mpg.
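##A percentile bootstrap confidence interval is a natural companion to the standard error; boot.ci() works directly on the boot object computed above (a sketch, not required by the question):
boot.ci(bootstrap_results, type = "perc")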
data("cpus")
cv_lm <- function(data, predictors, response, k = 10) {
  formula <- as.formula(paste(response, "~", paste(predictors, collapse = "+")))
  char2seed("Bri")
  folds <- createFolds(data[[response]], k = k, list = TRUE, returnTrain = FALSE)
  mse_values <- numeric(k)
  for (i in 1:k) {
    test_idx <- folds[[i]]
    train_data <- data[-test_idx, ]
    test_data <- data[test_idx, ]
    model <- lm(formula, data = train_data)
    predictions <- predict(model, newdata = test_data)
    mse <- mean((predictions - test_data[[response]])^2)
    mse_values[i] <- mse
  }
  mean_mse <- mean(mse_values)
  return(list(mean_mse = mean_mse, all_mse = mse_values))
}
result <- cv_lm(data = cpus,
                predictors = c("mmin", "mmax"),
                response = "estperf")
print(paste("Mean Test MSE:", round(result$mean_mse, 2)))
## [1] "Mean Test MSE: 4776.17"
##The ten-fold cross-validated test MSE estimates how well the linear model estperf ~ mmin + mmax generalizes to new observations from the cpus data. A mean test MSE of 4776.17 means that, on average, the squared difference between the predicted and observed estperf values is about 4776; the sketch below converts this to the more interpretable RMSE scale.
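##To express this error on the original scale of estperf, the root of the cross-validated MSE can be reported as well (a short sketch using the result list returned by cv_lm above):
cat("RMSE (square root of mean CV MSE):", round(sqrt(result$mean_mse), 2), "\n")
cat("Per-fold RMSE range:", paste(round(range(sqrt(result$all_mse)), 2), collapse = " - "), "\n")
##The square root of 4776.17 is roughly 69, so a typical prediction misses the observed estperf by about 69 units.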