Do not work in groups!
Do not discuss the exam with other students, and do not share any ideas or thoughts regarding the exam. Doing so is academic misconduct and will not be tolerated. The solution you present must be your own work!
You must submit a compiled RMarkdown file in PDF or HTML format.
If you cannot compile the RMarkdown file, please email me for help before Thursday, May 8!
#Load data
mush <- read.csv("C:/Users/nsepe/Downloads/mushrooms.csv", stringsAsFactors=TRUE)
mush$class <- factor(mush$class, levels=c("e","p"),
                     labels=c("edible","poisonous"))
set.seed(1234)
#Create a random 50/50 split
n <- nrow(mush)
train_idx <- sample(seq_len(n), size = n/2)
train <- mush[ train_idx, ]
test <- mush[-train_idx, ]
#Quick sanity check
table(train$class)
##
##    edible poisonous 
##      2109      1953
table(test$class)
##
##    edible poisonous 
##      2099      1963
library(tree)
#Fit a full tree
full.tree <- tree(class ~ ., data = train)
#Cross‐validate to find optimal size
cv.t <- cv.tree(full.tree, FUN = prune.misclass)
plot(cv.t$size, cv.t$dev, type="b",
     xlab="Tree size (terminal nodes)", ylab="CV deviance")
#Choose the size with lowest CV error
best.size <- cv.t$size[which.min(cv.t$dev)]
#Prune
pruned.tree <- prune.misclass(full.tree, best = best.size)
plot(pruned.tree); text(pruned.tree, pretty=0)
#Test‐set performance
pred.b <- predict(pruned.tree, test, type="class")
table(pred.b, test$class)
##
## pred.b      edible poisonous
##   edible      2099         3
##   poisonous      0      1960
mean(pred.b == test$class) # accuracy
## [1] 0.9992614
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
#Number of predictors
p <- ncol(train) - 1
#Bagging
bag.50 <- randomForest(class ~ ., data=train, mtry=p, ntree= 50, importance=TRUE)
bag.200 <- randomForest(class ~ ., data=train, mtry=p, ntree=200, importance=TRUE)
#Plot OOB error vs number of trees
plot(bag.200)
#Test performance
for(mod in list(bag.50, bag.200)) {
  pred <- predict(mod, test)
  cat("ntree =", mod$ntree, "→",
      mean(pred == test$class), "\n")
}
## ntree = 50 → 0.9995076
## ntree = 200 → 0.9995076
#Variable importance
varImpPlot(bag.200)
results <- expand.grid(ntree = c(100, 500),
                       mtry = c(floor(sqrt(p)), floor(p/3), p))
results$accuracy <- NA
for(i in seq_len(nrow(results))) {
  rf <- randomForest(class~., data=train,
                     ntree = results$ntree[i],
                     mtry = results$mtry[i])
  preds <- predict(rf, test)
  results$accuracy[i] <- mean(preds == test$class)
}
print(results)
##   ntree mtry  accuracy
## 1   100    4 1.0000000
## 2   500    4 1.0000000
## 3   100    7 1.0000000
## 4   500    7 1.0000000
## 5   100   22 0.9995076
## 6   500   22 0.9995076
# Fit best model
best <- results[which.max(results$accuracy), ]
rf.final <- randomForest(class~., data=train,
                         ntree=best$ntree, mtry=best$mtry,
                         importance=TRUE)
varImpPlot(rf.final)
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Create Numeric response 0/1
train$y01 <- ifelse(train$class=="poisonous", 1, 0)
test$y01 <- ifelse(test$class =="poisonous", 1, 0)
#Grid of shrinkage and trees
shrinkages <- c(0.001, 0.01, 0.1)
n.trees <- c(100, 500, 1000)
boost.res <- expand.grid(lambda = shrinkages,
                         n.trees = n.trees,
                         accuracy = NA)
for(i in seq_len(nrow(boost.res))) {
  gb <- gbm(y01 ~ . -class,
            data = train,
            distribution = "bernoulli",
            n.trees = boost.res$n.trees[i],
            shrinkage = boost.res$lambda[i],
            interaction.depth = 1,
            verbose = FALSE)
  # Predict probabilities then class
  prob <- predict(gb, test,
                  n.trees = boost.res$n.trees[i],
                  type="response")
  pred <- ifelse(prob > 0.5, 1, 0)
  boost.res$accuracy[i] <- mean(pred == test$y01)
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## (The same warning is emitted for each of the nine boosting fits in the grid.)
print(boost.res)
##   lambda n.trees  accuracy
## 1  0.001     100 0.9847366
## 2  0.010     100 0.9847366
## 3  0.100     100 0.9975382
## 4  0.001     500 0.9847366
## 5  0.010     500 0.9945839
## 6  0.100     500 1.0000000
## 7  0.001    1000 0.9847366
## 8  0.010    1000 0.9975382
## 9  0.100    1000 1.0000000
# Fit & show variable influence
best.b <- boost.res[which.max(boost.res$accuracy), ]
gb.final <- gbm(y01 ~ . -class, data=train,
                distribution="bernoulli",
                n.trees=best.b$n.trees,
                shrinkage=best.b$lambda,
                interaction.depth=1)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
summary(gb.final)
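The repeated gbm warning flags veil.type as a constant column. As a hedged aside (not part of the required solution), one way to avoid the warning is to drop zero-variance predictors before fitting; a constant column can never be split on, so the fit should be essentially unchanged. A minimal sketch, reusing the objects defined above:
# Sketch (assumption): remove zero-variance columns such as veil.type before boosting
keep <- vapply(train, function(x) length(unique(x)) > 1, logical(1))
train_nzv <- train[, keep]
gb.check <- gbm(y01 ~ . -class, data = train_nzv,
                distribution = "bernoulli",
                n.trees = best.b$n.trees,
                shrinkage = best.b$lambda,
                interaction.depth = 1)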
Based on your results in parts b) through e), which statistical learning algorithm seems to yield the best accuracy? Select the best model from all of the models. Is this likely to be the same answer that other students will get on this exam, assuming you correctly used your name in setting the seed in part a) (set.seed(1234))?
# Gather the top accuracies
accs <- data.frame(
  Model = c("Pruned CART",
            "Bagging (200)",
            "RF final",
            "Boosting final"),
  Accuracy = c(
    mean(predict(pruned.tree, test, type="class") == test$class),
    mean(predict(bag.200, test) == test$class),
    mean(predict(rf.final, test) == test$class),
    max(boost.res$accuracy)
  )
)
print(accs)
##            Model  Accuracy
## 1    Pruned CART 0.9992614
## 2  Bagging (200) 0.9995076
## 3       RF final 1.0000000
## 4 Boosting final 1.0000000
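Regarding the seed question above: a different seed (for example, one derived from a student's name) produces a different 50/50 split, so individual test-set accuracies can shift slightly, although the mushroom classes are separable enough that the ranking of methods is unlikely to change. A minimal sketch of the idea; the name-to-seed conversion and the placeholder name "Jane Doe" are assumptions, not the exam's prescribed method:
# Sketch (assumption): derive a seed from a hypothetical name and compare splits
name_seed <- sum(utf8ToInt("Jane Doe"))  # "Jane Doe" is a placeholder name
set.seed(name_seed)
alt_idx <- sample(seq_len(n), size = n/2)
# Fraction of the alternative training rows shared with the original split
mean(alt_idx %in% train_idx)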
Using the Auto data from the ISLR package, estimate the standard error of the difference in median mpg (miles per gallon) between 4-cylinder and 8-cylinder cars.
# install.packages(c("ISLR","boot"))
library(ISLR) # for Auto
library(boot) # for boot()
data(Auto)
# Statistic: median(mpg|cyl=4) – median(mpg|cyl=8)
diff_med <- function(df, idx) {
  d <- df[idx, ]
  m4 <- median(d$mpg[d$cylinders==4])
  m8 <- median(d$mpg[d$cylinders==8])
  m4 - m8
}
set.seed(123)
boot.out <- boot(data = Auto,
                 statistic = diff_med,
                 R = 1000)
# Print bootstrap results
print(boot.out)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = diff_med, R = 1000)
##
##
## Bootstrap Statistics :
##     original   bias    std. error
## t1*     14.4 -0.2439   0.7671612
cat("\nObserved Δ-median =", boot.out$t0,
"\nBootstrap SE =", sd(boot.out$t), "\n\n")
##
## Observed Δ-median = 14.4
## Bootstrap SE = 0.7671612
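As a cross-check on the boot() output above, the same standard error can be approximated with a hand-rolled resampling loop. This is a sketch that reuses the diff_med statistic already defined; the seed and number of replicates are arbitrary choices:
# Manual nonparametric bootstrap of the median-mpg difference (sketch)
set.seed(123)
B <- 1000
boot_stats <- replicate(B, diff_med(Auto, sample(nrow(Auto), replace = TRUE)))
sd(boot_stats)  # should be close to the std. error reported by boot()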
library(MASS)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
#Define the CV function
cv_lm <- function(data, response, predictors, k = 10, seed = 101) {
  set.seed(seed)
  folds <- createFolds(data[[response]], k = k, list = TRUE)
  mses <- numeric(length(folds))
  for (i in seq_along(folds)) {
    test_idx <- folds[[i]]
    train_dat <- data[-test_idx, ]
    test_dat <- data[ test_idx, ]
    fmla <- as.formula(
      paste(response, "~", paste(predictors, collapse = " + "))
    )
    fit <- lm(fmla, data = train_dat)
    pred <- predict(fit, newdata = test_dat)
    mses[i] <- mean((test_dat[[response]] - pred)^2)
  }
  #Return average MSE
  mean(mses)
}
#Run 10-fold CV
data(cpus)
cv_error <- cv_lm(
  data = cpus,
  response = "perf",
  predictors = c("mmin", "mmax"),
  k = 10,
  seed = 123
)
#Print the result
cat("10-fold CV MSE for lm(perf ~ mmin + mmax):", round(cv_error, 2), "\n")
## 10-fold CV MSE for lm(perf ~ mmin + mmax): 6233.69
The 10-fold CV MSE is 6233.69, which corresponds to an RMSE of about 79 on the perf scale. For context, the summary statistics and standard deviation of perf are shown below.
summary(cpus$perf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     6.0    27.0    50.0   105.6   113.0  1150.0
sd(cpus$perf)
## [1] 160.8306
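To put the CV error on the scale of the response, a brief sketch using the cv_error and cpus objects defined above:
# RMSE implied by the 10-fold CV MSE
sqrt(cv_error)
# For context, the variance of perf is roughly the MSE of always predicting the mean
var(cpus$perf)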