install.packages("ISLR")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("tidymodels")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.3.0
## ✔ dials 1.4.0 ✔ rsample 1.3.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.2 ✔ tidyr 1.3.1
## ✔ infer 1.0.8 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(ggplot2)
5.1 The Validation Set Approach
set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)
# Model 1: mpg ~ horsepower
lm_fit1 <- lm(mpg ~ horsepower, data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit1, Auto[-train_index, ]))^2)
## [1] 23.26601
# Model 2: mpg ~ horsepower + horsepower^2
lm_fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit2, Auto[-train_index, ]))^2)
## [1] 18.71646
# Model 3: mpg ~ horsepower + horsepower^2 + horsepower^3
lm_fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit3, Auto[-train_index, ]))^2)
## [1] 18.79401
5.2 Leave-One-Out Cross-Validation (LOOCV)
library(boot)
set.seed(17)
glm_fit <- glm(mpg ~ horsepower, data = Auto)
cv.glm(Auto, glm_fit)$delta
## [1] 24.23151 24.23114
# Degree 1 to 5 polynomial fits with LOOCV
cv_error <- rep(0, 5)
for (d in 1:5) {
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
cv_error[d] <- cv.glm(Auto, glm_fit)$delta[1]
}
cv_error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
5.3 k-Fold Cross-Validation (k = 10)
set.seed(1)
cv_error_10fold <- rep(0, 10)
for (d in 1:10) {
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
cv_error_10fold[d] <- cv.glm(Auto, glm_fit, K = 10)$delta[1]
}
cv_error_10fold
## [1] 24.21538 19.28327 19.12998 19.29201 19.09471 18.87874 18.78127 18.80484
## [9] 18.92676 18.99439
5.4 Bias-Variance Trade-Off for k-Fold CV
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(2)
n <- 100
x <- rnorm(n)
y <- x - 2 * x^2 + rnorm(n)
data <- tibble(x = x, y = y)
# Generate a test set
x_test <- seq(-2, 2, length.out = 100)
y_true <- x_test - 2 * x_test^2
# Function to train model and return predictions
get_pred <- function(seed, degree) {
set.seed(seed)
x <- rnorm(n)
y <- x - 2 * x^2 + rnorm(n)
df <- tibble(x = x, y = y)
model <- lm(y ~ poly(x, degree), data = df)
predict(model, newdata = tibble(x = x_test))
}
preds <- map_dfr(1:100, function(seed) {
tibble(
pred1 = get_pred(seed, 1),
pred2 = get_pred(seed, 2),
pred10 = get_pred(seed, 10)
)
})
# Reshape and compute bias-variance
preds_long <- preds %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "model", values_to = "pred") %>%
mutate(x = rep(x_test, times = 3 * 100),
y_true = rep(y_true, times = 3 * 100),
model = factor(model, levels = c("pred1", "pred2", "pred10"),
labels = c("Degree 1", "Degree 2", "Degree 10")))
bias_var <- preds_long %>%
group_by(model, x) %>%
summarise(
bias2 = (mean(pred) - y_true[1])^2,
variance = var(pred),
.groups = "drop"
) %>%
pivot_longer(cols = c(bias2, variance), names_to = "type", values_to = "value")
# Plot bias and variance
ggplot(bias_var, aes(x = x, y = value, color = type)) +
geom_line() +
facet_wrap(~ model, ncol = 1) +
labs(title = "Bias-Variance Trade-Off",
y = "Value", color = "") +
theme_minimal()
