Install

install.packages("ISLR")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("tidymodels")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ISLR)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.7     ✔ recipes      1.3.0
## ✔ dials        1.4.0     ✔ rsample      1.3.0
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.8     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(ggplot2)

5.1 The Validation Set Approach

set.seed(1)
train_index <- sample(1:nrow(Auto), nrow(Auto) / 2)

# Model 1: mpg ~ horsepower
lm_fit1 <- lm(mpg ~ horsepower, data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit1, Auto[-train_index, ]))^2)
## [1] 23.26601
# Model 2: mpg ~ horsepower + horsepower^2
lm_fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit2, Auto[-train_index, ]))^2)
## [1] 18.71646
# Model 3: mpg ~ horsepower + horsepower^2 + horsepower^3
lm_fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train_index)
mean((Auto$mpg[-train_index] - predict(lm_fit3, Auto[-train_index, ]))^2)
## [1] 18.79401

5.2 Leave-One-Out Cross-Validation (LOOCV)

library(boot)
set.seed(17)
glm_fit <- glm(mpg ~ horsepower, data = Auto)
cv.glm(Auto, glm_fit)$delta
## [1] 24.23151 24.23114
# Degree 1 to 5 polynomial fits with LOOCV
cv_error <- rep(0, 5)
for (d in 1:5) {
  glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
  cv_error[d] <- cv.glm(Auto, glm_fit)$delta[1]
}
cv_error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321

5.3 k-Fold Cross-Validation (k = 10)

set.seed(1)
cv_error_10fold <- rep(0, 10)
for (d in 1:10) {
  glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
  cv_error_10fold[d] <- cv.glm(Auto, glm_fit, K = 10)$delta[1]
}
cv_error_10fold
##  [1] 24.21538 19.28327 19.12998 19.29201 19.09471 18.87874 18.78127 18.80484
##  [9] 18.92676 18.99439

5.4 Bias-Variance Trade-Off for k-Fold CV

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(2)
n <- 100
x <- rnorm(n)
y <- x - 2 * x^2 + rnorm(n)
data <- tibble(x = x, y = y)

# Generate a test set
x_test <- seq(-2, 2, length.out = 100)
y_true <- x_test - 2 * x_test^2

# Function to train model and return predictions
get_pred <- function(seed, degree) {
  set.seed(seed)
  x <- rnorm(n)
  y <- x - 2 * x^2 + rnorm(n)
  df <- tibble(x = x, y = y)
  model <- lm(y ~ poly(x, degree), data = df)
  predict(model, newdata = tibble(x = x_test))
}

preds <- map_dfr(1:100, function(seed) {
  tibble(
    pred1 = get_pred(seed, 1),
    pred2 = get_pred(seed, 2),
    pred10 = get_pred(seed, 10)
  )
})

# Reshape and compute bias-variance
preds_long <- preds %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "model", values_to = "pred") %>%
  mutate(x = rep(x_test, times = 3 * 100),
         y_true = rep(y_true, times = 3 * 100),
         model = factor(model, levels = c("pred1", "pred2", "pred10"),
                        labels = c("Degree 1", "Degree 2", "Degree 10")))

bias_var <- preds_long %>%
  group_by(model, x) %>%
  summarise(
    bias2 = (mean(pred) - y_true[1])^2,
    variance = var(pred),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(bias2, variance), names_to = "type", values_to = "value")

# Plot bias and variance
ggplot(bias_var, aes(x = x, y = value, color = type)) +
  geom_line() +
  facet_wrap(~ model, ncol = 1) +
  labs(title = "Bias-Variance Trade-Off",
       y = "Value", color = "") +
  theme_minimal()