5.1 The Validation Set Approach

set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)

lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

lm_fit <- lm_spec %>% 
  fit(mpg ~ horsepower, data = Auto_train)

augment(lm_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        5.06
augment(lm_fit, new_data = Auto_train) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.74
poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = 2)

poly_wf <- workflow() %>%
  add_recipe(poly_rec) %>%
  add_model(lm_spec)

poly_fit <- fit(poly_wf, data = Auto_train)

augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.37
set.seed(2)
Auto_split <- initial_split(Auto)

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)

poly_fit <- fit(poly_wf, data = Auto_train)

augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.35

5.2 Leave-One-Out Cross-Validation

library(tidymodels)
library(ISLR)

Auto <- tibble::as_tibble(Auto)

# LOOCV: v-fold with v = n (number of rows)
loocv_folds <- vfold_cv(Auto, v = nrow(Auto))

# Model spec
lm_spec <- linear_reg() %>%
  set_engine("lm")

# Recipe (you can skip this if not using preprocessing)
lm_rec <- recipe(mpg ~ horsepower, data = Auto)

# Workflow
lm_wf <- workflow() %>%
  add_recipe(lm_rec) %>%
  add_model(lm_spec)

# Fit LOOCV
loocv_res <- fit_resamples(
  lm_wf,
  resamples = loocv_folds,
  metrics = metric_set(rmse),
  control = control_resamples(save_pred = TRUE)
)

collect_metrics(loocv_res)
## # A tibble: 1 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard    3.85   392   0.155 Preprocessor1_Model1

5.3 k-Fold Cross-Validation

poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = tune())

poly_tuned_wf <- workflow() %>%
  add_recipe(poly_tuned_rec) %>%
  add_model(lm_spec)

Auto_folds <- vfold_cv(Auto_train, v = 10)

degree_grid <- tibble(degree = seq(1, 10))

tune_res <- tune_grid(
  object = poly_tuned_wf, 
  resamples = Auto_folds, 
  grid = degree_grid
)

autoplot(tune_res)

collect_metrics(tune_res)
## # A tibble: 20 × 7
##    degree .metric .estimator  mean     n std_err .config              
##     <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1      1 rmse    standard   4.81     10  0.167  Preprocessor01_Model1
##  2      1 rsq     standard   0.628    10  0.0205 Preprocessor01_Model1
##  3      2 rmse    standard   4.37     10  0.150  Preprocessor02_Model1
##  4      2 rsq     standard   0.684    10  0.0203 Preprocessor02_Model1
##  5      3 rmse    standard   4.40     10  0.143  Preprocessor03_Model1
##  6      3 rsq     standard   0.681    10  0.0200 Preprocessor03_Model1
##  7      4 rmse    standard   4.41     10  0.147  Preprocessor04_Model1
##  8      4 rsq     standard   0.680    10  0.0205 Preprocessor04_Model1
##  9      5 rmse    standard   4.37     10  0.180  Preprocessor05_Model1
## 10      5 rsq     standard   0.686    10  0.0225 Preprocessor05_Model1
## 11      6 rmse    standard   4.33     10  0.206  Preprocessor06_Model1
## 12      6 rsq     standard   0.690    10  0.0237 Preprocessor06_Model1
## 13      7 rmse    standard   4.30     10  0.220  Preprocessor07_Model1
## 14      7 rsq     standard   0.696    10  0.0247 Preprocessor07_Model1
## 15      8 rmse    standard   4.30     10  0.219  Preprocessor08_Model1
## 16      8 rsq     standard   0.695    10  0.0246 Preprocessor08_Model1
## 17      9 rmse    standard   4.32     10  0.214  Preprocessor09_Model1
## 18      9 rsq     standard   0.692    10  0.0244 Preprocessor09_Model1
## 19     10 rmse    standard   4.33     10  0.214  Preprocessor10_Model1
## 20     10 rsq     standard   0.691    10  0.0240 Preprocessor10_Model1
show_best(tune_res, metric = "rmse")
## # A tibble: 5 × 7
##   degree .metric .estimator  mean     n std_err .config              
##    <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1      7 rmse    standard    4.30    10   0.220 Preprocessor07_Model1
## 2      8 rmse    standard    4.30    10   0.219 Preprocessor08_Model1
## 3      9 rmse    standard    4.32    10   0.214 Preprocessor09_Model1
## 4     10 rmse    standard    4.33    10   0.214 Preprocessor10_Model1
## 5      6 rmse    standard    4.33    10   0.206 Preprocessor06_Model1
select_by_one_std_err(tune_res, degree, metric = "rmse")
## # A tibble: 1 × 2
##   degree .config              
##    <int> <chr>                
## 1      2 Preprocessor02_Model1
best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")

final_wf <- finalize_workflow(poly_wf, best_degree)

final_fit <- fit(final_wf, Auto_train)

final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##       (Intercept)  horsepower_poly_1  horsepower_poly_2  
##             23.34            -104.85              34.39

5.4 The Bootstrap

Portfolio_boots <- bootstraps(Portfolio, times = 1000)

alpha.fn <- function(split) {
  data <- analysis(split)
  X <- data$X
  Y <- data$Y
  (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}

alpha_res <- Portfolio_boots %>%
  mutate(alpha = map_dbl(splits, alpha.fn))

alpha_res
## # Bootstrap sampling 
## # A tibble: 1,000 × 3
##    splits           id            alpha
##    <list>           <chr>         <dbl>
##  1 <split [100/37]> Bootstrap0001 0.582
##  2 <split [100/40]> Bootstrap0002 0.663
##  3 <split [100/41]> Bootstrap0003 0.618
##  4 <split [100/39]> Bootstrap0004 0.430
##  5 <split [100/36]> Bootstrap0005 0.603
##  6 <split [100/36]> Bootstrap0006 0.679
##  7 <split [100/35]> Bootstrap0007 0.484
##  8 <split [100/37]> Bootstrap0008 0.648
##  9 <split [100/39]> Bootstrap0009 0.610
## 10 <split [100/36]> Bootstrap0010 0.470
## # ℹ 990 more rows
Auto_boots <- bootstraps(Auto)

boot.fn <- function(split) {
  lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
  tidy(lm_fit)
}

boot_res <- Auto_boots %>%
  mutate(models = map(splits, boot.fn))

boot_res %>%
  unnest(cols = c(models)) %>%
  group_by(term) %>%
  summarise(mean = mean(estimate),
            sd = sd(estimate))
## # A tibble: 2 × 3
##   term          mean      sd
##   <chr>        <dbl>   <dbl>
## 1 (Intercept) 40.0   0.673  
## 2 horsepower  -0.159 0.00616