5.1 The Validation Set Approach
set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)
Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
lm_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
lm_fit <- lm_spec %>%
fit(mpg ~ horsepower, data = Auto_train)
augment(lm_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.06
augment(lm_fit, new_data = Auto_train) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.74
poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
step_poly(horsepower, degree = 2)
poly_wf <- workflow() %>%
add_recipe(poly_rec) %>%
add_model(lm_spec)
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.37
set.seed(2)
Auto_split <- initial_split(Auto)
Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.35
5.2 Leave-One-Out Cross-Validation
library(tidymodels)
library(ISLR)
Auto <- tibble::as_tibble(Auto)
# LOOCV: v-fold with v = n (number of rows)
loocv_folds <- vfold_cv(Auto, v = nrow(Auto))
# Model spec
lm_spec <- linear_reg() %>%
set_engine("lm")
# Recipe (you can skip this if not using preprocessing)
lm_rec <- recipe(mpg ~ horsepower, data = Auto)
# Workflow
lm_wf <- workflow() %>%
add_recipe(lm_rec) %>%
add_model(lm_spec)
# Fit LOOCV
loocv_res <- fit_resamples(
lm_wf,
resamples = loocv_folds,
metrics = metric_set(rmse),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(loocv_res)
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 3.85 392 0.155 Preprocessor1_Model1
5.3 k-Fold Cross-Validation
poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
step_poly(horsepower, degree = tune())
poly_tuned_wf <- workflow() %>%
add_recipe(poly_tuned_rec) %>%
add_model(lm_spec)
Auto_folds <- vfold_cv(Auto_train, v = 10)
degree_grid <- tibble(degree = seq(1, 10))
tune_res <- tune_grid(
object = poly_tuned_wf,
resamples = Auto_folds,
grid = degree_grid
)
autoplot(tune_res)

collect_metrics(tune_res)
## # A tibble: 20 × 7
## degree .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 rmse standard 4.81 10 0.167 Preprocessor01_Model1
## 2 1 rsq standard 0.628 10 0.0205 Preprocessor01_Model1
## 3 2 rmse standard 4.37 10 0.150 Preprocessor02_Model1
## 4 2 rsq standard 0.684 10 0.0203 Preprocessor02_Model1
## 5 3 rmse standard 4.40 10 0.143 Preprocessor03_Model1
## 6 3 rsq standard 0.681 10 0.0200 Preprocessor03_Model1
## 7 4 rmse standard 4.41 10 0.147 Preprocessor04_Model1
## 8 4 rsq standard 0.680 10 0.0205 Preprocessor04_Model1
## 9 5 rmse standard 4.37 10 0.180 Preprocessor05_Model1
## 10 5 rsq standard 0.686 10 0.0225 Preprocessor05_Model1
## 11 6 rmse standard 4.33 10 0.206 Preprocessor06_Model1
## 12 6 rsq standard 0.690 10 0.0237 Preprocessor06_Model1
## 13 7 rmse standard 4.30 10 0.220 Preprocessor07_Model1
## 14 7 rsq standard 0.696 10 0.0247 Preprocessor07_Model1
## 15 8 rmse standard 4.30 10 0.219 Preprocessor08_Model1
## 16 8 rsq standard 0.695 10 0.0246 Preprocessor08_Model1
## 17 9 rmse standard 4.32 10 0.214 Preprocessor09_Model1
## 18 9 rsq standard 0.692 10 0.0244 Preprocessor09_Model1
## 19 10 rmse standard 4.33 10 0.214 Preprocessor10_Model1
## 20 10 rsq standard 0.691 10 0.0240 Preprocessor10_Model1
show_best(tune_res, metric = "rmse")
## # A tibble: 5 × 7
## degree .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 7 rmse standard 4.30 10 0.220 Preprocessor07_Model1
## 2 8 rmse standard 4.30 10 0.219 Preprocessor08_Model1
## 3 9 rmse standard 4.32 10 0.214 Preprocessor09_Model1
## 4 10 rmse standard 4.33 10 0.214 Preprocessor10_Model1
## 5 6 rmse standard 4.33 10 0.206 Preprocessor06_Model1
select_by_one_std_err(tune_res, degree, metric = "rmse")
## # A tibble: 1 × 2
## degree .config
## <int> <chr>
## 1 2 Preprocessor02_Model1
best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")
final_wf <- finalize_workflow(poly_wf, best_degree)
final_fit <- fit(final_wf, Auto_train)
final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_poly()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) horsepower_poly_1 horsepower_poly_2
## 23.34 -104.85 34.39
5.4 The Bootstrap
Portfolio_boots <- bootstraps(Portfolio, times = 1000)
alpha.fn <- function(split) {
data <- analysis(split)
X <- data$X
Y <- data$Y
(var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
alpha_res <- Portfolio_boots %>%
mutate(alpha = map_dbl(splits, alpha.fn))
alpha_res
## # Bootstrap sampling
## # A tibble: 1,000 × 3
## splits id alpha
## <list> <chr> <dbl>
## 1 <split [100/37]> Bootstrap0001 0.582
## 2 <split [100/40]> Bootstrap0002 0.663
## 3 <split [100/41]> Bootstrap0003 0.618
## 4 <split [100/39]> Bootstrap0004 0.430
## 5 <split [100/36]> Bootstrap0005 0.603
## 6 <split [100/36]> Bootstrap0006 0.679
## 7 <split [100/35]> Bootstrap0007 0.484
## 8 <split [100/37]> Bootstrap0008 0.648
## 9 <split [100/39]> Bootstrap0009 0.610
## 10 <split [100/36]> Bootstrap0010 0.470
## # ℹ 990 more rows
Auto_boots <- bootstraps(Auto)
boot.fn <- function(split) {
lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
tidy(lm_fit)
}
boot_res <- Auto_boots %>%
mutate(models = map(splits, boot.fn))
boot_res %>%
unnest(cols = c(models)) %>%
group_by(term) %>%
summarise(mean = mean(estimate),
sd = sd(estimate))
## # A tibble: 2 × 3
## term mean sd
## <chr> <dbl> <dbl>
## 1 (Intercept) 40.0 0.673
## 2 horsepower -0.159 0.00616