Resampling Methods

library(tidymodels)
library(ISLR2)

Auto <- tibble(Auto)
Portfolio <- tibble(Portfolio)

5.1 The Validation Set Approach

set.seed(1)
Auto_split <- initial_split(Auto, strata = mpg, prop = 0.5)
Auto_split
## <Training/Testing/Total>
## <194/198/392>

The training and testing data sets can be materialized using the training() and testing() functions, respectively.

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)
Auto_train
## # A tibble: 194 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <int>        <dbl>      <int>  <int>        <dbl> <int>  <int>
##  1    15         8          350        165   3693         11.5    70      1
##  2    16         8          304        150   3433         12      70      1
##  3    14         8          440        215   4312          8.5    70      1
##  4    14         8          455        225   4425         10      70      1
##  5    10         8          307        200   4376         15      70      1
##  6    17         6          250        100   3329         15.5    71      1
##  7    14         8          400        175   4464         11.5    71      1
##  8    14         8          351        153   4154         13.5    71      1
##  9    14         8          318        150   4096         13      71      1
## 10    13         8          400        170   4746         12      71      1
## # ℹ 184 more rows
## # ℹ 1 more variable: name <fct>
Auto_test
## # A tibble: 198 × 9
##      mpg cylinders displacement horsepower weight acceleration  year origin
##    <dbl>     <int>        <dbl>      <int>  <int>        <dbl> <int>  <int>
##  1    18         8          318        150   3436         11      70      1
##  2    17         8          302        140   3449         10.5    70      1
##  3    15         8          429        198   4341         10      70      1
##  4    14         8          454        220   4354          9      70      1
##  5    15         8          390        190   3850          8.5    70      1
##  6    15         8          383        170   3563         10      70      1
##  7    14         8          340        160   3609          8      70      1
##  8    15         8          400        150   3761          9.5    70      1
##  9    14         8          455        225   3086         10      70      1
## 10    22         6          198         95   2833         15.5    70      1
## # ℹ 188 more rows
## # ℹ 1 more variable: name <fct>

Our modeling goal is to predict mpg from horsepower, first with a simple linear regression model and then with a polynomial regression model. We start by setting up a linear regression specification and fitting it to the training data.

lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")
lm_fit <- lm_spec %>% 
  fit(mpg ~ horsepower, data = Auto_train)

We can now use the augment() function to extract the predictions and rmse() to calculate the root mean squared error. This will be the testing RMSE since we are evaluating on Auto_test.

augment(lm_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        5.06

Using this framework also makes it easy to calculate the training RMSE:

augment(lm_fit, new_data = Auto_train) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.74

Next, we will fit a polynomial regression model. We can reuse the linear model specification lm_spec and add a preprocessor created with recipe() and step_poly() to produce the polynomial expansion of horsepower. We then combine the two with workflow() to create a workflow object.

poly_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = 2)

poly_wf <- workflow() %>%
  add_recipe(poly_rec) %>%
  add_model(lm_spec)

poly_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
poly_fit <- fit(poly_wf, data = Auto_train)
augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.37

Lastly, we show below how changing the seed results in a slightly different estimate.

set.seed(2)
Auto_split <- initial_split(Auto)

Auto_train <- training(Auto_split)
Auto_test <- testing(Auto_split)

poly_fit <- fit(poly_wf, data = Auto_train)

augment(poly_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.35
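
This seed-to-seed variability can be made more visible by repeating the split for several seeds and collecting the resulting test RMSEs. Below is a minimal sketch of that experiment; it is an addition to the lab, not part of the original code:

# Test RMSE of the degree-2 workflow across five different seeds.
map_dbl(1:5, function(seed) {
  set.seed(seed)
  split <- initial_split(Auto)
  wf_fit <- fit(poly_wf, data = training(split))
  augment(wf_fit, new_data = testing(split)) %>%
    rmse(truth = mpg, estimate = .pred) %>%
    pull(.estimate)
})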

5.2 Leave-One-Out Cross-Validation

Leave-One-Out Cross-Validation is not integrated into the broader tidymodels framework: the tuning functions in tune do not support this type of resample.
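
If you do need LOOCV, it can be computed by hand: rsample still provides loo_cv() to generate the splits, and we can loop over them ourselves. A minimal sketch, assuming the current Auto_train:

Auto_loo <- loo_cv(Auto_train)

# Each assessment set holds exactly one observation, so pooling the squared
# errors across all splits gives the LOOCV estimate of the test MSE.
loo_errors <- map_dbl(Auto_loo$splits, function(split) {
  single_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
  assessment(split)$mpg - predict(single_fit, new_data = assessment(split))$.pred
})
sqrt(mean(loo_errors^2))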

5.3 k-Fold Cross-Validation

To tune the polynomial degree rather than fixing it in advance, we reuse the recipe from before but mark the degree as a tuning parameter with tune(). This placeholder is filled in later, during tuning.

poly_tuned_rec <- recipe(mpg ~ horsepower, data = Auto_train) %>%
  step_poly(horsepower, degree = tune())

poly_tuned_wf <- workflow() %>%
  add_recipe(poly_tuned_rec) %>%
  add_model(lm_spec)

The next thing we need to create is the k-fold data set, which can be done using the vfold_cv() function. Note that the function uses v instead of the k used in ISLR. We set v = 10, a common choice for k.

Auto_folds <- vfold_cv(Auto_train, v = 10)
Auto_folds
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [264/30]> Fold01
##  2 <split [264/30]> Fold02
##  3 <split [264/30]> Fold03
##  4 <split [264/30]> Fold04
##  5 <split [265/29]> Fold05
##  6 <split [265/29]> Fold06
##  7 <split [265/29]> Fold07
##  8 <split [265/29]> Fold08
##  9 <split [265/29]> Fold09
## 10 <split [265/29]> Fold10

We also need a grid of candidate values for the degree. We could generate one with grid_regular(degree(range = c(1, 10)), levels = 10), but dials treats degree() as a double-valued parameter; since we want the integers 1 through 10, it is just as easy to build the grid directly:

degree_grid <- tibble(degree = seq(1, 10))

Now tune_grid() fits the workflow within each fold, once for each candidate degree.

tune_res <- tune_grid(
  object = poly_tuned_wf, 
  resamples = Auto_folds, 
  grid = degree_grid
)

autoplot() gives a visual overview of the performance for the different hyperparameter values.

autoplot(tune_res)

The numbers used for plotting can be extracted directly with collect_metrics(). We also get an estimate of the standard error of each performance metric, since we have 10 different estimates, one for each fold.

collect_metrics(tune_res)
## # A tibble: 20 × 7
##    degree .metric .estimator  mean     n std_err .config              
##     <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1      1 rmse    standard   4.81     10  0.172  Preprocessor01_Model1
##  2      1 rsq     standard   0.621    10  0.0316 Preprocessor01_Model1
##  3      2 rmse    standard   4.37     10  0.209  Preprocessor02_Model1
##  4      2 rsq     standard   0.677    10  0.0436 Preprocessor02_Model1
##  5      3 rmse    standard   4.40     10  0.217  Preprocessor03_Model1
##  6      3 rsq     standard   0.675    10  0.0446 Preprocessor03_Model1
##  7      4 rmse    standard   4.43     10  0.218  Preprocessor04_Model1
##  8      4 rsq     standard   0.670    10  0.0453 Preprocessor04_Model1
##  9      5 rmse    standard   4.42     10  0.203  Preprocessor05_Model1
## 10      5 rsq     standard   0.674    10  0.0436 Preprocessor05_Model1
## 11      6 rmse    standard   4.41     10  0.189  Preprocessor06_Model1
## 12      6 rsq     standard   0.670    10  0.0423 Preprocessor06_Model1
## 13      7 rmse    standard   4.40     10  0.176  Preprocessor07_Model1
## 14      7 rsq     standard   0.670    10  0.0420 Preprocessor07_Model1
## 15      8 rmse    standard   4.41     10  0.175  Preprocessor08_Model1
## 16      8 rsq     standard   0.670    10  0.0420 Preprocessor08_Model1
## 17      9 rmse    standard   4.47     10  0.207  Preprocessor09_Model1
## 18      9 rsq     standard   0.663    10  0.0445 Preprocessor09_Model1
## 19     10 rmse    standard   4.50     10  0.227  Preprocessor10_Model1
## 20     10 rsq     standard   0.658    10  0.0465 Preprocessor10_Model1

You can also use show_best() to display only the best-performing models.

show_best(tune_res, metric = "rmse")
## # A tibble: 5 × 7
##   degree .metric .estimator  mean     n std_err .config              
##    <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1      2 rmse    standard    4.37    10   0.209 Preprocessor02_Model1
## 2      3 rmse    standard    4.40    10   0.217 Preprocessor03_Model1
## 3      7 rmse    standard    4.40    10   0.176 Preprocessor07_Model1
## 4      6 rmse    standard    4.41    10   0.189 Preprocessor06_Model1
## 5      8 rmse    standard    4.41    10   0.175 Preprocessor08_Model1

Another option is to select the simplest model whose performance is within one standard error of the numerically best one, using select_by_one_std_err(). We have to tell it which direction counts as simpler: lower-degree polynomial models are simpler, so we sort by degree in ascending order rather than wrapping it in desc().

select_by_one_std_err(tune_res, degree, metric = "rmse")
## # A tibble: 1 × 9
##   degree .metric .estimator  mean     n std_err .config             .best .bound
##    <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl>  <dbl>
## 1      2 rmse    standard    4.37    10   0.209 Preprocessor02_Mod…  4.37   4.58

This selected degree = 2. We will use this value, since simpler models can sometimes be very beneficial, especially if we want to explain what happens inside them.

best_degree <- select_by_one_std_err(tune_res, degree, metric = "rmse")

This selected value can now be used to fill in the previously unspecified degree argument in poly_tuned_wf using finalize_workflow().

final_wf <- finalize_workflow(poly_tuned_wf, best_degree)

final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

This workflow can now be fitted, and we want to make sure to fit it on the full training data set.

final_fit <- fit(final_wf, Auto_train)

final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_poly()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##       (Intercept)  horsepower_poly_1  horsepower_poly_2  
##             23.34            -104.85              34.39
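
With the finalized workflow fitted on the training data, a natural last step is to assess it on the held-out test set, just as we did in section 5.1. A short sketch of that check, which is an addition to the lab text:

augment(final_fit, new_data = Auto_test) %>%
  rmse(truth = mpg, estimate = .pred)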

5.4 The Bootstrap

First, we want to assess the accuracy of a statistic of interest: the value alpha that minimizes the variance of the two-asset portfolio alpha * X + (1 - alpha) * Y, as derived in ISLR. We want to calculate this statistic within many different bootstrap samples, so we start by drawing 1000 bootstraps of the Portfolio data set.

Portfolio_boots <- bootstraps(Portfolio, times = 1000)
Portfolio_boots
## # Bootstrap sampling 
## # A tibble: 1,000 × 2
##    splits           id           
##    <list>           <chr>        
##  1 <split [100/36]> Bootstrap0001
##  2 <split [100/39]> Bootstrap0002
##  3 <split [100/39]> Bootstrap0003
##  4 <split [100/33]> Bootstrap0004
##  5 <split [100/39]> Bootstrap0005
##  6 <split [100/34]> Bootstrap0006
##  7 <split [100/40]> Bootstrap0007
##  8 <split [100/38]> Bootstrap0008
##  9 <split [100/36]> Bootstrap0009
## 10 <split [100/41]> Bootstrap0010
## # ℹ 990 more rows

Next, we create a function that takes a boot_split object and returns the calculated metric.

alpha.fn <- function(split) {
  # analysis() extracts the bootstrap sample held in the split object
  data <- analysis(split)
  X <- data$X
  Y <- data$Y
  
  # the minimum-variance alpha from ISLR section 5.2
  (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
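
As a quick usage check (not shown in the original output), the function can be applied to a single bootstrap sample:

alpha.fn(Portfolio_boots$splits[[1]])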

Now we can use mutate() and map_dbl(), from dplyr and purrr respectively, to apply alpha.fn to each of the bootstraps.

alpha_res <- Portfolio_boots %>%
  mutate(alpha = map_dbl(splits, alpha.fn))

alpha_res
## # Bootstrap sampling 
## # A tibble: 1,000 × 3
##    splits           id            alpha
##    <list>           <chr>         <dbl>
##  1 <split [100/36]> Bootstrap0001 0.516
##  2 <split [100/39]> Bootstrap0002 0.687
##  3 <split [100/39]> Bootstrap0003 0.599
##  4 <split [100/33]> Bootstrap0004 0.556
##  5 <split [100/39]> Bootstrap0005 0.549
##  6 <split [100/34]> Bootstrap0006 0.619
##  7 <split [100/40]> Bootstrap0007 0.387
##  8 <split [100/38]> Bootstrap0008 0.675
##  9 <split [100/36]> Bootstrap0009 0.538
## 10 <split [100/41]> Bootstrap0010 0.407
## # ℹ 990 more rows
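
The bootstrap estimate of the standard error of alpha is simply the standard deviation of these 1,000 values. A short summary sketch, added here as a follow-up:

alpha_res %>%
  summarise(mean = mean(alpha), se = sd(alpha))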

In the next example, we want to study the variability of the slope and intercept estimates of the linear regression model. It follows the same structure: first, we create some bootstraps of the data; then, we create a function that takes a split and returns some values. This time the function returns a tibble for each bootstrap.

Auto_boots <- bootstraps(Auto)

boot.fn <- function(split) {
  # fit the linear regression on the bootstrap sample and return the
  # coefficient estimates as a tidy tibble
  lm_fit <- lm_spec %>% fit(mpg ~ horsepower, data = analysis(split))
  tidy(lm_fit)
}

Then we use mutate() and map() to apply the function to each of the bootstraps.

boot_res <- Auto_boots %>%
  mutate(models = map(splits, boot.fn))

We can now unnest() the results and use group_by() and summarise() to get an estimate of the variability of the slope and intercept in this linear regression model.

boot_res %>%
  unnest(cols = c(models)) %>%
  group_by(term) %>%
  summarise(mean = mean(estimate),
            sd = sd(estimate))
## # A tibble: 2 × 3
##   term          mean      sd
##   <chr>        <dbl>   <dbl>
## 1 (Intercept) 39.8   0.759  
## 2 horsepower  -0.156 0.00593
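
For comparison, the formula-based standard errors from a single least-squares fit on the full data set can be extracted with tidy() from broom; this check is an addition to the lab text:

tidy(lm(mpg ~ horsepower, data = Auto))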