Libraries and Data

library(tidymodels) # Includes the workflows package
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom     0.7.6      ✓ recipes   0.1.14
## ✓ dials     0.0.9      ✓ rsample   0.0.9 
## ✓ dplyr     1.0.6      ✓ tibble    3.1.2 
## ✓ ggplot2   3.3.3      ✓ tidyr     1.1.3 
## ✓ infer     0.5.3      ✓ tune      0.1.1 
## ✓ modeldata 0.1.0      ✓ workflows 0.2.2 
## ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
## ✓ purrr     0.3.4
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()
library(ISLR)
data("mlc_churn")

Leave-One-Out Cross-Validation

Create a test-train rsplit object of mlc_churn using initial_split(). Use the arguments to set the proportions of the training data to be 80%. Stratify the sampling according to the churn variable.

# see the first six observations from the data
head(mlc_churn)
## # A tibble: 6 x 20
##   state account_length area_code     international_plan voice_mail_plan
##   <fct>          <int> <fct>         <fct>              <fct>          
## 1 KS               128 area_code_415 no                 yes            
## 2 OH               107 area_code_415 no                 yes            
## 3 NJ               137 area_code_415 no                 no             
## 4 OH                84 area_code_408 yes                no             
## 5 OK                75 area_code_415 yes                no             
## 6 AL               118 area_code_510 yes                no             
## # … with 15 more variables: number_vmail_messages <int>,
## #   total_day_minutes <dbl>, total_day_calls <int>, total_day_charge <dbl>,
## #   total_eve_minutes <dbl>, total_eve_calls <int>, total_eve_charge <dbl>,
## #   total_night_minutes <dbl>, total_night_calls <int>,
## #   total_night_charge <dbl>, total_intl_minutes <dbl>, total_intl_calls <int>,
## #   total_intl_charge <dbl>, number_customer_service_calls <int>, churn <fct>
set.seed(1234)
mlc_split <- initial_split(mlc_churn, prop = 0.8, strata = churn)
mlc_split
## <Analysis/Assess/Total>
## <4001/999/5000>
# splitting the data to the training set
mlc_train <- training(mlc_split)
  1. Create a LDA model specification.

The workflows package allows the user to bind modeling and preprocessing objects together.

lda_spec <- discrim::discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

# create a workflow object
lda_wf <- workflow() %>%
  add_model(lda_spec) %>%
  add_formula(churn ~ total_intl_charge + account_length)
  1. Create a 10-fold cross-validation split object.
mlc_folds <- vfold_cv(mlc_train, v = 10, strata = churn) # repeats = 1
mlc_folds$splits
## [[1]]
## <Analysis/Assess/Total>
## <3600/401/4001>
## 
## [[2]]
## <Analysis/Assess/Total>
## <3600/401/4001>
## 
## [[3]]
## <Analysis/Assess/Total>
## <3600/401/4001>
## 
## [[4]]
## <Analysis/Assess/Total>
## <3600/401/4001>
## 
## [[5]]
## <Analysis/Assess/Total>
## <3600/401/4001>
## 
## [[6]]
## <Analysis/Assess/Total>
## <3601/400/4001>
## 
## [[7]]
## <Analysis/Assess/Total>
## <3602/399/4001>
## 
## [[8]]
## <Analysis/Assess/Total>
## <3602/399/4001>
## 
## [[9]]
## <Analysis/Assess/Total>
## <3602/399/4001>
## 
## [[10]]
## <Analysis/Assess/Total>
## <3602/399/4001>
  1. Fit the model within each of the folds.
fit_folds <- fit_resamples(lda_wf, resamples = mlc_folds, 
                           control = control_resamples(save_pred = TRUE))
  1. Extract the performance metrics for each fold.

Exemplification of calculating the accuracy metric for each fold.

fit_folds %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "accuracy") %>%
  select(-.estimator)
## # A tibble: 10 x 3
##    id     .metric  .estimate
##    <chr>  <chr>        <dbl>
##  1 Fold01 accuracy     0.858
##  2 Fold02 accuracy     0.858
##  3 Fold03 accuracy     0.858
##  4 Fold04 accuracy     0.858
##  5 Fold05 accuracy     0.858
##  6 Fold06 accuracy     0.858
##  7 Fold07 accuracy     0.860
##  8 Fold08 accuracy     0.860
##  9 Fold09 accuracy     0.860
## 10 Fold10 accuracy     0.860

The confusion matrix is depicted below.

fit_folds %>%
  collect_predictions() %>%
  # filter(id == "Fold01") %>%
  conf_mat(truth = churn, estimate = .pred_class)
##           Truth
## Prediction  yes   no
##        yes    0    0
##        no   566 3435

The model has 86 % predicted accuracy on testing set.

final_fit <- fit(lda_wf, data = mlc_train)
augment(final_fit, new_data = testing(mlc_split)) %>%
  accuracy(truth = churn, estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.859

The Bootstraps

Repeat the above steps, but create 10 bootstraps instead. What does the different results mean?

Let’s take a look at what’s going on.

  1. Create a 10 bootstraps split object.
mlc_boots <- bootstraps(mlc_train,strata = churn, time = 10) # repeats = 1
mlc_boots$splits
## [[1]]
## <Analysis/Assess/Total>
## <4001/1479/4001>
## 
## [[2]]
## <Analysis/Assess/Total>
## <4001/1486/4001>
## 
## [[3]]
## <Analysis/Assess/Total>
## <4001/1471/4001>
## 
## [[4]]
## <Analysis/Assess/Total>
## <4001/1468/4001>
## 
## [[5]]
## <Analysis/Assess/Total>
## <4001/1449/4001>
## 
## [[6]]
## <Analysis/Assess/Total>
## <4001/1474/4001>
## 
## [[7]]
## <Analysis/Assess/Total>
## <4001/1477/4001>
## 
## [[8]]
## <Analysis/Assess/Total>
## <4001/1457/4001>
## 
## [[9]]
## <Analysis/Assess/Total>
## <4001/1497/4001>
## 
## [[10]]
## <Analysis/Assess/Total>
## <4001/1501/4001>
  1. Fit the model within each of the folds. We can use fit_resamples() to fit the workflow that we created within each bootstrap in tune package.
fit_boots <- fit_resamples(lda_wf, resamples = mlc_boots, 
                           control = control_resamples(save_pred = TRUE))
fit_boots$.metrics
## [[1]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.853
## 2 roc_auc  binary         0.527
## 
## [[2]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.857
## 2 roc_auc  binary         0.518
## 
## [[3]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.855
## 2 roc_auc  binary         0.566
## 
## [[4]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.861
## 2 roc_auc  binary         0.543
## 
## [[5]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.861
## 2 roc_auc  binary         0.536
## 
## [[6]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.855
## 2 roc_auc  binary         0.525
## 
## [[7]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.863
## 2 roc_auc  binary         0.541
## 
## [[8]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.869
## 2 roc_auc  binary         0.532
## 
## [[9]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.864
## 2 roc_auc  binary         0.507
## 
## [[10]]
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.864
## 2 roc_auc  binary         0.514
  1. Extract the performance metrics for each fold.

For each extraction, we need to write down summarize = FALSE in collect_metrics() in order to allows us the see the individual performance metrics for each fold.

fit_boots %>%
  collect_metrics(summarize = FALSE) 
## # A tibble: 20 x 4
##    id          .metric  .estimator .estimate
##    <chr>       <chr>    <chr>          <dbl>
##  1 Bootstrap01 accuracy binary         0.853
##  2 Bootstrap01 roc_auc  binary         0.527
##  3 Bootstrap02 accuracy binary         0.857
##  4 Bootstrap02 roc_auc  binary         0.518
##  5 Bootstrap03 accuracy binary         0.855
##  6 Bootstrap03 roc_auc  binary         0.566
##  7 Bootstrap04 accuracy binary         0.861
##  8 Bootstrap04 roc_auc  binary         0.543
##  9 Bootstrap05 accuracy binary         0.861
## 10 Bootstrap05 roc_auc  binary         0.536
## 11 Bootstrap06 accuracy binary         0.855
## 12 Bootstrap06 roc_auc  binary         0.525
## 13 Bootstrap07 accuracy binary         0.863
## 14 Bootstrap07 roc_auc  binary         0.541
## 15 Bootstrap08 accuracy binary         0.869
## 16 Bootstrap08 roc_auc  binary         0.532
## 17 Bootstrap09 accuracy binary         0.864
## 18 Bootstrap09 roc_auc  binary         0.507
## 19 Bootstrap10 accuracy binary         0.864
## 20 Bootstrap10 roc_auc  binary         0.514

Here is a confusion metrix of 10 bootstraps.

fit_boots %>%
  collect_predictions() %>%
  conf_mat(truth = churn, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no   2062 12697

Comparison

The mean accuracy of 10-fold cross-validation is comparable to that of 10 bootstraps. Cross-Validation provides estimates of the test error, and Bootstrap provides the standard error of the estimates. A bootstrapped data collection may contain numerous occurrences of the same original cases and may completely miss other original cases due to the drawing with replacement. Cross validation resamples without replacing data, resulting in smaller surrogate data sets than the original. It does not have an obvious result in this data set.

fit_folds %>%
  collect_metrics() 
## # A tibble: 2 x 5
##   .metric  .estimator  mean     n  std_err
##   <chr>    <chr>      <dbl> <int>    <dbl>
## 1 accuracy binary     0.859    10 0.000305
## 2 roc_auc  binary     0.541    10 0.0168
fit_boots %>%
  collect_metrics() 
## # A tibble: 2 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.860    10 0.00159
## 2 roc_auc  binary     0.531    10 0.00538