library(tidymodels) # Includes the workflows package
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom 0.7.6 ✓ recipes 0.1.14
## ✓ dials 0.0.9 ✓ rsample 0.0.9
## ✓ dplyr 1.0.6 ✓ tibble 3.1.2
## ✓ ggplot2 3.3.3 ✓ tidyr 1.1.3
## ✓ infer 0.5.3 ✓ tune 0.1.1
## ✓ modeldata 0.1.0 ✓ workflows 0.2.2
## ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
## ✓ purrr 0.3.4
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ✓ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x stringr::fixed() masks recipes::fixed()
## x dplyr::lag() masks stats::lag()
## x readr::spec() masks yardstick::spec()
library(ISLR)
# Note: mlc_churn ships with the modeldata package (attached by tidymodels),
# not with ISLR
data("mlc_churn")

Create a test-train rsplit object of mlc_churn using initial_split(). Use the arguments to set the proportion of the training data to 80%. Stratify the sampling according to the churn variable.
# see the first six observations from the data
head(mlc_churn)
## # A tibble: 6 x 20
## state account_length area_code international_plan voice_mail_plan
## <fct> <int> <fct> <fct> <fct>
## 1 KS 128 area_code_415 no yes
## 2 OH 107 area_code_415 no yes
## 3 NJ 137 area_code_415 no no
## 4 OH 84 area_code_408 yes no
## 5 OK 75 area_code_415 yes no
## 6 AL 118 area_code_510 yes no
## # … with 15 more variables: number_vmail_messages <int>,
## # total_day_minutes <dbl>, total_day_calls <int>, total_day_charge <dbl>,
## # total_eve_minutes <dbl>, total_eve_calls <int>, total_eve_charge <dbl>,
## # total_night_minutes <dbl>, total_night_calls <int>,
## # total_night_charge <dbl>, total_intl_minutes <dbl>, total_intl_calls <int>,
## # total_intl_charge <dbl>, number_customer_service_calls <int>, churn <fct>
set.seed(1234)
mlc_split <- initial_split(mlc_churn, prop = 0.8, strata = churn)
mlc_split
## <Analysis/Assess/Total>
## <4001/999/5000>
# extract the training set from the split
mlc_train <- training(mlc_split)

The workflows package allows the user to bind modeling and preprocessing objects together.
lda_spec <- discrim::discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

# create a workflow object
lda_wf <- workflow() %>%
  add_model(lda_spec) %>%
  add_formula(churn ~ total_intl_charge + account_length)
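The workflow above binds the model to a bare formula; a workflow can instead carry a recipe, so preprocessing steps travel with the model. A minimal sketch of that idea (churn_rec and lda_wf_rec are illustrative names, and step_normalize() is just an example step, none of which are used later in this post):

# Hypothetical alternative: bind a recipe instead of a formula
churn_rec <- recipe(churn ~ total_intl_charge + account_length, data = mlc_train) %>%
  step_normalize(all_numeric()) # normalize the two numeric predictors

lda_wf_rec <- workflow() %>%
  add_model(lda_spec) %>%
  add_recipe(churn_rec)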
Next we create ten cross-validation folds of the training data, again stratified by churn.

mlc_folds <- vfold_cv(mlc_train, v = 10, strata = churn) # repeats = 1 by default

mlc_folds$splits
## [[1]]
## <Analysis/Assess/Total>
## <3600/401/4001>
##
## [[2]]
## <Analysis/Assess/Total>
## <3600/401/4001>
##
## [[3]]
## <Analysis/Assess/Total>
## <3600/401/4001>
##
## [[4]]
## <Analysis/Assess/Total>
## <3600/401/4001>
##
## [[5]]
## <Analysis/Assess/Total>
## <3600/401/4001>
##
## [[6]]
## <Analysis/Assess/Total>
## <3601/400/4001>
##
## [[7]]
## <Analysis/Assess/Total>
## <3602/399/4001>
##
## [[8]]
## <Analysis/Assess/Total>
## <3602/399/4001>
##
## [[9]]
## <Analysis/Assess/Total>
## <3602/399/4001>
##
## [[10]]
## <Analysis/Assess/Total>
## <3602/399/4001>
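Each rsplit stores which rows form the analysis (modeling) set and which form the assessment (holdout) set. A quick sketch of pulling the two out of the first fold (first_fold is an illustrative name):

# Inspect the data behind the first fold
first_fold <- mlc_folds$splits[[1]]
nrow(analysis(first_fold))   # ~3600 rows used to fit the model
nrow(assessment(first_fold)) # ~401 rows held out for evaluation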
fit_folds <- fit_resamples(lda_wf, resamples = mlc_folds,
                           control = control_resamples(save_pred = TRUE))

Below we calculate the accuracy metric for each fold.
fit_folds %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "accuracy") %>%
  select(-.estimator)
## # A tibble: 10 x 3
## id .metric .estimate
## <chr> <chr> <dbl>
## 1 Fold01 accuracy 0.858
## 2 Fold02 accuracy 0.858
## 3 Fold03 accuracy 0.858
## 4 Fold04 accuracy 0.858
## 5 Fold05 accuracy 0.858
## 6 Fold06 accuracy 0.858
## 7 Fold07 accuracy 0.860
## 8 Fold08 accuracy 0.860
## 9 Fold09 accuracy 0.860
## 10 Fold10 accuracy 0.860
The confusion matrix, pooled across all ten assessment sets, is depicted below.
fit_folds %>%
  collect_predictions() %>%
  # filter(id == "Fold01") %>% # uncomment to see a single fold
  conf_mat(truth = churn, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 566 3435
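The matrix shows the model never predicts the positive class: every customer is classified as "no", so the ~86% accuracy simply mirrors the class imbalance. A short sketch making that explicit with yardstick's sens() on the pooled predictions:

# Sensitivity for the "yes" (churn) class is 0 here:
# the model classifies every customer as "no"
fit_folds %>%
  collect_predictions() %>%
  sens(truth = churn, estimate = .pred_class)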
Fitting the workflow to the full training set and predicting on the test set gives about 86% accuracy, in line with the resampling estimate.
final_fit <- fit(lda_wf, data = mlc_train)
augment(final_fit, new_data = testing(mlc_split)) %>%
  accuracy(truth = churn, estimate = .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.859
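Because augment() also attaches class-probability columns (.pred_yes and .pred_no for this outcome), a test-set ROC AUC can be computed in the same pipeline; a minimal sketch:

# Test-set ROC AUC from the class probabilities added by augment()
augment(final_fit, new_data = testing(mlc_split)) %>%
  roc_auc(truth = churn, .pred_yes)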
Repeat the above steps, but create 10 bootstraps instead. What do the different results mean?
Let’s take a look at what’s going on.
mlc_boots <- bootstraps(mlc_train, strata = churn, times = 10)

mlc_boots$splits
## [[1]]
## <Analysis/Assess/Total>
## <4001/1479/4001>
##
## [[2]]
## <Analysis/Assess/Total>
## <4001/1486/4001>
##
## [[3]]
## <Analysis/Assess/Total>
## <4001/1471/4001>
##
## [[4]]
## <Analysis/Assess/Total>
## <4001/1468/4001>
##
## [[5]]
## <Analysis/Assess/Total>
## <4001/1449/4001>
##
## [[6]]
## <Analysis/Assess/Total>
## <4001/1474/4001>
##
## [[7]]
## <Analysis/Assess/Total>
## <4001/1477/4001>
##
## [[8]]
## <Analysis/Assess/Total>
## <4001/1457/4001>
##
## [[9]]
## <Analysis/Assess/Total>
## <4001/1497/4001>
##
## [[10]]
## <Analysis/Assess/Total>
## <4001/1501/4001>
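In a bootstrap resample the analysis set is drawn with replacement and has the same number of rows as the original data, while the assessment set is the out-of-bag data, roughly 1 - exp(-1) ≈ 36.8% of rows on average. A quick sketch checking this against the splits above (oob_sizes is an illustrative name):

# Average out-of-bag fraction across the 10 bootstrap resamples
oob_sizes <- purrr::map_int(mlc_boots$splits, ~ nrow(assessment(.x)))
mean(oob_sizes) / nrow(mlc_train) # about 0.37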
We use fit_resamples() from the tune package to fit the workflow we created within each bootstrap.

fit_boots <- fit_resamples(lda_wf, resamples = mlc_boots,
                           control = control_resamples(save_pred = TRUE))
fit_boots$.metrics
## [[1]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.853
## 2 roc_auc binary 0.527
##
## [[2]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.857
## 2 roc_auc binary 0.518
##
## [[3]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.855
## 2 roc_auc binary 0.566
##
## [[4]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.861
## 2 roc_auc binary 0.543
##
## [[5]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.861
## 2 roc_auc binary 0.536
##
## [[6]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.855
## 2 roc_auc binary 0.525
##
## [[7]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.863
## 2 roc_auc binary 0.541
##
## [[8]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.869
## 2 roc_auc binary 0.532
##
## [[9]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.864
## 2 roc_auc binary 0.507
##
## [[10]]
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.864
## 2 roc_auc binary 0.514
Setting summarize = FALSE in collect_metrics() allows us to see the individual performance metrics for each resample.
fit_boots %>%
  collect_metrics(summarize = FALSE)
## # A tibble: 20 x 4
## id .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Bootstrap01 accuracy binary 0.853
## 2 Bootstrap01 roc_auc binary 0.527
## 3 Bootstrap02 accuracy binary 0.857
## 4 Bootstrap02 roc_auc binary 0.518
## 5 Bootstrap03 accuracy binary 0.855
## 6 Bootstrap03 roc_auc binary 0.566
## 7 Bootstrap04 accuracy binary 0.861
## 8 Bootstrap04 roc_auc binary 0.543
## 9 Bootstrap05 accuracy binary 0.861
## 10 Bootstrap05 roc_auc binary 0.536
## 11 Bootstrap06 accuracy binary 0.855
## 12 Bootstrap06 roc_auc binary 0.525
## 13 Bootstrap07 accuracy binary 0.863
## 14 Bootstrap07 roc_auc binary 0.541
## 15 Bootstrap08 accuracy binary 0.869
## 16 Bootstrap08 roc_auc binary 0.532
## 17 Bootstrap09 accuracy binary 0.864
## 18 Bootstrap09 roc_auc binary 0.507
## 19 Bootstrap10 accuracy binary 0.864
## 20 Bootstrap10 roc_auc binary 0.514
Here is the confusion matrix pooled across the 10 bootstraps.
fit_boots %>%
  collect_predictions() %>%
  conf_mat(truth = churn, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 2062 12697
The mean accuracy from 10-fold cross-validation is comparable to that from the 10 bootstraps. Cross-validation provides estimates of the test error, while the bootstrap provides the standard error of those estimates. Because a bootstrap sample is drawn with replacement, it may contain numerous copies of some original cases and miss others entirely; cross-validation resamples without replacement, so each analysis set is smaller than the original data. For this data set the choice has no obvious effect, as the summaries below (and the sketch after them) show.
fit_folds %>%
  collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.859 10 0.000305
## 2 roc_auc binary 0.541 10 0.0168
fit_boots %>%
  collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.860 10 0.00159
## 2 roc_auc binary 0.531 10 0.00538
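Finally, a sketch visualizing the spread of per-resample accuracy under the two schemes:

# Compare the per-resample accuracy distributions (sketch)
bind_rows(
  collect_metrics(fit_folds, summarize = FALSE) %>% mutate(scheme = "10-fold CV"),
  collect_metrics(fit_boots, summarize = FALSE) %>% mutate(scheme = "bootstrap")
) %>%
  filter(.metric == "accuracy") %>%
  ggplot(aes(scheme, .estimate)) +
  geom_boxplot()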