Evaluating Models

Author

Jamal Rogers

Published

August 17, 2023

From the previous section:

library(tidymodels)
library(modeldatatoo)

taxi <- data_taxi() |>
        drop_na()

# Data Budget
set.seed(18)
taxi_split <- initial_split(taxi, prop = 0.8, strata = tip)
taxi_split
<Training/Testing/Total>
<8000/2000/10000>
taxi_train <- training(taxi_split)
taxi_test <- testing(taxi_split)
tree_spec <- decision_tree(cost_complexity = 0.0001, mode = "classification")
taxi_wflow <- workflow(tip ~ ., tree_spec)
taxi_fit <- fit(taxi_wflow, taxi_train)

Looking at predictions

augment(taxi_fit, new_data = taxi_train) |>
  relocate(tip, .pred_class, .pred_yes, .pred_no)
# A tibble: 8,000 × 10
   tip   .pred_class .pred_yes .pred_no distance company local dow   month  hour
   <fct> <fct>           <dbl>    <dbl>    <dbl> <fct>   <fct> <fct> <fct> <int>
 1 yes   yes             0.973   0.0271    17.2  Chicag… no    Thu   Feb      16
 2 yes   yes             0.909   0.0907     0.88 City S… yes   Thu   Mar       8
 3 yes   yes             0.968   0.0316    20.7  Chicag… no    Mon   Apr       8
 4 yes   yes             0.952   0.0484    12.2  Chicag… no    Sun   Mar      21
 5 yes   yes             0.812   0.188      0.94 Sun Ta… yes   Sat   Apr      23
 6 yes   yes             0.932   0.0680     1.85 Taxica… no    Fri   Apr      12
 7 yes   yes             0.932   0.0680     1.47 City S… no    Tue   Mar      14
 8 yes   yes             0.954   0.0465     0.53 Sun Ta… no    Tue   Mar      18
 9 yes   yes             0.886   0.114      6.65 Taxica… no    Sun   Apr      11
10 yes   yes             0.909   0.0907     1.21 Sun Ta… yes   Thu   Apr      19
# ℹ 7,990 more rows

Confusion Matrix

augment(taxi_fit, new_data = taxi_train) |>
        conf_mat(truth = tip, estimate = .pred_class)
          Truth
Prediction  yes   no
       yes 7314  533
       no    57   96
augment(taxi_fit, new_data = taxi_train) |>
        conf_mat(truth = tip, estimate = .pred_class) |>
        autoplot(type = "heatmap")        

Metrics for model performance

Accuracy

augment(taxi_fit, new_data = taxi_train) |>
        accuracy(truth = tip, estimate = .pred_class)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.926

Note: Be careful with accuracy() on imbalanced data like this; a model can look "good" simply by predicting the majority class every time.
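To see why, look back at the confusion matrix: 7,371 of the 8,000 training rows are truly yes, so a model that always predicts yes would score about 0.92, barely below our 0.926. A quick check of that arithmetic:

# Accuracy of a "model" that always predicts yes, using the class
# counts from the confusion matrix above
(7314 + 57) / 8000
[1] 0.921375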

Sensitivity

augment(taxi_fit, new_data = taxi_train) |>
        sensitivity(truth = tip, estimate = .pred_class)
# A tibble: 1 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 sensitivity binary         0.992

Specificity

augment(taxi_fit, new_data = taxi_train) |>
        specificity(truth = tip, estimate = .pred_class)
# A tibble: 1 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 specificity binary         0.153
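Both of these follow directly from the confusion matrix above; a quick hand calculation to confirm:

# Sensitivity: of the truly "yes" rows, the fraction predicted "yes"
7314 / (7314 + 57)
[1] 0.992267
# Specificity: of the truly "no" rows, the fraction predicted "no"
96 / (96 + 533)
[1] 0.1526232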

Metric set

We can use metric_set() to bundle several metrics into a single function

taxi_metrics <- metric_set(accuracy, specificity, sensitivity)

augment(taxi_fit, new_data = taxi_train) |>
        taxi_metrics(truth = tip, estimate = .pred_class)
# A tibble: 3 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.926
2 specificity binary         0.153
3 sensitivity binary         0.992

ROC curves

To make an ROC (receiver operating characteristic) curve, we:

  • calculate the sensitivity and specificity for all possible thresholds

  • plot false positive rate (x-axis) versus true positive rate (y-axis)

Sensitivity is the true positive rate and specificity is the true negative rate, so 1 - specificity is the false positive rate.
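Each point on the curve is the sensitivity/specificity pair you get from classifying at one particular cutoff. A minimal sketch of one such point, using a cutoff of 0.812 (one of the thresholds that appears in the roc_curve() output below; tie handling at the cutoff may differ slightly from yardstick's internals):

# Classify "yes" whenever .pred_yes is at least 0.812, then compute the
# metrics at that single cutoff; roc_curve() repeats this for every threshold
augment(taxi_fit, new_data = taxi_train) |>
  mutate(
    .pred_at_cutoff = factor(if_else(.pred_yes >= 0.812, "yes", "no"),
                             levels = levels(tip))
  ) |>
  taxi_metrics(truth = tip, estimate = .pred_at_cutoff)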

We can use the area under the ROC curve as a classification metric:

  • ROC AUC = 1 💯

  • ROC AUC = 1/2 😢

augment(taxi_fit, new_data = taxi_train) |>
        roc_curve(truth = tip, .pred_yes) |>
        slice(1, 20, 50)
# A tibble: 3 × 3
  .threshold specificity sensitivity
       <dbl>       <dbl>       <dbl>
1   -Inf           0          1     
2      0.812       0.226      0.976 
3      0.979       0.994      0.0365
augment(taxi_fit, new_data = taxi_train) |>
        roc_auc(truth = tip, .pred_yes)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.724

ROC curve plot

augment(taxi_fit, new_data = taxi_train) |>
        roc_curve(truth = tip, .pred_yes) |>
        autoplot()

How can we use the training data to compare and evaluate different models?

Cross-validation

vfold_cv(taxi_train) # v = 10 is default
#  10-fold cross-validation 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [7200/800]> Fold01
 2 <split [7200/800]> Fold02
 3 <split [7200/800]> Fold03
 4 <split [7200/800]> Fold04
 5 <split [7200/800]> Fold05
 6 <split [7200/800]> Fold06
 7 <split [7200/800]> Fold07
 8 <split [7200/800]> Fold08
 9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10
taxi_folds <- vfold_cv(taxi_train)
taxi_folds$splits[1:3]
[[1]]
<Analysis/Assess/Total>
<7200/800/8000>

[[2]]
<Analysis/Assess/Total>
<7200/800/8000>

[[3]]
<Analysis/Assess/Total>
<7200/800/8000>
vfold_cv(taxi_train, v = 5)
#  5-fold cross-validation 
# A tibble: 5 × 2
  splits              id   
  <list>              <chr>
1 <split [6400/1600]> Fold1
2 <split [6400/1600]> Fold2
3 <split [6400/1600]> Fold3
4 <split [6400/1600]> Fold4
5 <split [6400/1600]> Fold5
vfold_cv(taxi_train, strata = tip)
#  10-fold cross-validation using stratification 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [7200/800]> Fold01
 2 <split [7200/800]> Fold02
 3 <split [7200/800]> Fold03
 4 <split [7200/800]> Fold04
 5 <split [7200/800]> Fold05
 6 <split [7200/800]> Fold06
 7 <split [7200/800]> Fold07
 8 <split [7200/800]> Fold08
 9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10

We’ll use this!

set.seed(123)
taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
taxi_folds
#  10-fold cross-validation using stratification 
# A tibble: 10 × 2
   splits             id    
   <list>             <chr> 
 1 <split [7200/800]> Fold01
 2 <split [7200/800]> Fold02
 3 <split [7200/800]> Fold03
 4 <split [7200/800]> Fold04
 5 <split [7200/800]> Fold05
 6 <split [7200/800]> Fold06
 7 <split [7200/800]> Fold07
 8 <split [7200/800]> Fold08
 9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10

Fit our model to the resamples

taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
taxi_res
# Resampling results
# 10-fold cross-validation using stratification 
# A tibble: 10 × 4
   splits             id     .metrics         .notes          
   <list>             <chr>  <list>           <list>          
 1 <split [7200/800]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]>
 2 <split [7200/800]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]>
 3 <split [7200/800]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]>
 4 <split [7200/800]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]>
 5 <split [7200/800]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]>
 6 <split [7200/800]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]>
 7 <split [7200/800]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]>
 8 <split [7200/800]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]>
 9 <split [7200/800]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]>
10 <split [7200/800]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]>

Evaluating model performance

taxi_res %>%
  collect_metrics()
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.914    10 0.00262 Preprocessor1_Model1
2 roc_auc  binary     0.621    10 0.00769 Preprocessor1_Model1

We can reliably measure performance using only the training data 🎉

Comparing the metrics

taxi_res %>%
  collect_metrics() %>% 
  select(.metric, mean, n)
# A tibble: 2 × 3
  .metric   mean     n
  <chr>    <dbl> <int>
1 accuracy 0.914    10
2 roc_auc  0.621    10

Remember that:

⚠️ the training set gives you overly optimistic metrics (compare the training-set ROC AUC of 0.724 above with the resampled estimate of 0.621)

⚠️ the test set is precious, so save it for the very end

Evaluating model performance

# Save the assessment set results
ctrl_taxi <- control_resamples(save_pred = TRUE)
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)

taxi_preds <- collect_predictions(taxi_res)
taxi_preds
# A tibble: 8,000 × 7
   id     .pred_yes .pred_no  .row .pred_class tip   .config             
   <chr>      <dbl>    <dbl> <int> <fct>       <fct> <chr>               
 1 Fold01     0.967   0.0329     1 yes         yes   Preprocessor1_Model1
 2 Fold01     0.930   0.0701     6 yes         yes   Preprocessor1_Model1
 3 Fold01     0.930   0.0701    20 yes         yes   Preprocessor1_Model1
 4 Fold01     0.930   0.0701    30 yes         yes   Preprocessor1_Model1
 5 Fold01     0.930   0.0701    61 yes         yes   Preprocessor1_Model1
 6 Fold01     0.967   0.0329    90 yes         yes   Preprocessor1_Model1
 7 Fold01     0.967   0.0329    91 yes         yes   Preprocessor1_Model1
 8 Fold01     0.930   0.0701   111 yes         yes   Preprocessor1_Model1
 9 Fold01     0.914   0.0856   113 yes         no    Preprocessor1_Model1
10 Fold01     0.896   0.104    115 yes         yes   Preprocessor1_Model1
# ℹ 7,990 more rows
taxi_preds %>% 
  group_by(id) %>%
  taxi_metrics(truth = tip, estimate = .pred_class)
# A tibble: 30 × 4
   id     .metric  .estimator .estimate
   <chr>  <chr>    <chr>          <dbl>
 1 Fold01 accuracy binary         0.919
 2 Fold02 accuracy binary         0.91 
 3 Fold03 accuracy binary         0.91 
 4 Fold04 accuracy binary         0.914
 5 Fold05 accuracy binary         0.91 
 6 Fold06 accuracy binary         0.916
 7 Fold07 accuracy binary         0.908
 8 Fold08 accuracy binary         0.902
 9 Fold09 accuracy binary         0.922
10 Fold10 accuracy binary         0.931
# ℹ 20 more rows

Where are the fitted models?

taxi_res
# Resampling results
# 10-fold cross-validation using stratification 
# A tibble: 10 × 5
   splits             id     .metrics         .notes           .predictions
   <list>             <chr>  <list>           <list>           <list>      
 1 <split [7200/800]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 2 <split [7200/800]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 3 <split [7200/800]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 4 <split [7200/800]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 5 <split [7200/800]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 6 <split [7200/800]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 7 <split [7200/800]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 8 <split [7200/800]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
 9 <split [7200/800]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
10 <split [7200/800]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>    
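They are not kept: for each fold, fit_resamples() trains on the analysis set, computes metrics (and, with save_pred = TRUE, predictions) on the assessment set, and then discards the fitted model. If you do want to keep something from each fit, a sketch using the extract argument of control_resamples():

# Keep the whole fitted workflow from each resample (these can get large!)
ctrl_extract <- control_resamples(save_pred = TRUE, extract = function(x) x)

taxi_res_extracts <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_extract)

# The per-fold results land in a `.extracts` list-column
taxi_res_extracts$.extracts[[1]]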

Bootstrapping

Each bootstrap resample draws 8,000 rows with replacement for the analysis set; the rows that are never drawn (roughly a third of them, hence the varying sizes below) form the assessment set.

set.seed(3214)
bootstraps(taxi_train)
# Bootstrap sampling 
# A tibble: 25 × 2
   splits              id         
   <list>              <chr>      
 1 <split [8000/2902]> Bootstrap01
 2 <split [8000/2916]> Bootstrap02
 3 <split [8000/3004]> Bootstrap03
 4 <split [8000/2979]> Bootstrap04
 5 <split [8000/2961]> Bootstrap05
 6 <split [8000/2962]> Bootstrap06
 7 <split [8000/3026]> Bootstrap07
 8 <split [8000/2926]> Bootstrap08
 9 <split [8000/2972]> Bootstrap09
10 <split [8000/2972]> Bootstrap10
# ℹ 15 more rows

Monte Carlo Cross-Validation

set.seed(322)
mc_cv(taxi_train, times = 10)
# Monte Carlo cross-validation (0.75/0.25) with 10 resamples  
# A tibble: 10 × 2
   splits              id        
   <list>              <chr>     
 1 <split [6000/2000]> Resample01
 2 <split [6000/2000]> Resample02
 3 <split [6000/2000]> Resample03
 4 <split [6000/2000]> Resample04
 5 <split [6000/2000]> Resample05
 6 <split [6000/2000]> Resample06
 7 <split [6000/2000]> Resample07
 8 <split [6000/2000]> Resample08
 9 <split [6000/2000]> Resample09
10 <split [6000/2000]> Resample10

Validation set

set.seed(853)
validation_split(taxi_train, strata = tip)
# Validation Set Split (0.75/0.25)  using stratification 
# A tibble: 1 × 2
  splits              id        
  <list>              <chr>     
1 <split [6000/2000]> validation

The whole game - status update

Random Forest 🌳🌲🌴🌵

  • Ensemble many decision tree models

  • All the trees vote! 🗳️

  • Bootstrap aggregating + random predictor sampling

  • Often works well without tuning hyperparameters (more on this in Advanced tidymodels!), as long as there are enough trees; see the sketch after this list
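Those hyperparameters (how many predictors to sample at each split, the minimum node size) live right on the model specification. A sketch of what marking them for tuning would look like, assuming the default ranger engine (tuning itself is covered in Advanced tidymodels):

# mtry = number of predictors sampled at each split, min_n = minimum
# node size; tune() just marks them to be tuned later
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification")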

Model Specification

rf_spec <- rand_forest(trees = 1000, mode = "classification")
rf_spec
Random Forest Model Specification (classification)

Main Arguments:
  trees = 1000

Computational engine: ranger 

Workflow

rf_wflow <- workflow(tip ~ ., rf_spec)
rf_wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Formula
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
tip ~ .

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  trees = 1000

Computational engine: ranger 

Evaluate Performance

ctrl_taxi <- control_resamples(save_pred = TRUE)

set.seed(2)
rf_res <- fit_resamples(rf_wflow, taxi_folds, control = ctrl_taxi)
collect_metrics(rf_res)
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.922    10 0.00265 Preprocessor1_Model1
2 roc_auc  binary     0.634    10 0.00850 Preprocessor1_Model1

The whole game - status update

The final fit

Suppose that we are happy with our random forest model.

Let's fit the model on the training set and verify our performance using the test set.

We've shown you fit() and predict() (+ augment()), but there is a shortcut:

# taxi_split has train + test info
final_fit <- last_fit(rf_wflow, taxi_split) 

final_fit
# Resampling results
# Manual resampling 
# A tibble: 1 × 6
  splits              id               .metrics .notes   .predictions .workflow 
  <list>              <chr>            <list>   <list>   <list>       <list>    
1 <split [8000/2000]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(final_fit)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.921 Preprocessor1_Model1
2 roc_auc  binary         0.631 Preprocessor1_Model1

These are metrics computed with the test set
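For comparison, a minimal sketch of the longer route that last_fit() wraps up (fit on the training set, then evaluate on the test set by hand); the numbers will be close but not identical since the forest is not refit with the same seed:

# The manual version of last_fit(): fit on training, assess on testing
rf_fit <- fit(rf_wflow, taxi_train)

augment(rf_fit, new_data = taxi_test) |>
  roc_auc(truth = tip, .pred_yes)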

collect_predictions(final_fit)
# A tibble: 2,000 × 7
   id               .pred_yes .pred_no  .row .pred_class tip   .config          
   <chr>                <dbl>    <dbl> <int> <fct>       <fct> <chr>            
 1 train/test split     0.988   0.0123     3 yes         yes   Preprocessor1_Mo…
 2 train/test split     0.982   0.0180     7 yes         yes   Preprocessor1_Mo…
 3 train/test split     0.963   0.0374     8 yes         yes   Preprocessor1_Mo…
 4 train/test split     0.980   0.0197    13 yes         yes   Preprocessor1_Mo…
 5 train/test split     0.849   0.151     29 yes         yes   Preprocessor1_Mo…
 6 train/test split     0.943   0.0574    37 yes         yes   Preprocessor1_Mo…
 7 train/test split     0.867   0.133     45 yes         yes   Preprocessor1_Mo…
 8 train/test split     0.885   0.115     46 yes         yes   Preprocessor1_Mo…
 9 train/test split     0.949   0.0507    49 yes         yes   Preprocessor1_Mo…
10 train/test split     0.989   0.0114    53 yes         yes   Preprocessor1_Mo…
# ℹ 1,990 more rows
extract_workflow(final_fit)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Formula
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
tip ~ .

── Model ───────────────────────────────────────────────────────────────────────
Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000,      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1), probability = TRUE) 

Type:                             Probability estimation 
Number of trees:                  1000 
Sample size:                      8000 
Number of independent variables:  6 
Mtry:                             2 
Target node size:                 10 
Variable importance mode:         none 
Splitrule:                        gini 
OOB prediction error (Brier s.):  0.07171122 

Use this fitted workflow to predict on new data, for example when deploying the model.
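A minimal sketch of what that looks like, treating a few test-set rows as if they were brand-new rides:

taxi_final_wflow <- extract_workflow(final_fit)

# Predicted class and class probabilities for the "new" rides
predict(taxi_final_wflow, new_data = taxi_test |> slice(1:3))
predict(taxi_final_wflow, new_data = taxi_test |> slice(1:3), type = "prob")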

The whole game