From the previous section:
library(tidymodels)
library(modeldatatoo)
taxi <- data_taxi() |>
  drop_na()
# Data Budget
set.seed(18)
taxi_split <- initial_split(taxi, prop = 0.8, strata = tip)
taxi_split
<Training/Testing/Total>
<8000/2000/10000>
taxi_train <- training(taxi_split)
taxi_test <- testing(taxi_split)
tree_spec <- decision_tree(cost_complexity = 0.0001, mode = "classification")
taxi_wflow <- workflow(tip ~ ., tree_spec)
taxi_fit <- fit(taxi_wflow, taxi_train)
Looking at predictions
augment(taxi_fit, new_data = taxi_train) |>
  relocate(tip, .pred_class, .pred_yes, .pred_no)
# A tibble: 8,000 × 10
tip .pred_class .pred_yes .pred_no distance company local dow month hour
<fct> <fct> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct> <int>
1 yes yes 0.973 0.0271 17.2 Chicag… no Thu Feb 16
2 yes yes 0.909 0.0907 0.88 City S… yes Thu Mar 8
3 yes yes 0.968 0.0316 20.7 Chicag… no Mon Apr 8
4 yes yes 0.952 0.0484 12.2 Chicag… no Sun Mar 21
5 yes yes 0.812 0.188 0.94 Sun Ta… yes Sat Apr 23
6 yes yes 0.932 0.0680 1.85 Taxica… no Fri Apr 12
7 yes yes 0.932 0.0680 1.47 City S… no Tue Mar 14
8 yes yes 0.954 0.0465 0.53 Sun Ta… no Tue Mar 18
9 yes yes 0.886 0.114 6.65 Taxica… no Sun Apr 11
10 yes yes 0.909 0.0907 1.21 Sun Ta… yes Thu Apr 19
# ℹ 7,990 more rows
Confusion Matrix
augment(taxi_fit, new_data = taxi_train) |>
  conf_mat(truth = tip, estimate = .pred_class)
Truth
Prediction yes no
yes 7314 533
no 57 96
augment(taxi_fit, new_data = taxi_train) |>
  conf_mat(truth = tip, estimate = .pred_class) |>
  autoplot(type = "heatmap")
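The confusion matrix boils down to a single accuracy number: (7314 + 96) / 8000 ≈ 0.926. A quick sketch using yardstick's accuracy(); keep in mind this is computed on the same data the tree was fit to, so expect it to be optimistic (compare with the resampled estimate of 0.914 later):
# Training-set accuracy: (7314 + 96) / 8000 ≈ 0.926 -- an optimistic estimate
augment(taxi_fit, new_data = taxi_train) |>
  accuracy(truth = tip, estimate = .pred_class)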
How can we use the training data to compare and evaluate different models?
Cross-validation
vfold_cv(taxi_train)  # v = 10 is default
# 10-fold cross-validation
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [7200/800]> Fold01
2 <split [7200/800]> Fold02
3 <split [7200/800]> Fold03
4 <split [7200/800]> Fold04
5 <split [7200/800]> Fold05
6 <split [7200/800]> Fold06
7 <split [7200/800]> Fold07
8 <split [7200/800]> Fold08
9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10
taxi_folds <- vfold_cv(taxi_train)
taxi_folds$splits[1:3]
[[1]]
<Analysis/Assess/Total>
<7200/800/8000>
[[2]]
<Analysis/Assess/Total>
<7200/800/8000>
[[3]]
<Analysis/Assess/Total>
<7200/800/8000>
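Each split object just points to rows; analysis() and assessment() return the actual data frames. A small sketch:
# The analysis set (7200 rows) is used for fitting within a resample;
# the assessment set (800 rows) is used for measuring performance
analysis(taxi_folds$splits[[1]]) |> dim()
assessment(taxi_folds$splits[[1]]) |> dim()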
vfold_cv(taxi_train, v = 5)
# 5-fold cross-validation
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [6400/1600]> Fold1
2 <split [6400/1600]> Fold2
3 <split [6400/1600]> Fold3
4 <split [6400/1600]> Fold4
5 <split [6400/1600]> Fold5
vfold_cv(taxi_train, strata = tip)
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [7200/800]> Fold01
2 <split [7200/800]> Fold02
3 <split [7200/800]> Fold03
4 <split [7200/800]> Fold04
5 <split [7200/800]> Fold05
6 <split [7200/800]> Fold06
7 <split [7200/800]> Fold07
8 <split [7200/800]> Fold08
9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10
We’ll use this!
set.seed(123)
taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
taxi_folds
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [7200/800]> Fold01
2 <split [7200/800]> Fold02
3 <split [7200/800]> Fold03
4 <split [7200/800]> Fold04
5 <split [7200/800]> Fold05
6 <split [7200/800]> Fold06
7 <split [7200/800]> Fold07
8 <split [7200/800]> Fold08
9 <split [7200/800]> Fold09
10 <split [7200/800]> Fold10
Fit our model to the resamples
taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
taxi_res
# Resampling results
# 10-fold cross-validation using stratification
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [7200/800]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]>
2 <split [7200/800]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]>
3 <split [7200/800]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]>
4 <split [7200/800]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]>
5 <split [7200/800]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]>
6 <split [7200/800]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]>
7 <split [7200/800]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]>
8 <split [7200/800]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]>
9 <split [7200/800]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]>
10 <split [7200/800]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]>
Evaluating model performance
taxi_res %>%
  collect_metrics()
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.914 10 0.00262 Preprocessor1_Model1
2 roc_auc binary 0.621 10 0.00769 Preprocessor1_Model1
We can reliably measure performance using only the training data 🎉
Comparing the metrics
taxi_res %>%
  collect_metrics() %>%
  select(.metric, mean, n)
# A tibble: 2 × 3
.metric mean n
<chr> <dbl> <int>
1 accuracy 0.914 10
2 roc_auc 0.621 10
Remember that:
⚠️ the training set gives you overly optimistic metrics
⚠️ the test set is precious
Evaluating model performance
# Save the assessment set results
ctrl_taxi <- control_resamples(save_pred = TRUE)
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
taxi_preds <- collect_predictions(taxi_res)
taxi_preds
# A tibble: 8,000 × 7
id .pred_yes .pred_no .row .pred_class tip .config
<chr> <dbl> <dbl> <int> <fct> <fct> <chr>
1 Fold01 0.967 0.0329 1 yes yes Preprocessor1_Model1
2 Fold01 0.930 0.0701 6 yes yes Preprocessor1_Model1
3 Fold01 0.930 0.0701 20 yes yes Preprocessor1_Model1
4 Fold01 0.930 0.0701 30 yes yes Preprocessor1_Model1
5 Fold01 0.930 0.0701 61 yes yes Preprocessor1_Model1
6 Fold01 0.967 0.0329 90 yes yes Preprocessor1_Model1
7 Fold01 0.967 0.0329 91 yes yes Preprocessor1_Model1
8 Fold01 0.930 0.0701 111 yes yes Preprocessor1_Model1
9 Fold01 0.914 0.0856 113 yes no Preprocessor1_Model1
10 Fold01 0.896 0.104 115 yes yes Preprocessor1_Model1
# ℹ 7,990 more rows
# taxi_metrics is a metric set (created earlier with metric_set())
taxi_preds %>%
  group_by(id) %>%
  taxi_metrics(truth = tip, estimate = .pred_class)
# A tibble: 30 × 4
id .metric .estimator .estimate
<chr> <chr> <chr> <dbl>
1 Fold01 accuracy binary 0.919
2 Fold02 accuracy binary 0.91
3 Fold03 accuracy binary 0.91
4 Fold04 accuracy binary 0.914
5 Fold05 accuracy binary 0.91
6 Fold06 accuracy binary 0.916
7 Fold07 accuracy binary 0.908
8 Fold08 accuracy binary 0.902
9 Fold09 accuracy binary 0.922
10 Fold10 accuracy binary 0.931
# ℹ 20 more rows
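To eyeball the fold-to-fold variation, the per-fold estimates can be plotted directly (a sketch; ggplot2 is attached along with tidymodels):
taxi_preds %>%
  group_by(id) %>%
  taxi_metrics(truth = tip, estimate = .pred_class) %>%
  filter(.metric == "accuracy") %>%
  ggplot(aes(x = id, y = .estimate)) +
  geom_point() +
  labs(x = NULL, y = "Accuracy per assessment set")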
Where are the fitted models?
taxi_res
# Resampling results
# 10-fold cross-validation using stratification
# A tibble: 10 × 5
splits id .metrics .notes .predictions
<list> <chr> <list> <list> <list>
1 <split [7200/800]> Fold01 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
2 <split [7200/800]> Fold02 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
3 <split [7200/800]> Fold03 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
4 <split [7200/800]> Fold04 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
5 <split [7200/800]> Fold05 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
6 <split [7200/800]> Fold06 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
7 <split [7200/800]> Fold07 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
8 <split [7200/800]> Fold08 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
9 <split [7200/800]> Fold09 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
10 <split [7200/800]> Fold10 <tibble [2 × 4]> <tibble [0 × 3]> <tibble>
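They are not kept: by default fit_resamples() discards each fold's fitted model and keeps only the metrics (and, with save_pred = TRUE, the predictions). If you do need them, control_resamples() has an extract argument; a minimal sketch (ctrl_keep and taxi_res_keep are just illustrative names):
# Keep each fold's fitted workflow alongside the metrics
ctrl_keep <- control_resamples(extract = function(x) x, save_pred = TRUE)
taxi_res_keep <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_keep)
# .extracts is a list column; each element is a tibble containing
# that fold's fitted workflow
taxi_res_keep$.extracts[[1]]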
Bootstrapping
set.seed(3214)
bootstraps(taxi_train)
# Bootstrap sampling
# A tibble: 25 × 2
splits id
<list> <chr>
1 <split [8000/2902]> Bootstrap01
2 <split [8000/2916]> Bootstrap02
3 <split [8000/3004]> Bootstrap03
4 <split [8000/2979]> Bootstrap04
5 <split [8000/2961]> Bootstrap05
6 <split [8000/2962]> Bootstrap06
7 <split [8000/3026]> Bootstrap07
8 <split [8000/2926]> Bootstrap08
9 <split [8000/2972]> Bootstrap09
10 <split [8000/2972]> Bootstrap10
# ℹ 15 more rows
Monte Carlo Cross-Validation
set.seed(322)
mc_cv(taxi_train, times = 10)
# Monte Carlo cross-validation (0.75/0.25) with 10 resamples
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [6000/2000]> Resample01
2 <split [6000/2000]> Resample02
3 <split [6000/2000]> Resample03
4 <split [6000/2000]> Resample04
5 <split [6000/2000]> Resample05
6 <split [6000/2000]> Resample06
7 <split [6000/2000]> Resample07
8 <split [6000/2000]> Resample08
9 <split [6000/2000]> Resample09
10 <split [6000/2000]> Resample10
Validation set
set.seed(853)
validation_split(taxi_train, strata = tip)
# Validation Set Split (0.75/0.25) using stratification
# A tibble: 1 × 2
splits id
<list> <chr>
1 <split [6000/2000]> validation
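Any of these resampling objects can stand in for taxi_folds in fit_resamples(); a sketch using the bootstrap samples (taxi_boots and taxi_boot_res are just illustrative names):
set.seed(3214)
taxi_boots <- bootstraps(taxi_train)
# Same workflow, different resampling scheme
taxi_boot_res <- fit_resamples(taxi_wflow, taxi_boots)
collect_metrics(taxi_boot_res)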
The whole game - status update
Random Forest 🌳🌲🌴🌵
Often works well without tuning hyperparameters (more on this in Advanced tidymodels!), as long as there are enough trees
Model Specification
rf_spec <- rand_forest(trees = 1000, mode = "classification")
rf_spec
Random Forest Model Specification (classification)
Main Arguments:
trees = 1000
Computational engine: ranger
Workflow
rf_wflow <- workflow(tip ~ ., rf_spec)
rf_wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Formula
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
tip ~ .
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)
Main Arguments:
trees = 1000
Computational engine: ranger
Evaluate Performance
ctrl_taxi <- control_resamples(save_pred = TRUE)
set.seed(2)
rf_res <- fit_resamples(rf_wflow, taxi_folds, control = ctrl_taxi)
collect_metrics(rf_res)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.922 10 0.00265 Preprocessor1_Model1
2 roc_auc binary 0.634 10 0.00850 Preprocessor1_Model1
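With both models resampled on the same folds, their metrics can be put side by side (a sketch, assuming taxi_res from the decision tree above is still in the session):
bind_rows(
  collect_metrics(taxi_res) %>% mutate(model = "decision tree"),
  collect_metrics(rf_res) %>% mutate(model = "random forest")
) %>%
  select(model, .metric, mean, std_err)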
The whole game - status update
The final fit
Suppose that we are happy with our random forest model.
Let's fit the model on the training set and verify our performance using the test set.
We've shown you fit() and predict() (+ augment()), but there is a shortcut:
# taxi_split has train + test info
final_fit <- last_fit(rf_wflow, taxi_split)
final_fit
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [8000/2000]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(final_fit)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.921 Preprocessor1_Model1
2 roc_auc binary 0.631 Preprocessor1_Model1
These are metrics computed with the test set
collect_predictions(final_fit)
# A tibble: 2,000 × 7
id .pred_yes .pred_no .row .pred_class tip .config
<chr> <dbl> <dbl> <int> <fct> <fct> <chr>
1 train/test split 0.988 0.0123 3 yes yes Preprocessor1_Mo…
2 train/test split 0.982 0.0180 7 yes yes Preprocessor1_Mo…
3 train/test split 0.963 0.0374 8 yes yes Preprocessor1_Mo…
4 train/test split 0.980 0.0197 13 yes yes Preprocessor1_Mo…
5 train/test split 0.849 0.151 29 yes yes Preprocessor1_Mo…
6 train/test split 0.943 0.0574 37 yes yes Preprocessor1_Mo…
7 train/test split 0.867 0.133 45 yes yes Preprocessor1_Mo…
8 train/test split 0.885 0.115 46 yes yes Preprocessor1_Mo…
9 train/test split 0.949 0.0507 49 yes yes Preprocessor1_Mo…
10 train/test split 0.989 0.0114 53 yes yes Preprocessor1_Mo…
# ℹ 1,990 more rows
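Beyond single-number summaries, the test-set predictions can feed an ROC curve (a sketch, assuming "yes" is the first factor level of tip and hence the event by default):
collect_predictions(final_fit) %>%
  roc_curve(truth = tip, .pred_yes) %>%
  autoplot()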
extract_workflow(final_fit)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Formula
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
tip ~ .
── Model ───────────────────────────────────────────────────────────────────────
Ranger result
Call:
ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
Type: Probability estimation
Number of trees: 1000
Sample size: 8000
Number of independent variables: 6
Mtry: 2
Target node size: 10
Variable importance mode: none
Splitrule: gini
OOB prediction error (Brier s.): 0.07171122
Use this fitted workflow for prediction on new data, for example when deploying your model.
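A minimal sketch of that hand-off; deployed_wflow is an illustrative name and new_taxi stands in for a hypothetical data frame of new rides with the same predictor columns:
deployed_wflow <- extract_workflow(final_fit)
# Class predictions and probabilities for new rides (new_taxi is hypothetical)
predict(deployed_wflow, new_data = new_taxi)
augment(deployed_wflow, new_data = new_taxi)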