In this document, we will carry out a binary classification job using tidymodels on a modeldata dataset.
The lending_club Dataset
These data were downloaded from the Lending Club access site and are from the first quarter of 2016. A subset of the rows and variables are included here. The outcome is in the variable Class and is either good (meaning that the loan was fully paid back or currently on-time) or bad (charged off, defaulted, of 21-120 days late).
data (lending_club)
str (lending_club)
tibble [9,857 × 23] (S3: tbl_df/tbl/data.frame)
$ funded_amnt : int [1:9857] 16100 32000 10000 16800 3500 10000 11000 15000 6000 20000 ...
$ term : Factor w/ 2 levels "term_36","term_60": 1 2 1 2 1 1 1 1 1 2 ...
$ int_rate : num [1:9857] 13.99 11.99 16.29 13.67 7.39 ...
$ sub_grade : Factor w/ 35 levels "A1","A2","A3",..: 14 11 16 13 4 10 1 7 8 12 ...
$ addr_state : Factor w/ 50 levels "AK","AL","AR",..: 7 23 35 33 5 43 17 24 34 11 ...
$ verification_status : Factor w/ 3 levels "Not_Verified",..: 1 3 2 3 2 2 1 1 2 1 ...
$ annual_inc : num [1:9857] 35000 72000 72000 101000 50100 32000 65000 188000 89000 48000 ...
$ emp_length : Factor w/ 12 levels "emp_lt_1","emp_1",..: 7 3 3 1 12 1 6 8 3 3 ...
$ delinq_2yrs : int [1:9857] 0 0 0 0 0 0 0 2 0 1 ...
$ inq_last_6mths : int [1:9857] 0 0 2 0 0 0 0 0 2 0 ...
$ revol_util : num [1:9857] 67.4 82.2 74.2 64 78.3 68.1 54.8 33.3 72.3 36 ...
$ acc_now_delinq : int [1:9857] 0 0 0 0 0 0 0 0 0 0 ...
$ open_il_6m : int [1:9857] 1 5 3 6 0 1 2 8 1 0 ...
$ open_il_12m : int [1:9857] 0 1 2 1 0 0 0 2 0 0 ...
$ open_il_24m : int [1:9857] 0 3 3 2 0 0 0 5 0 0 ...
$ total_bal_il : int [1:9857] 1099 49187 33378 55445 0 7574 20998 170814 19616 0 ...
$ all_util : int [1:9857] 48 77 79 64 78 66 54 80 75 36 ...
$ inq_fi : int [1:9857] 0 0 1 1 0 0 0 0 2 1 ...
$ inq_last_12m : int [1:9857] 3 0 4 4 0 0 0 4 6 1 ...
$ delinq_amnt : int [1:9857] 0 0 0 0 0 0 0 0 0 0 ...
$ num_il_tl : int [1:9857] 3 9 9 10 3 9 7 21 7 1 ...
$ total_il_high_credit_limit: int [1:9857] 13193 65945 39387 60188 0 12131 39486 177477 26129 0 ...
$ Class : Factor w/ 2 levels "bad","good": 2 2 2 2 2 2 2 2 2 2 ...
Predicting Credit Default
Let’s define a workflow to define a model for lending_club.
Initial Split
Perform an adequate split of the dataset into train and test sets. Keep 80% of data for the training set.
lending_club <- janitor:: clean_names (lending_club)
lending_club|>
ggplot (aes (class)) +
geom_bar () +
theme_minimal (base_size = 14 ) +
labs (title = "status target variable" , x = NULL , y = NULL )
lc_split <- initial_split (lending_club, prop = 0.8 , strata = "class" )
lc_split
<Training/Testing/Total>
<7885/1972/9857>
Preprocessing
Preprocess the data adequately. Some useful steps:
Turn factors into dummy variables.
Filter for near-zero variance features and for correlated features.
lc_rec <- recipe (class ~ ., training (lc_split)) |>
step_dummy (all_nominal_predictors ()) |>
step_corr (all_numeric_predictors ()) |>
step_nzv (all_numeric_predictors ())
lc_rec |> prep () |> bake (new_data = NULL )
# A tibble: 7,885 × 38
funded_amnt int_rate annual_inc delinq_2yrs inq_last_6mths revol_util
<int> <dbl> <dbl> <int> <int> <dbl>
1 32000 12.0 72000 0 0 82.2
2 10000 16.3 72000 0 2 74.2
3 16800 13.7 101000 0 0 64
4 3500 7.39 50100 0 0 78.3
5 11000 5.32 65000 0 0 54.8
6 15000 9.16 188000 2 0 33.3
7 6000 9.8 89000 0 2 72.3
8 20000 13.0 48000 1 0 36
9 10000 13.7 40000 0 3 21.9
10 10000 18.2 75000 0 3 78.3
# ℹ 7,875 more rows
# ℹ 32 more variables: open_il_6m <int>, open_il_12m <int>, open_il_24m <int>,
# all_util <int>, inq_fi <int>, inq_last_12m <int>, num_il_tl <int>,
# total_il_high_credit_limit <int>, class <fct>, term_term_60 <dbl>,
# sub_grade_B1 <dbl>, sub_grade_B2 <dbl>, sub_grade_B3 <dbl>,
# sub_grade_B4 <dbl>, sub_grade_B5 <dbl>, sub_grade_C1 <dbl>,
# sub_grade_C2 <dbl>, sub_grade_C3 <dbl>, sub_grade_C4 <dbl>, …
lc_rec |> prep () |> bake (new_data = testing (lc_split))
# A tibble: 1,972 × 38
funded_amnt int_rate annual_inc delinq_2yrs inq_last_6mths revol_util
<int> <dbl> <dbl> <int> <int> <dbl>
1 16100 14.0 35000 0 0 67.4
2 10000 11.5 32000 0 0 68.1
3 25750 13.7 56000 1 0 38.6
4 7000 13.0 73400 0 2 43.4
5 35000 11.5 175000 0 0 60.2
6 9000 26.6 26500 2 2 26
7 10000 10.8 52000 1 2 58.9
8 12000 9.16 74880 0 1 53.1
9 7500 7.91 46000 0 1 26.4
10 35000 13.7 103000 4 0 62.4
# ℹ 1,962 more rows
# ℹ 32 more variables: open_il_6m <int>, open_il_12m <int>, open_il_24m <int>,
# all_util <int>, inq_fi <int>, inq_last_12m <int>, num_il_tl <int>,
# total_il_high_credit_limit <int>, class <fct>, term_term_60 <dbl>,
# sub_grade_B1 <dbl>, sub_grade_B2 <dbl>, sub_grade_B3 <dbl>,
# sub_grade_B4 <dbl>, sub_grade_B5 <dbl>, sub_grade_C1 <dbl>,
# sub_grade_C2 <dbl>, sub_grade_C3 <dbl>, sub_grade_C4 <dbl>, …
Models
Define two models to analyze that data:
#1
lr <- logistic_reg ()
#2
rf <- rand_forest (mode = "classification" ) |>
set_engine ("ranger" )
#3
bt <- boost_tree (mode = "classification" ) |>
set_engine ("xgboost" )
##########
lc_lr_wf <- workflow () |>
add_recipe (lc_rec) |>
add_model (lr)
lc_bt_wf <- workflow () |>
add_recipe (lc_rec) |>
add_model (bt)
lc_rf_wf <- workflow () |>
add_recipe (lc_rec) |>
add_model (rf)
folds <- vfold_cv (training (lc_split), v = 10 )
Cross Validation
Define a set of ten folders for the training set, and test the two models with cross validation using as metrics accuracy, sensitivity and specificity.
#FOLDS FOR CROSS VALIDATION
folds <- vfold_cv (training (lc_split), v = 10 , strata = "class" )
folds
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [7096/789]> Fold01
2 <split [7096/789]> Fold02
3 <split [7096/789]> Fold03
4 <split [7096/789]> Fold04
5 <split [7096/789]> Fold05
6 <split [7097/788]> Fold06
7 <split [7097/788]> Fold07
8 <split [7097/788]> Fold08
9 <split [7097/788]> Fold09
10 <split [7097/788]> Fold10
class_metrics <- metric_set (accuracy, sens, spec)
class_metrics <- metric_set (accuracy, sens, spec)
lc_lr_cv <- lc_lr_wf |>
fit_resamples (folds, metrics = class_metrics)
lc_bt_cv <- lc_bt_wf |>
fit_resamples (folds, metrics = class_metrics)
Warning: package 'xgboost' was built under R version 4.3.3
lc_rf_cv <- lc_rf_wf |>
fit_resamples (folds, metrics = class_metrics)
Warning: package 'ranger' was built under R version 4.3.3
Model Decision
Which model would you choose for this dataset?
lc_lr_cv |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.947 10 0.00284 Preprocessor1_Model1
2 sens binary 0.00709 10 0.00361 Preprocessor1_Model1
3 spec binary 0.999 10 0.000404 Preprocessor1_Model1
lc_bt_cv |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.947 10 0.00285 Preprocessor1_Model1
2 sens binary 0.0100 10 0.00549 Preprocessor1_Model1
3 spec binary 0.999 10 0.000490 Preprocessor1_Model1
lc_rf_cv |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.947 10 0.00267 Preprocessor1_Model1
2 sens binary 0 10 0 Preprocessor1_Model1
3 spec binary 1 10 0 Preprocessor1_Model1
I will choose Loistic Regresion
Improving the Model
A possible way to improve results can be undersampling or oversampling. We can to that with the themis package: https://themis.tidymodels.org/index.html
Warning: package 'themis' was built under R version 4.3.3
Oversampling
Oversampling consists of generating artificial observations of the minority class, so that we train the model with a balanced dataset. We can do that using the step_smote() function when defining the recipe.
Take a look at the function documentation to see how it works, and:
Define a new recipe including oversampling with the SMOTE algorithm.
Use the recipe to evaluate both models with cross validation.
Function documentation: https://themis.tidymodels.org/reference/step_smote.html
lc_rec_SM <- recipe (class ~ ., training (lc_split)) |>
step_dummy (all_nominal_predictors ()) |>
step_corr (all_numeric_predictors ()) |>
step_nzv (all_numeric_predictors ()) |>
step_smote (class)
lc_lr_wf_SM <- workflow () |>
add_recipe (lc_rec_SM) |>
add_model (lr)
lc_rf_wf_SM <- workflow () |>
add_recipe (lc_rec_SM) |>
add_model (rf)
lc_lr_cv_SM <- lc_lr_wf_SM |>
fit_resamples (folds, metrics = class_metrics)
lc_rf_cv_SM <- lc_rf_wf_SM |>
fit_resamples (folds, metrics = class_metrics)
Undersampling
Oversampling consists of removing artificial observations of the minority class, so that we train the model with a balanced dataset. We can do that using the step_downsample() function when defining the recipe.
Evaluate a model using undersampling with step_downsample() in a similar way to the SMOTE algorithm.
#step_downsample()
lc_rec_DS <- recipe (class ~ ., training (lc_split)) |>
step_dummy (all_nominal_predictors ()) |>
step_corr (all_numeric_predictors ()) |>
step_nzv (all_numeric_predictors ()) |>
step_downsample (class)
RECIPE
lc_lr_wf_DS <- workflow () |>
add_recipe (lc_rec_DS) |>
add_model (lr)
lc_rf_wf_DS <- workflow () |>
add_recipe (lc_rec_DS) |>
add_model (rf)
WORKFLOWS
lc_lr_cv_DS <- lc_lr_wf_DS |>
fit_resamples (folds, metrics = class_metrics)
lc_rf_cv_DS <- lc_rf_wf_DS |>
fit_resamples (folds, metrics = class_metrics)
Model Selection
Evaluate the four models and select the most convenient.
lc_lr_cv_SM |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.711 10 0.00352 Preprocessor1_Model1
2 sens binary 0.642 10 0.0160 Preprocessor1_Model1
3 spec binary 0.715 10 0.00424 Preprocessor1_Model1
lc_rf_cv_SM |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.948 10 0.00268 Preprocessor1_Model1
2 sens binary 0.00488 10 0.00325 Preprocessor1_Model1
3 spec binary 1 10 0 Preprocessor1_Model1
lc_lr_cv_DS |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.673 10 0.00630 Preprocessor1_Model1
2 sens binary 0.663 10 0.0191 Preprocessor1_Model1
3 spec binary 0.673 10 0.00705 Preprocessor1_Model1
lc_rf_cv_DS |>
collect_metrics ()
# A tibble: 3 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.649 10 0.00636 Preprocessor1_Model1
2 sens binary 0.692 10 0.0195 Preprocessor1_Model1
3 spec binary 0.647 10 0.00666 Preprocessor1_Model1
I choose the logistic regresion Downsampling model.
Training of Final Model
Decide which model you select and:
Train it in the whole train set.
Test performance in the test set.
Results:
modelo_final <- lc_lr_wf_DS |>
fit (training (lc_split))
modelo_final <- modelo_final|>
predict (testing (lc_split)) |>
bind_cols (testing (lc_split) |> select (class)) |>
class_metrics (truth = class, estimate = .pred_class)
modelo_final
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.703
2 sens binary 0.696
3 spec binary 0.704