Load required packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(here,        # for referencing files and folders
               tidyverse,   # for data reading wrangling and visualization
               tidymodels , # for data modeling
               ggmosaic,    # for mosaic plots
               gamlr)       # for running the gamma lasso algorithm

Read and pre-process the data

Read the data

credit_raw <- read_csv(here( "credit.csv"))

Create some “interesting” variables by re-leveling the credit history and checking account status

credit_processed <- credit_raw %>%
  mutate(history = fct_collapse(history,
    good     = c("A30", "A31"),
    poor     = c("A32", "A33"),
    terrible = c("A34")
  )) %>%
  mutate(foreign = fct_recode(foreign,
    "foreign" = "A201",
    "german"  = "A202"
  )) %>%
  mutate(purpose = fct_collapse(purpose,
    newcar       = c("A40"),
    usedcar      = c("A41"),
    goods_repair = c("A42", "A43", "A44", "A45"),
    edu          = c("A46", "A48"),
    na           = c("A47"),
    biz          = c("A49", "A410")
  )) %>% 
  mutate(rent = factor(housing == "A151"))

Our analysis focuses on a subset of nine variables

credit_df <- credit_processed %>% 
  select(Default, duration, amount,
         installment, age, history,
         purpose, foreign, rent)

Plot a mosaic using the ggmosaic package

credit_df %>% 
  mutate(Default = as_factor(Default)) %>% 
  ggplot() +
  geom_mosaic(aes(x = product(Default, history), fill = Default)) + 
  labs(x = "History",
       y = "Default",
       fill = "")

Surprise! the dangers of choice-based sampling!

Build a design matrix

  • recipe() starts a new set of transformations to be applied, Its main argument is a formula that defines the outcome (Default in our case) and the predictors.
  • step_x() defines data transformations.
  • prep() prepares the transformations based on the data that is supplied.
  • juice() to generate the outcome and predictors matrix to be used in the eatimation step
credit_recipe <- credit_df %>% 
  recipe(Default ~ .) %>% # defines Default as outcome and the rest as predictors
  step_dummy(all_nominal(), -Default,
             one_hot = TRUE) %>% # factors to dummies, keep all levels
  step_interact(~ all_predictors():all_predictors()) %>% # pairwise interactions
  step_zv(all_predictors()) %>% # remove redundent interactions
  prep()

credit_juiced <- juice(credit_recipe) 

head(credit_juiced)
## # A tibble: 6 × 121
##   duration amount installment   age Default history_good history_poor
##      <dbl>  <dbl>       <dbl> <dbl>   <dbl>        <dbl>        <dbl>
## 1        6   1169           4    67       0            0            0
## 2       48   5951           2    22       1            0            1
## 3       12   2096           2    49       0            0            0
## 4       42   7882           2    45       0            0            1
## 5       24   4870           3    53       1            0            1
## 6       36   9055           2    35       0            0            1
## # ℹ 114 more variables: history_terrible <dbl>, purpose_newcar <dbl>,
## #   purpose_usedcar <dbl>, purpose_biz <dbl>, purpose_goods_repair <dbl>,
## #   purpose_edu <dbl>, foreign_foreign <dbl>, foreign_german <dbl>,
## #   rent_FALSE. <dbl>, rent_TRUE. <dbl>, duration_x_amount <dbl>,
## #   duration_x_installment <dbl>, duration_x_age <dbl>,
## #   duration_x_history_good <dbl>, duration_x_history_poor <dbl>,
## #   duration_x_history_terrible <dbl>, duration_x_purpose_newcar <dbl>, …

Estimate the model using cv.gamlr()

cred_X <- credit_juiced %>% select(-Default) # predictors matrix
cred_Y <- credit_juiced %>% select(Default)  # outcome vector

credscore <- cv.gamlr(x = cred_X, y = cred_Y,
                      family = "binomial",
                      verb = TRUE)
## fold 1,2,3,4,5,done.

Plot the lasso’s regularization path and the cross-validation error for each value of \(\lambda\)

plot(credscore$gamlr)

plot(credscore)

For the selected set of random folds, check how many variables are selected by each selection criterion

model_spec <- tribble(~criterion, ~var_selected,
  "min",  sum(coef(credscore, s = "min") != 0),
  "AICc", sum(coef(credscore$gamlr) != 0),
  "1se",  sum(coef(credscore) != 0),
  "AIC",  sum(coef(credscore$gamlr, s = which.min(AIC(credscore$gamlr))) != 0),
  "BIC",  sum(coef(credscore$gamlr, s = which.min(BIC(credscore$gamlr))) != 0))

model_spec
## # A tibble: 5 × 2
##   criterion var_selected
##   <chr>            <int>
## 1 min                 20
## 2 AICc                21
## 3 1se                 13
## 4 AIC                 21
## 5 BIC                 19

Augment the credit_df data with the model’s prediction vector

pred <- predict(credscore$gamlr, newdata = cred_X, type="response") %>% 
  drop() # remove the sparse matrix formatting

credit_df_pred <- credit_df %>% 
  mutate(.pred_prob = pred) %>% 
  select(Default, .pred_prob)

head(credit_df_pred)
## # A tibble: 6 × 2
##   Default .pred_prob
##     <dbl>      <dbl>
## 1       0     0.0498
## 2       1     0.410 
## 3       0     0.127 
## 4       0     0.379 
## 5       1     0.413 
## 6       0     0.461

Show in sample fitted probability (via a boxplot)

credit_df_pred %>% 
  ggplot(aes(x = as_factor(Default),
             y = .pred_prob,
             color = as_factor(Default))) +
  geom_boxplot() +
  labs(x = "Default",
       y = "Fitted probability of default",
       color = "Default")

Misclassification rates

Classify predictions to “1” and “0” based on an arbitrary rule

rule <- 0.20 # move this around to see how these change

credit_df_rule <- credit_df_pred %>% 
  mutate(.pred_class = case_when(.pred_prob >= rule ~ 1,
                                 .pred_prob <  rule ~ 0))

head(credit_df_rule)
## # A tibble: 6 × 3
##   Default .pred_prob .pred_class
##     <dbl>      <dbl>       <dbl>
## 1       0     0.0498           0
## 2       1     0.410            1
## 3       0     0.127            0
## 4       0     0.379            1
## 5       1     0.413            1
## 6       0     0.461            1

What are the misclassification rates?

options(yardstick.event_first = FALSE) # consider "1" as the "positive" result

sensitivity <- credit_df_rule %>%
  mutate(
    Default = as.factor(Default),
    .pred_class = as.factor(.pred_class)
  ) %>%
  sens(truth = Default, estimate = .pred_class) %>%
  pull(.estimate)

specificity <- credit_df_rule %>%
  mutate(
    Default = as.factor(Default),
    .pred_class = as.factor(.pred_class)
  ) %>%
  spec(truth = Default, estimate = .pred_class) %>%
  pull(.estimate)



accuracy_tbl <- tribble(~measure, ~value,
                        "false positive rate", 1 - specificity,
                        "false negative rate", 1 - sensitivity,
                        "sensitivity", sensitivity,
                        "specificity", specificity)

accuracy_tbl
## # A tibble: 4 × 2
##   measure              value
##   <chr>                <dbl>
## 1 false positive rate 0.0767
## 2 false negative rate 0.609 
## 3 sensitivity         0.391 
## 4 specificity         0.923

Out of sample Prediction

Split the sample in half

set.seed(1234) # for the replicating the results

credit_split <- credit_df %>% 
  initial_split(prop = 0.5) 

credit_split
## <Training/Testing/Total>
## <500/500/1000>

Prepare the recipe to be used later to generate the training and test samples

credit_recipe <- training(credit_split) %>% 
  recipe(Default ~ .) %>% 
  step_dummy(all_nominal(), - Default, one_hot = TRUE) %>% 
  step_interact(~ all_predictors():all_predictors()) %>%
  step_zv(all_predictors()) %>% 
  prep()

Use the recipe to generate a training and test samples

credit_training <- credit_recipe %>% 
  juice()

credit_testing <- credit_recipe %>% 
  bake(new_data = testing(credit_split))

Train the model

cred_X_train <- credit_training %>% select(-Default)
cred_Y_train <- credit_training %>% select(Default)

credscore <- cv.gamlr(x = cred_X_train, y = cred_Y_train,
                      family = "binomial", verb = TRUE)
## fold 1,2,3,4,5,done.

Evaluate the performance of of the model using the training set

pred_is <- predict(credscore$gamlr, newdata = cred_X_train, type="response") %>% 
  drop()

credit_training %>%
  mutate(
    Default = as.factor(Default),
    .pred_prob = pred_is
  ) %>%
  roc_curve(Default, .pred_prob) %>%
  autoplot() +
  labs(title = "In-sample ROC curve")

Evaluate the performance of the model using the test set

cred_X_test <- credit_testing  %>% select(-Default)

pred_oos <- predict(credscore$gamlr, newdata = cred_X_test, type="response") %>% 
  drop()

credit_testing %>%
  mutate(
    Default = as.factor(Default),
    .pred_prob = pred_oos
  ) %>%
  roc_curve(Default, .pred_prob) %>%
  autoplot() +
  labs(title = "Out-of-sample ROC curve")

References

Taddy, Matt. Business Data Science: Combining Machine Learning and Economics to Optimize, Automate, and Accelerate Business Decisions . McGraw-Hill Education.