Abstract
A replication of the German credit score example that appear in Matt Taddy’s “Business Data Science”m Chapter 4 under “Probability, Cost, and Classification.”
Read the data
Create some “interesting” variables by re-leveling the credit history and checking account status
credit_processed <- credit_raw %>%
mutate(history = fct_collapse(history,
good = c("A30", "A31"),
poor = c("A32", "A33"),
terrible = c("A34")
)) %>%
mutate(foreign = fct_recode(foreign,
"foreign" = "A201",
"german" = "A202"
)) %>%
mutate(purpose = fct_collapse(purpose,
newcar = c("A40"),
usedcar = c("A41"),
goods_repair = c("A42", "A43", "A44", "A45"),
edu = c("A46", "A48"),
na = c("A47"),
biz = c("A49", "A410")
)) %>%
mutate(rent = factor(housing == "A151"))
Our analysis focuses on a subset of nine variables
credit_df <- credit_processed %>%
select(Default, duration, amount,
installment, age, history,
purpose, foreign, rent)
Plot a mosaic using the ggmosaic
package
credit_df %>%
mutate(Default = as_factor(Default)) %>%
ggplot() +
geom_mosaic(aes(x = product(Default, history), fill = Default)) +
labs(x = "History",
y = "Default",
fill = "")
Surprise! the dangers of choice-based sampling!
recipe()
starts a new set of transformations to be
applied, Its main argument is a formula that defines the outcome
(Default
in our case) and the predictors.step_x()
defines data transformations.prep()
prepares the transformations based on the data
that is supplied.juice()
to generate the outcome and predictors matrix
to be used in the eatimation stepcredit_recipe <- credit_df %>%
recipe(Default ~ .) %>% # defines Default as outcome and the rest as predictors
step_dummy(all_nominal(), -Default,
one_hot = TRUE) %>% # factors to dummies, keep all levels
step_interact(~ all_predictors():all_predictors()) %>% # pairwise interactions
step_zv(all_predictors()) %>% # remove redundent interactions
prep()
credit_juiced <- juice(credit_recipe)
head(credit_juiced)
## # A tibble: 6 × 121
## duration amount installment age Default history_good history_poor
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 1169 4 67 0 0 0
## 2 48 5951 2 22 1 0 1
## 3 12 2096 2 49 0 0 0
## 4 42 7882 2 45 0 0 1
## 5 24 4870 3 53 1 0 1
## 6 36 9055 2 35 0 0 1
## # ℹ 114 more variables: history_terrible <dbl>, purpose_newcar <dbl>,
## # purpose_usedcar <dbl>, purpose_biz <dbl>, purpose_goods_repair <dbl>,
## # purpose_edu <dbl>, foreign_foreign <dbl>, foreign_german <dbl>,
## # rent_FALSE. <dbl>, rent_TRUE. <dbl>, duration_x_amount <dbl>,
## # duration_x_installment <dbl>, duration_x_age <dbl>,
## # duration_x_history_good <dbl>, duration_x_history_poor <dbl>,
## # duration_x_history_terrible <dbl>, duration_x_purpose_newcar <dbl>, …
Estimate the model using cv.gamlr()
cred_X <- credit_juiced %>% select(-Default) # predictors matrix
cred_Y <- credit_juiced %>% select(Default) # outcome vector
credscore <- cv.gamlr(x = cred_X, y = cred_Y,
family = "binomial",
verb = TRUE)
## fold 1,2,3,4,5,done.
Plot the lasso’s regularization path and the cross-validation error for each value of \(\lambda\)
For the selected set of random folds, check how many variables are selected by each selection criterion
model_spec <- tribble(~criterion, ~var_selected,
"min", sum(coef(credscore, s = "min") != 0),
"AICc", sum(coef(credscore$gamlr) != 0),
"1se", sum(coef(credscore) != 0),
"AIC", sum(coef(credscore$gamlr, s = which.min(AIC(credscore$gamlr))) != 0),
"BIC", sum(coef(credscore$gamlr, s = which.min(BIC(credscore$gamlr))) != 0))
model_spec
## # A tibble: 5 × 2
## criterion var_selected
## <chr> <int>
## 1 min 20
## 2 AICc 21
## 3 1se 13
## 4 AIC 21
## 5 BIC 19
Augment the credit_df
data with the model’s prediction
vector
pred <- predict(credscore$gamlr, newdata = cred_X, type="response") %>%
drop() # remove the sparse matrix formatting
credit_df_pred <- credit_df %>%
mutate(.pred_prob = pred) %>%
select(Default, .pred_prob)
head(credit_df_pred)
## # A tibble: 6 × 2
## Default .pred_prob
## <dbl> <dbl>
## 1 0 0.0498
## 2 1 0.410
## 3 0 0.127
## 4 0 0.379
## 5 1 0.413
## 6 0 0.461
Show in sample fitted probability (via a boxplot)
credit_df_pred %>%
ggplot(aes(x = as_factor(Default),
y = .pred_prob,
color = as_factor(Default))) +
geom_boxplot() +
labs(x = "Default",
y = "Fitted probability of default",
color = "Default")
Classify predictions to “1” and “0” based on an arbitrary rule
rule <- 0.20 # move this around to see how these change
credit_df_rule <- credit_df_pred %>%
mutate(.pred_class = case_when(.pred_prob >= rule ~ 1,
.pred_prob < rule ~ 0))
head(credit_df_rule)
## # A tibble: 6 × 3
## Default .pred_prob .pred_class
## <dbl> <dbl> <dbl>
## 1 0 0.0498 0
## 2 1 0.410 1
## 3 0 0.127 0
## 4 0 0.379 1
## 5 1 0.413 1
## 6 0 0.461 1
What are the misclassification rates?
options(yardstick.event_first = FALSE) # consider "1" as the "positive" result
sensitivity <- credit_df_rule %>%
mutate(
Default = as.factor(Default),
.pred_class = as.factor(.pred_class)
) %>%
sens(truth = Default, estimate = .pred_class) %>%
pull(.estimate)
specificity <- credit_df_rule %>%
mutate(
Default = as.factor(Default),
.pred_class = as.factor(.pred_class)
) %>%
spec(truth = Default, estimate = .pred_class) %>%
pull(.estimate)
accuracy_tbl <- tribble(~measure, ~value,
"false positive rate", 1 - specificity,
"false negative rate", 1 - sensitivity,
"sensitivity", sensitivity,
"specificity", specificity)
accuracy_tbl
## # A tibble: 4 × 2
## measure value
## <chr> <dbl>
## 1 false positive rate 0.0767
## 2 false negative rate 0.609
## 3 sensitivity 0.391
## 4 specificity 0.923
Split the sample in half
set.seed(1234) # for the replicating the results
credit_split <- credit_df %>%
initial_split(prop = 0.5)
credit_split
## <Training/Testing/Total>
## <500/500/1000>
Prepare the recipe to be used later to generate the training and test samples
credit_recipe <- training(credit_split) %>%
recipe(Default ~ .) %>%
step_dummy(all_nominal(), - Default, one_hot = TRUE) %>%
step_interact(~ all_predictors():all_predictors()) %>%
step_zv(all_predictors()) %>%
prep()
Use the recipe to generate a training and test samples
credit_training <- credit_recipe %>%
juice()
credit_testing <- credit_recipe %>%
bake(new_data = testing(credit_split))
Train the model
cred_X_train <- credit_training %>% select(-Default)
cred_Y_train <- credit_training %>% select(Default)
credscore <- cv.gamlr(x = cred_X_train, y = cred_Y_train,
family = "binomial", verb = TRUE)
## fold 1,2,3,4,5,done.
Evaluate the performance of of the model using the training set
pred_is <- predict(credscore$gamlr, newdata = cred_X_train, type="response") %>%
drop()
credit_training %>%
mutate(
Default = as.factor(Default),
.pred_prob = pred_is
) %>%
roc_curve(Default, .pred_prob) %>%
autoplot() +
labs(title = "In-sample ROC curve")
Evaluate the performance of the model using the test set
cred_X_test <- credit_testing %>% select(-Default)
pred_oos <- predict(credscore$gamlr, newdata = cred_X_test, type="response") %>%
drop()
credit_testing %>%
mutate(
Default = as.factor(Default),
.pred_prob = pred_oos
) %>%
roc_curve(Default, .pred_prob) %>%
autoplot() +
labs(title = "Out-of-sample ROC curve")
Taddy, Matt. Business Data Science: Combining Machine Learning and Economics to Optimize, Automate, and Accelerate Business Decisions . McGraw-Hill Education.