Machine Learning can be more productive when the modelers not only focus on the accuracy of those models themselve, but also the business value and requirements (profits or lost, etc.). Below is some discovery on this.
#=================================================================================
# Using Accuracy Criterion for Credit Classification Can Cause Disaster for Banks.
#=================================================================================
# Load R packages:
rm(list = ls())
library(cutpointr)
library(tidyverse)
library(caret)
# Load GermanCredit Data:
data("GermanCredit")
df <- GermanCredit
# Split data:
set.seed(1)
id <- createDataPartition(y = df$Class, p = 0.7, list = FALSE)
train <- df[id, ]
test <- df[-id, ]
# Set conditions for training Logistic Model:
number <- 5
repeats <- 5
set.seed(1)
train.control <- trainControl(method = "repeatedcv",
number = number,
repeats = repeats,
classProbs = TRUE,
allowParallel = TRUE,
summaryFunction = twoClassSummary)
# Trani Logistic Model with train data:
my_logit <- train(Class ~.,
data = train,
method = "glm",
trControl = train.control)
# Use model for predicting PD:
pd <- predict(my_logit, test, type = "prob") %>% pull(Bad)
# Calculate optimal cutoff by sensitivity and accuracy criterion:
m <- cutpointr(x = pd, class = test$Class, metric = sens_constrain, pos_class = "Bad")
n <- cutpointr(x = pd, class = test$Class, metric = accuracy, pos_class = "Bad")
# Optimal cutoff:
t1 <- m$optimal_cutpoint # maximize sensitivity.
t2 <- n$optimal_cutpoint # maximize accuracy.
# Plot ROC curve with optimal cutoff:
gridExtra::grid.arrange(plot(m), plot(n))## [1] 0.7504762
## [1] 0.7504762
# Function for labelling credit applications:
label_predicted <- function(cutoff) {
y <- case_when(pd >= cutoff ~ "Bad", TRUE ~ "Good") %>% as.factor()
return(y)
}
# Confution maxtrix:
confusionMatrix(label_predicted(t1), test$Class, positive = "Bad") # If t1 is selected for classification. ## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 76 102
## Good 14 108
##
## Accuracy : 0.6133
## 95% CI : (0.5557, 0.6687)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.2804
##
## Mcnemar's Test P-Value : 6.597e-16
##
## Sensitivity : 0.8444
## Specificity : 0.5143
## Pos Pred Value : 0.4270
## Neg Pred Value : 0.8852
## Prevalence : 0.3000
## Detection Rate : 0.2533
## Detection Prevalence : 0.5933
## Balanced Accuracy : 0.6794
##
## 'Positive' Class : Bad
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 38 23
## Good 52 187
##
## Accuracy : 0.75
## 95% CI : (0.697, 0.798)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.032245
##
## Kappa : 0.3444
##
## Mcnemar's Test P-Value : 0.001224
##
## Sensitivity : 0.4222
## Specificity : 0.8905
## Pos Pred Value : 0.6230
## Neg Pred Value : 0.7824
## Prevalence : 0.3000
## Detection Rate : 0.1267
## Detection Prevalence : 0.2033
## Balanced Accuracy : 0.6563
##
## 'Positive' Class : Bad
##
# Results of Classification by the two methods:
test %>%
mutate(Class_max_sen = label_predicted(t1),
Class_max_accuracy = label_predicted(t2)) -> test_t1_t2
test_t1_t2 %>%
filter(Class_max_sen == "Good", Class == "Good") %>%
select(Class, Class_max_sen, Amount) -> df_Good_max_sen
test_t1_t2 %>%
filter(Class_max_sen == "Good", Class == "Bad") %>%
select(Class, Class_max_sen, Amount) -> df_Bad_max_sen
test_t1_t2 %>%
filter(Class_max_accuracy == "Good", Class == "Good") %>%
select(Class, Class_max_accuracy, Amount) -> df_Good_max_acc
test_t1_t2 %>%
filter(Class_max_accuracy == "Good", Class == "Bad") %>%
select(Class, Class_max_accuracy, Amount) -> df_Bad_max_acc
# Calculate profit if interest rate for personal loan = 10%:
ir <- 0.15
sum(ir*df_Good_max_sen$Amount) - sum(df_Bad_max_sen$Amount)## [1] 87.25
## [1] -97150.35
# Function for calculating profit:
profit <- function(ir) {
pro1 <- sum(ir*df_Good_max_sen$Amount) - sum(df_Bad_max_sen$Amount)
pro2 <- sum(ir*df_Good_max_acc$Amount) - sum(df_Bad_max_acc$Amount)
return(data.frame(IR = ir, Pro = c(pro1, pro2), Method = c("MaxSen", "MaxAcc")))
}
# Compare profit:
library(ggdark)
lapply(seq(0.1, 0.3, by = 0.01), profit) -> profit_list
do.call("rbind", profit_list) %>%
mutate(Pro = Pro / 1000) %>%
ggplot(aes(IR, Pro, color = Method)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = seq(0.1, 0.3, by = 0.01), labels = scales::percent) +
labs(x = "Interest Rate", y = "Profit",
title = "Profit Comparision by Classification Method") + dark_theme_gray()