Detailed explanations you can read here. An empirical evidence you can find here.
#=================================================================================
# Using Accuracy Criterion for Credit Classification Can Cause Disaster for Banks.
#=================================================================================
# Load R packages:
rm(list = ls())
library(cutpointr)
library(tidyverse)
library(caret)
# Load GermanCredit Data:
data("GermanCredit")
df <- GermanCredit
# Split data:
set.seed(1)
id <- createDataPartition(y = df$Class, p = 0.7, list = FALSE)
train <- df[id, ]
test <- df[-id, ]
# Set conditions for training Logistic Model:
number <- 5
repeats <- 5
n <- number*repeats
set.seed(1)
train.control <- trainControl(method = "repeatedcv",
number = number,
repeats = repeats,
classProbs = TRUE,
allowParallel = TRUE,
summaryFunction = twoClassSummary)
# Trani Logistic Model with train data:
my_logit <- train(Class ~.,
data = train,
method = "glm",
trControl = train.control)
# Use model for predicting PD:
pd <- predict(my_logit, test, type = "prob") %>% pull(Bad)
# Calculate optimal cutoff by sensitivity and accuracy criterion:
m <- cutpointr(x = pd, class = test$Class, metric = sens_constrain, pos_class = "Bad")
n <- cutpointr(x = pd, class = test$Class, metric = accuracy, pos_class = "Bad")
# Optimal cutoff:
t1 <- m$optimal_cutpoint # maximize sensitivity.
t2 <- n$optimal_cutpoint # maximize accuracy.
# Plot ROC curve with optimal cutoff:
gridExtra::grid.arrange(plot(m), plot(n))
## [1] 0.7504762
## [1] 0.7504762
# Function for labelling credit applications:
label_predicted <- function(cutoff) {
y <- case_when(pd >= cutoff ~ "Bad", TRUE ~ "Good") %>% as.factor()
return(y)
}
# Confution maxtrix:
confusionMatrix(label_predicted(t1), test$Class, positive = "Bad") # If t1 is selected for classification.
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 76 102
## Good 14 108
##
## Accuracy : 0.6133
## 95% CI : (0.5557, 0.6687)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.9995
##
## Kappa : 0.2804
##
## Mcnemar's Test P-Value : 6.597e-16
##
## Sensitivity : 0.8444
## Specificity : 0.5143
## Pos Pred Value : 0.4270
## Neg Pred Value : 0.8852
## Prevalence : 0.3000
## Detection Rate : 0.2533
## Detection Prevalence : 0.5933
## Balanced Accuracy : 0.6794
##
## 'Positive' Class : Bad
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 38 23
## Good 52 187
##
## Accuracy : 0.75
## 95% CI : (0.697, 0.798)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.032245
##
## Kappa : 0.3444
##
## Mcnemar's Test P-Value : 0.001224
##
## Sensitivity : 0.4222
## Specificity : 0.8905
## Pos Pred Value : 0.6230
## Neg Pred Value : 0.7824
## Prevalence : 0.3000
## Detection Rate : 0.1267
## Detection Prevalence : 0.2033
## Balanced Accuracy : 0.6563
##
## 'Positive' Class : Bad
##
# Results of Classification by the two methods:
test %>%
mutate(Class_max_sen = label_predicted(t1),
Class_max_accuracy = label_predicted(t2)) -> test_t1_t2
test_t1_t2 %>%
filter(Class_max_sen == "Good", Class == "Good") %>%
select(Class, Class_max_sen, Amount) -> df_Good_max_sen
test_t1_t2 %>%
filter(Class_max_sen == "Good", Class == "Bad") %>%
select(Class, Class_max_sen, Amount) -> df_Bad_max_sen
test_t1_t2 %>%
filter(Class_max_accuracy == "Good", Class == "Good") %>%
select(Class, Class_max_accuracy, Amount) -> df_Good_max_acc
test_t1_t2 %>%
filter(Class_max_accuracy == "Good", Class == "Bad") %>%
select(Class, Class_max_accuracy, Amount) -> df_Bad_max_acc
# Calculate profit if interest rate = 30%:
ir <- 0.3
sum(ir*df_Good_max_sen$Amount) - sum(df_Bad_max_sen$Amount)
## [1] 42215.5
## [1] -21770.7
# Function for calculating profit:
profit <- function(ir) {
pro1 <- sum(ir*df_Good_max_sen$Amount) - sum(df_Bad_max_sen$Amount)
pro2 <- sum(ir*df_Good_max_acc$Amount) - sum(df_Bad_max_acc$Amount)
return(data.frame(IR = ir, Pro = c(pro1, pro2), Method = c("MaxSen", "MaxAcc")))
}
# Compare profit:
lapply(seq(0.15, 0.5, by = 0.01), profit) -> profit_list
do.call("rbind", profit_list) %>%
mutate(Pro = Pro / 1000) %>%
ggplot(aes(IR, Pro, color = Method)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = seq(0.15, 0.5, by = 0.05), labels = scales::percent) +
labs(x = "Interest Rate", y = "Profit",
title = "Profit Comparision by Classification Method") +
hrbrthemes::theme_modern_rc()