In this post I fit a logistic regression on the HMEQ home-equity loan data to predict probability of default (PD), convert PD into a credit score, compare the ROC curves of the two rankings, and then choose a classification threshold in two ways: by maximizing accuracy and by maximizing expected profit.
rm(list = ls())
library(tidyverse)
# Load the HMEQ data (BAD = 1 marks a defaulted loan):
hmeq <- read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
# Remove incomplete cases and convert character columns to factors:
hmeq_full <- hmeq %>%
  na.omit() %>%
  mutate_if(is.character, as.factor)
# Split data: stratified 70/30 train-test split on BAD:
set.seed(29)
train <- hmeq_full %>%
  group_by(BAD) %>%
  sample_frac(0.7) %>%
  ungroup()
# Test set = all rows not sampled into train (anti_join matches on all columns):
test <- anti_join(hmeq_full, train)
actual <- test$BAD
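Because the split is stratified on BAD, train and test should end up with nearly identical default rates; the quick check below is a sketch added for verification, not part of the original output.
# Sanity check: the stratified split should preserve the default rate.
train %>% summarise(default_rate = mean(BAD))
test %>% summarise(default_rate = mean(BAD))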
# Logistic classifier:
logistic_reg <- glm(BAD ~ ., data = train, family = binomial(link = "logit"))
# Use the logistic model to predict PD (probability of default) on the test set:
pd <- predict(logistic_reg, test, type = "response")
# Rank test cases from most to least risky by predicted PD:
is_Bad <- actual == 1
orders_by_pd <- order(pd, decreasing = TRUE)
is_Bad_ordered <- is_Bad[orders_by_pd]
data.frame(actual = actual,
           is_Bad = is_Bad,
           is_Bad_ordered = is_Bad_ordered,
           orders_by_pd = orders_by_pd,
           prob_of_default = pd) -> df_BG
# Cumulative true-positive and false-positive rates along the PD ranking:
df_BG %>%
  mutate(TPR = cumsum(is_Bad_ordered) / sum(is_Bad_ordered),
         FPR = cumsum(!is_Bad_ordered) / sum(!is_Bad_ordered)) -> df_BG
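These cumulative rates trace the ROC curve point by point, so the area under it can be approximated directly; the trapezoidal-rule cross-check below is a sketch added here and should land close to the AUC that pROC reports further down.
# Approximate AUC via the trapezoidal rule over the (FPR, TPR) pairs
# (rough cross-check; ties in PD can cause small discrepancies):
with(df_BG, sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2))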
# Convert predicted PD into a credit score using the standard
# points-to-double-the-odds scaling: 600 points at good:bad odds of 50:1,
# with every doubling of the odds worth PDO = 20 points.
credit_score <- function(pd) {
  pdo <- 20
  factor <- pdo / log(2)
  my_offset <- 600 - factor*log(50)
  scores <- my_offset + factor*log((1 - pd) / pd)
  return(scores)
}
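As a sanity check, the anchor points fall straight out of this formula: odds of 50:1 map to exactly 600 points, and doubling the odds to 100:1 adds the 20-point PDO.
credit_score(pd = 1/51)    # odds (1 - pd)/pd = 50:1  -> 600 points
credit_score(pd = 1/101)   # odds 100:1 -> 620 points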
scores <- credit_score(pd = pd)
# A low score means high risk, so rank ascending to put the riskiest first:
orders_by_score <- order(scores, decreasing = FALSE)
df_BG %>%
  mutate(is_Bad_ordered_by_score = is_Bad[orders_by_score],
         orders_by_score = orders_by_score,
         credit_score = scores) %>%
  mutate(TPR_score = cumsum(is_Bad_ordered_by_score) / sum(is_Bad_ordered_by_score),
         FPR_score = cumsum(!is_Bad_ordered_by_score) / sum(!is_Bad_ordered_by_score)) -> df_BG
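Because the score is a strictly decreasing function of PD, ranking by ascending score reproduces the ranking by descending PD (up to ties), so the two ROC curves should coincide; the check below is a quick sketch of that claim.
# The score-based and PD-based rankings should agree (up to ties in PD):
all.equal(df_BG$TPR, df_BG$TPR_score)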
# Stack both curves in long format for plotting:
data.frame(FPR = c(df_BG$FPR, df_BG$FPR_score),
           TPR = c(df_BG$TPR, df_BG$TPR_score),
           Method = rep(c("PD", "Score"), each = nrow(df_BG))) -> df_roc
library(extrafont)
theme_set(theme_minimal())
my_font <- "Roboto Condensed"
df_roc %>%
  ggplot(aes(FPR, TPR, color = Method)) +
  geom_line() +
  facet_wrap(~ Method) +
  labs(title = "Figure 1: ROC curve by PD and Score") +
  theme(text = element_text(family = my_font)) +
  theme(plot.margin = unit(rep(0.8, 4), "cm"))
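The summary below compares the distribution of predicted PDs with the resulting scores; a call along these lines (reconstructed here, since the original chunk is not shown) would produce it:
df_BG %>%
  select(prob_of_default, credit_score) %>%
  summary()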
## prob_of_default credit_score
## Min. :0.0000665 Min. :187.0
## 1st Qu.:0.0215613 1st Qu.:554.5
## Median :0.0429043 Median :576.7
## Mean :0.0854957 Mean :573.8
## 3rd Qu.:0.0882429 3rd Qu.:597.2
## Max. :0.9999696 Max. :764.6
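Both orderings carry the same information, and pROC confirms it: the AUC is identical whether the predictor is the PD or the score. The calls below are taken from the Call: lines in the printed output.
library(pROC)
roc(response = actual, predictor = pd)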
##
## Call:
## roc.default(response = actual, predictor = pd)
##
## Data: pd in 919 controls (actual 0) < 90 cases (actual 1).
## Area under the curve: 0.7563
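The same call on the scores: pROC detects the reversed direction (low score = high risk) and reports an identical AUC.
roc(response = actual, predictor = scores)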
##
## Call:
## roc.default(response = actual, predictor = scores)
##
## Data: scores in 919 controls (actual 0) > 90 cases (actual 1).
## Area under the curve: 0.7563
# Set threshold for classification:
threshold <- 0.5
labels_predicted <- case_when(pd >= threshold ~ 1, TRUE ~ 0)
df_results <- data.frame(predicted = labels_predicted, actual = actual, PD = pd)
# Among the actual defaulters, how many does the model flag at this threshold?
df_results %>%
  filter(actual == 1) %>%
  group_by(predicted) %>%
  count()
## # A tibble: 2 x 2
## # Groups: predicted [2]
## predicted n
## <dbl> <int>
## 1 0 68
## 2 1 22
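The second tibble tallies the actual non-defaulters in the same way (the chunk is reconstructed here from its output):
df_results %>%
  filter(actual == 0) %>%
  group_by(predicted) %>%
  count()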
## # A tibble: 2 x 2
## # Groups: predicted [2]
## predicted n
## <dbl> <int>
## 1 0 914
## 2 1 5
calculate_TPR_FPR <- function(threshold) {
  # Classify as 1 or 0 based on PD and the given threshold:
  labels_predicted <- case_when(pd >= threshold ~ 1, TRUE ~ 0)
  df_results <- test %>% transmute(actual = BAD, predicted = labels_predicted)
  # Predictions for the actual default cases (labeled 1):
  df_results %>%
    filter(actual == 1) %>%
    pull(predicted) -> bad_predicted_by_model
  # Calculate TPR (sensitivity):
  sum(bad_predicted_by_model == 1) / length(bad_predicted_by_model) -> TPR
  # Predictions for the actual non-default cases (labeled 0):
  df_results %>%
    filter(actual == 0) %>%
    pull(predicted) -> good_predicted_by_model
  # Calculate specificity:
  sum(good_predicted_by_model == 0) / length(good_predicted_by_model) -> spec
  # Calculate accuracy:
  sum(df_results$predicted == df_results$actual) / length(labels_predicted) -> acc
  # Report results as a data frame:
  df_classification <- data.frame(TPR = TPR,
                                  FPR = 1 - spec,
                                  Accuracy = acc,
                                  Specificity = spec,
                                  Threshold = threshold)
  return(df_classification)
}
calculate_TPR_FPR(threshold = 0.5)
## TPR FPR Accuracy Specificity Threshold
## 1 0.2444444 0.005440696 0.9276511 0.9945593 0.5
# Cross-check against caret's confusion matrix at the same threshold:
library(caret)
confusionMatrix(labels_predicted %>% as.factor(), actual %>% as.factor(), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 914 68
## 1 5 22
##
## Accuracy : 0.9277
## 95% CI : (0.9099, 0.9429)
## No Information Rate : 0.9108
## P-Value [Acc > NIR] : 0.03135
##
## Kappa : 0.3493
##
## Mcnemar's Test P-Value : 3.971e-13
##
## Sensitivity : 0.24444
## Specificity : 0.99456
## Pos Pred Value : 0.81481
## Neg Pred Value : 0.93075
## Prevalence : 0.08920
## Detection Rate : 0.02180
## Detection Prevalence : 0.02676
## Balanced Accuracy : 0.61950
##
## 'Positive' Class : 1
##
# Evaluate the metrics over a grid of thresholds (the default 0.5 included):
threshold_range <- c(0.5, seq(0, 1, length.out = 100))
lapply(threshold_range, calculate_TPR_FPR) -> my_list
do.call("bind_rows", my_list) -> df_roc
df_roc %>%
  gather(Metric, Rate, -Threshold) -> df_long
df_long %>%
  filter(Metric == "Accuracy") %>%
  slice(which.max(Rate)) -> max_acc
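Printing max_acc shows the accuracy-maximizing threshold that Figure 2 annotates:
max_acc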
df_long %>%
  ggplot(aes(Threshold, Rate, color = Metric)) +
  geom_line() +
  geom_point(data = max_acc, shape = 18, size = 3) +
  geom_text(data = max_acc %>% mutate(Rate = 0.87),
            aes(label = "Threshold that\nmaximizes Accuracy"),
            size = 3.2, color = "black", family = my_font) +
  scale_y_continuous(labels = scales::percent) +
  theme(text = element_text(family = my_font)) +
  theme(plot.margin = unit(rep(0.8, 4), "cm")) +
  labs(y = NULL, title = "Figure 2: Model Performance by Threshold")
calculate_profit <- function(threshold) {
  # Convert PD to a 0/1 label at the given threshold:
  labels_predicted <- case_when(pd >= threshold ~ 1, TRUE ~ 0)
  # Conditions for the simulation: n random samples, 7% interest rate.
  # An approved loan that is repaid earns rate*LOAN; an approved loan
  # that defaults (a false negative) loses the full LOAN amount.
  n <- 100
  rate <- 0.07
  profit_space <- NULL
  # Calculate net profit for each random sample drawn from the test data:
  for (j in 1:n) {
    set.seed(j)
    df_results <- test %>%
      mutate(predicted = labels_predicted) %>%
      sample_frac(0.7)
    # Profit from true negative cases (approved and repaid):
    df_results %>%
      filter(predicted == 0, BAD == 0) %>%
      mutate(profit = rate*LOAN) %>%
      pull(profit) %>%
      sum() -> total_profit
    # Loss caused by false negative cases (approved but defaulted):
    df_results %>%
      filter(predicted == 0, BAD == 1) %>%
      pull(LOAN) %>%
      sum() -> total_loss
    # Net profit:
    net_profit <- total_profit - total_loss
    profit_space <- c(profit_space, net_profit)
  }
  # Average net profit at the given threshold:
  data.frame(Profit_avg = mean(profit_space), Threshold = threshold) -> df_prof_thres
  return(df_prof_thres)
}
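The payoff asymmetry baked into this function is why the profit-maximizing threshold can differ from the accuracy-maximizing one: for an illustrative $10,000 loan at rate = 0.07, a correctly approved borrower earns only $700 of interest, while an approved borrower who defaults costs the entire principal.
0.07 * 10000   # interest earned on an illustrative $10,000 loan that is repaid
-10000         # principal lost if that same loan defaults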
lapply(threshold_range, calculate_profit) -> list_prof
do.call("bind_rows", list_prof) -> df_prof
# Profit at the default threshold vs. the profit-maximizing threshold:
df_prof %>% filter(Threshold == 0.5) -> default_prof
df_prof %>% slice(which.max(Profit_avg)) -> max_prof
df_prof %>%
  ggplot(aes(Threshold, Profit_avg)) +
  geom_line(color = "#00006E") +
  geom_point(data = max_prof, color = "red", size = 3) +
  geom_point(data = default_prof, color = "blue", size = 3) +
  geom_text(data = max_prof %>% mutate(Threshold = Threshold + 0.1), family = my_font,
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) +
  geom_text(data = default_prof %>% mutate(Profit_avg = Profit_avg + 35000, Threshold = Threshold + 0.05),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme(text = element_text(family = my_font)) +
  theme(plot.margin = unit(rep(0.8, 4), "cm")) +
  labs(y = NULL, title = "Figure 3: Net Profit by Threshold")