Introduction
I will write something later…
| Bad |
300 |
8.92 |
| Good |
3064 |
91.08 |
library(tidymodels)
set.seed(123)
members_split <- initial_split(hmeq_full, prop = 0.7, strata = BAD)
members_train <- training(members_split)
members_test <- testing(members_split)
set.seed(123)
members_folds <- vfold_cv(members_train, strata = BAD, v = 5, repeats = 3)
library(recipes)
library(themis)
members_rec <- recipe(BAD ~ ., data = members_train) %>%
step_dummy(all_nominal(), -BAD) %>%
step_smote(BAD)
glm_spec <- logistic_reg() %>%
set_engine("glm")
rf_spec <- rand_forest(trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
members_wf <- workflow() %>%
add_recipe(members_rec)
members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
glm_rs <- members_wf %>%
add_model(glm_spec) %>%
fit_resamples(resamples = members_folds,
metrics = members_metrics,
control = control_resamples(save_pred = TRUE))
rf_rs <- members_wf %>%
add_model(rf_spec) %>%
fit_resamples(resamples = members_folds,
metrics = members_metrics,
control = control_resamples(save_pred = TRUE))
members_final_glm <- members_wf %>%
add_model(glm_spec) %>%
last_fit(members_split)
collect_predictions(members_final_glm) -> glm_pd
members_final_rf <- members_wf %>%
add_model(rf_spec) %>%
last_fit(members_split)
collect_predictions(members_final_rf) -> rf_pd
calculate_some_metrics <- function(threshold_selected, pd_selected) {
pd_selected %>%
mutate(label_predicted_threshold = case_when(.pred_Bad >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_result
sum(df_result$label_predicted_threshold == df_result$BAD) / nrow(df_result) -> accuracy_threshold
df_result %>% filter(BAD == "Bad") -> actual_is_died
sum(actual_is_died$label_predicted_threshold == "Bad") / nrow(actual_is_died) -> sensitivity_threshold
df_result %>% filter(BAD != "Bad") -> actual_is_survived
# Calculate Specificity:
sum(actual_is_survived$label_predicted_threshold != "Bad") / nrow(actual_is_survived) -> spec
# Report results in form of data frame:
data.frame(Accuracy = accuracy_threshold,
Sensitivity = sensitivity_threshold,
Specificity = spec,
FPR = 1 - spec,
# AUC = roc_auc,
Threshold = threshold_selected) -> df_reporting
return(df_reporting)
}
range_threshold <- c(0.5, seq(0, 1, length.out = 99))
lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = glm_pd)}) -> list_glm
lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = rf_pd)}) -> list_rf
do.call("bind_rows", list_glm) -> glm_results_thres
do.call("bind_rows", list_rf) -> rf_results_thres
bind_rows(glm_results_thres %>% mutate(Model = "GLM"),
rf_results_thres %>% mutate(Model = "RF")) -> df_compare
df_compare %>%
gather(Metric, Value, -Threshold, -Model) -> df_long
library(pROC)
glm_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), glm_pd$.pred_Bad)$auc %>% round(3)
rf_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), rf_pd$.pred_Bad)$auc %>% round(3)
library(extrafont)
theme_set(theme_minimal())
my_font <- "Roboto Condensed"
df_long %>%
ggplot(aes(Threshold, Value, color = Model)) +
geom_line() +
geom_point(data = df_long %>% filter(Threshold == 0.5)) +
facet_wrap(~ Metric) +
theme(text = element_text(family = my_font)) +
theme(plot.margin = unit(rep(0.8, 4), "cm")) +
scale_y_continuous(labels = scales::percent) +
labs(subtitle = paste0("AUC for GLM = ", glm_auc, ", AUC for RF = ", rf_auc),
title = "Figure 1: Model Performance by Threshold based on Test Data",
y = NULL)

#========================
calculate_profit <- function(threshold_selected, pd_selected) {
members_test %>%
as_tibble() %>%
mutate(pd = pd_selected$.pred_Bad) %>%
mutate(predicted = case_when(pd >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_selected
# Set conditions for calculating average profit at given threshold:
n <- 50
rate <- 0.07
profit_space <- NULL
# Calculate net profit for each sample randomly selected from test data:
for (j in 1:n) {
set.seed(j)
df_results <- df_selected %>%
sample_frac(0.7)
# Profit from true negative cases:
df_results %>%
filter(predicted == "Good", BAD == "Good") %>%
mutate(profit = rate*LOAN) %>%
pull(profit) %>%
sum() -> total_profit
# Loss causes from false negative cases:
df_results %>%
filter(predicted == "Good", BAD == "Bad") %>%
pull(LOAN) %>%
sum() -> total_loss
# Net profit:
net_profit <- total_profit - total_loss
profit_space <- c(profit_space, net_profit)
}
# Average net profit at given threshold:
data.frame(Profit_avg = mean(profit_space), Threshold = threshold_selected) -> df_prof_thres
return(df_prof_thres)
}
lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = rf_pd)}) -> list_prof
do.call("bind_rows", list_prof) -> df_prof
df_prof %>% filter(Threshold == 0.5) -> default_prof
df_prof %>% slice(which.max(Profit_avg)) -> max_prof
df_prof %>%
ggplot(aes(Threshold, Profit_avg)) +
geom_line(color = "#00006E") +
geom_point(data = max_prof, color = "red", size = 3) +
geom_point(data = default_prof, color = "blue", size = 3) +
geom_text(data = max_prof %>% mutate(Profit_avg = 640000), family = my_font,
aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) +
geom_text(data = default_prof %>% mutate(Profit_avg = 550000),
aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(text = element_text(family = my_font)) +
theme(plot.margin = unit(rep(0.8, 4), "cm")) +
labs(y = NULL, title = "Figure 2: Maximum Profit by Threshold for Random Forest")

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = glm_pd)}) -> list_prof_glm
do.call("bind_rows", list_prof_glm) -> df_prof_glm
df_prof_glm %>% filter(Threshold == 0.5) -> default_prof_glm
df_prof_glm %>% slice(which.max(Profit_avg)) -> max_prof_glm
df_prof_glm %>%
ggplot(aes(Threshold, Profit_avg)) +
geom_line(color = "#00006E") +
geom_point(data = max_prof_glm, color = "red", size = 3) +
geom_point(data = default_prof_glm, color = "blue", size = 3) +
geom_text(data = max_prof_glm %>% mutate(Threshold = 0.5), family = my_font,
aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) +
geom_text(data = default_prof_glm %>% mutate(Profit_avg = 290000),
aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(text = element_text(family = my_font)) +
theme(plot.margin = unit(rep(0.8, 4), "cm")) +
labs(y = NULL, title = "Figure 3: Maximum Profit by Threshold for Logistic")

---
title: "Compare Maximum Profit between Random Forest and Logistic (tidymodels)"
author: "Nguyen Chi Dung"
subtitle: "R Machine Learning Series"
output:
  html_document: 
    code_download: true
    # code_folding: hide
    highlight: zenburn
    # number_sections: yes
    theme: "flatly"
    toc: TRUE
    toc_float: TRUE
---

```{r setup,include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 10, fig.height = 6)
```



# Introduction

I will write something later...


```{r}
# https://github.com/tidymodels/themis
# https://www.tidymodels.org/
# https://www.tmwr.org/
# https://juliasilge.com/blog/xgboost-tune-volleyball/

# Clear our workspace: 
rm(list = ls())

# Load hmeq.csv dataset: 
library(tidyverse)
hmeq <- read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
hmeq %>% na.omit() -> hmeq_full

# Relabel for BAD and convert to factor for character columns: 
hmeq_full %>% 
  mutate(BAD = case_when(BAD == 1 ~ "Bad", TRUE ~ "Good")) %>% 
  mutate_if(is.character, as.factor) -> hmeq_full

# Distribution of BAD: 

library(knitr)

hmeq_full %>% 
  group_by(BAD) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(percent = 100*n / sum(n)) %>% 
  mutate(percent = round(percent, 2)) -> df_dis

df_dis %>% 
  kable()


library(tidymodels)

set.seed(123)
members_split <- initial_split(hmeq_full, prop = 0.7, strata = BAD)
members_train <- training(members_split)
members_test <- testing(members_split)

set.seed(123)
members_folds <- vfold_cv(members_train, strata = BAD, v = 5, repeats = 3)

library(recipes)
library(themis)

members_rec <- recipe(BAD ~ ., data = members_train) %>%
  step_dummy(all_nominal(), -BAD) %>%
  step_smote(BAD)

glm_spec <- logistic_reg() %>%
  set_engine("glm")

rf_spec <- rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

members_wf <- workflow() %>%
  add_recipe(members_rec)

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

glm_rs <- members_wf %>%
  add_model(glm_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

rf_rs <- members_wf %>%
  add_model(rf_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

members_final_glm <- members_wf %>%
  add_model(glm_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_glm) -> glm_pd

members_final_rf <- members_wf %>%
  add_model(rf_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_rf) -> rf_pd

calculate_some_metrics <- function(threshold_selected, pd_selected) {
  
  pd_selected %>% 
    mutate(label_predicted_threshold = case_when(.pred_Bad >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_result
  
  sum(df_result$label_predicted_threshold == df_result$BAD) / nrow(df_result) -> accuracy_threshold
  
  df_result %>% filter(BAD == "Bad") -> actual_is_died
  
  sum(actual_is_died$label_predicted_threshold == "Bad") / nrow(actual_is_died) -> sensitivity_threshold
  
  df_result %>% filter(BAD != "Bad") -> actual_is_survived
  
  # Calculate Specificity: 
  sum(actual_is_survived$label_predicted_threshold != "Bad") / nrow(actual_is_survived) -> spec 
  
  # Report results in form of data frame: 
  data.frame(Accuracy = accuracy_threshold, 
             Sensitivity = sensitivity_threshold, 
             Specificity = spec, 
             FPR = 1 - spec, 
             # AUC = roc_auc, 
             Threshold = threshold_selected) -> df_reporting
  return(df_reporting)
}

range_threshold <- c(0.5, seq(0, 1, length.out = 99))

lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = glm_pd)}) -> list_glm
lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = rf_pd)}) -> list_rf

do.call("bind_rows", list_glm) -> glm_results_thres
do.call("bind_rows", list_rf) -> rf_results_thres

bind_rows(glm_results_thres %>% mutate(Model = "GLM"), 
          rf_results_thres %>% mutate(Model = "RF")) -> df_compare

df_compare %>% 
  gather(Metric, Value, -Threshold, -Model) -> df_long

library(pROC)
glm_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), glm_pd$.pred_Bad)$auc %>% round(3)
rf_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), rf_pd$.pred_Bad)$auc %>% round(3)

library(extrafont)
theme_set(theme_minimal())
my_font <- "Roboto Condensed"

df_long %>% 
  ggplot(aes(Threshold, Value, color = Model)) + 
  geom_line() + 
  geom_point(data = df_long %>% filter(Threshold == 0.5)) + 
  facet_wrap(~ Metric) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(subtitle = paste0("AUC for GLM = ", glm_auc, ", AUC for RF = ", rf_auc), 
       title = "Figure 1: Model Performance by Threshold based on Test Data", 
       y = NULL)


#========================

calculate_profit <- function(threshold_selected, pd_selected) {
  
  members_test %>% 
    as_tibble() %>% 
    mutate(pd = pd_selected$.pred_Bad) %>% 
    mutate(predicted = case_when(pd >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_selected
  
  # Set conditions for calculating average profit at given threshold: 
  n <- 50
  rate <- 0.07
  profit_space <- NULL
  
  # Calculate net profit for each sample randomly selected from test data:
  
  for (j in 1:n) {
    
    set.seed(j)
    
    df_results <- df_selected %>% 
      sample_frac(0.7)
    
    # Profit from true negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Good") %>% 
      mutate(profit = rate*LOAN) %>% 
      pull(profit) %>% 
      sum() -> total_profit
    
    # Loss causes from false negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Bad") %>% 
      pull(LOAN) %>% 
      sum() -> total_loss
    
    # Net profit: 
    net_profit <- total_profit - total_loss
    profit_space <- c(profit_space, net_profit)
    
  }
  
  # Average net profit at given threshold: 
  data.frame(Profit_avg = mean(profit_space), Threshold = threshold_selected) -> df_prof_thres
  return(df_prof_thres)
  
}

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = rf_pd)}) -> list_prof

do.call("bind_rows", list_prof) -> df_prof

df_prof %>% filter(Threshold == 0.5) -> default_prof

df_prof %>% slice(which.max(Profit_avg)) -> max_prof

df_prof %>% 
  ggplot(aes(Threshold, Profit_avg)) + 
  geom_line(color = "#00006E") + 
  geom_point(data = max_prof, color = "red", size = 3) + 
  geom_point(data = default_prof, color = "blue", size = 3) + 
  geom_text(data = max_prof %>% mutate(Profit_avg = 640000), family = my_font,  
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) + 
  geom_text(data = default_prof %>% mutate(Profit_avg = 550000),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) + 
  scale_y_continuous(labels = scales::dollar_format()) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  labs(y = NULL, title = "Figure 2: Maximum Profit by Threshold for Random Forest")

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = glm_pd)}) -> list_prof_glm

do.call("bind_rows", list_prof_glm) -> df_prof_glm

df_prof_glm %>% filter(Threshold == 0.5) -> default_prof_glm

df_prof_glm %>% slice(which.max(Profit_avg)) -> max_prof_glm

df_prof_glm %>% 
  ggplot(aes(Threshold, Profit_avg)) + 
  geom_line(color = "#00006E") + 
  geom_point(data = max_prof_glm, color = "red", size = 3) + 
  geom_point(data = default_prof_glm, color = "blue", size = 3) + 
  geom_text(data = max_prof_glm %>% mutate(Threshold = 0.5), family = my_font,  
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) + 
  geom_text(data = default_prof_glm %>% mutate(Profit_avg = 290000),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) + 
  scale_y_continuous(labels = scales::dollar_format()) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  labs(y = NULL, title = "Figure 3: Maximum Profit by Threshold for Logistic")


```

