Introduction

I will write something later…

BAD n percent
Bad 300 8.92
Good 3064 91.08
library(tidymodels)

set.seed(123)
members_split <- initial_split(hmeq_full, prop = 0.7, strata = BAD)
members_train <- training(members_split)
members_test <- testing(members_split)

set.seed(123)
members_folds <- vfold_cv(members_train, strata = BAD, v = 5, repeats = 3)

library(recipes)
library(themis)

members_rec <- recipe(BAD ~ ., data = members_train) %>%
  step_dummy(all_nominal(), -BAD) %>%
  step_smote(BAD)

glm_spec <- logistic_reg() %>%
  set_engine("glm")

rf_spec <- rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

members_wf <- workflow() %>%
  add_recipe(members_rec)

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

glm_rs <- members_wf %>%
  add_model(glm_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

rf_rs <- members_wf %>%
  add_model(rf_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

members_final_glm <- members_wf %>%
  add_model(glm_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_glm) -> glm_pd

members_final_rf <- members_wf %>%
  add_model(rf_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_rf) -> rf_pd

calculate_some_metrics <- function(threshold_selected, pd_selected) {
  
  pd_selected %>% 
    mutate(label_predicted_threshold = case_when(.pred_Bad >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_result
  
  sum(df_result$label_predicted_threshold == df_result$BAD) / nrow(df_result) -> accuracy_threshold
  
  df_result %>% filter(BAD == "Bad") -> actual_is_died
  
  sum(actual_is_died$label_predicted_threshold == "Bad") / nrow(actual_is_died) -> sensitivity_threshold
  
  df_result %>% filter(BAD != "Bad") -> actual_is_survived
  
  # Calculate Specificity: 
  sum(actual_is_survived$label_predicted_threshold != "Bad") / nrow(actual_is_survived) -> spec 
  
  # Report results in form of data frame: 
  data.frame(Accuracy = accuracy_threshold, 
             Sensitivity = sensitivity_threshold, 
             Specificity = spec, 
             FPR = 1 - spec, 
             # AUC = roc_auc, 
             Threshold = threshold_selected) -> df_reporting
  return(df_reporting)
}

range_threshold <- c(0.5, seq(0, 1, length.out = 99))

lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = glm_pd)}) -> list_glm
lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = rf_pd)}) -> list_rf

do.call("bind_rows", list_glm) -> glm_results_thres
do.call("bind_rows", list_rf) -> rf_results_thres

bind_rows(glm_results_thres %>% mutate(Model = "GLM"), 
          rf_results_thres %>% mutate(Model = "RF")) -> df_compare

df_compare %>% 
  gather(Metric, Value, -Threshold, -Model) -> df_long

library(pROC)
glm_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), glm_pd$.pred_Bad)$auc %>% round(3)
rf_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), rf_pd$.pred_Bad)$auc %>% round(3)

library(extrafont)
theme_set(theme_minimal())
my_font <- "Roboto Condensed"

df_long %>% 
  ggplot(aes(Threshold, Value, color = Model)) + 
  geom_line() + 
  geom_point(data = df_long %>% filter(Threshold == 0.5)) + 
  facet_wrap(~ Metric) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(subtitle = paste0("AUC for GLM = ", glm_auc, ", AUC for RF = ", rf_auc), 
       title = "Figure 1: Model Performance by Threshold based on Test Data", 
       y = NULL)

#========================

calculate_profit <- function(threshold_selected, pd_selected) {
  
  members_test %>% 
    as_tibble() %>% 
    mutate(pd = pd_selected$.pred_Bad) %>% 
    mutate(predicted = case_when(pd >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_selected
  
  # Set conditions for calculating average profit at given threshold: 
  n <- 50
  rate <- 0.07
  profit_space <- NULL
  
  # Calculate net profit for each sample randomly selected from test data:
  
  for (j in 1:n) {
    
    set.seed(j)
    
    df_results <- df_selected %>% 
      sample_frac(0.7)
    
    # Profit from true negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Good") %>% 
      mutate(profit = rate*LOAN) %>% 
      pull(profit) %>% 
      sum() -> total_profit
    
    # Loss causes from false negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Bad") %>% 
      pull(LOAN) %>% 
      sum() -> total_loss
    
    # Net profit: 
    net_profit <- total_profit - total_loss
    profit_space <- c(profit_space, net_profit)
    
  }
  
  # Average net profit at given threshold: 
  data.frame(Profit_avg = mean(profit_space), Threshold = threshold_selected) -> df_prof_thres
  return(df_prof_thres)
  
}

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = rf_pd)}) -> list_prof

do.call("bind_rows", list_prof) -> df_prof

df_prof %>% filter(Threshold == 0.5) -> default_prof

df_prof %>% slice(which.max(Profit_avg)) -> max_prof

df_prof %>% 
  ggplot(aes(Threshold, Profit_avg)) + 
  geom_line(color = "#00006E") + 
  geom_point(data = max_prof, color = "red", size = 3) + 
  geom_point(data = default_prof, color = "blue", size = 3) + 
  geom_text(data = max_prof %>% mutate(Profit_avg = 640000), family = my_font,  
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) + 
  geom_text(data = default_prof %>% mutate(Profit_avg = 550000),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) + 
  scale_y_continuous(labels = scales::dollar_format()) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  labs(y = NULL, title = "Figure 2: Maximum Profit by Threshold for Random Forest")

---
title: "Compare Maximum Profit between Random Forest and Logistic (tidymodels)"
author: "Nguyen Chi Dung"
subtitle: "R Machine Learning Series"
output:
  html_document: 
    code_download: true
    # code_folding: hide
    highlight: zenburn
    # number_sections: yes
    theme: "flatly"
    toc: TRUE
    toc_float: TRUE
---

```{r setup,include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 10, fig.height = 6)
```



# Introduction

I will write something later...


```{r}
# https://github.com/tidymodels/themis
# https://www.tidymodels.org/
# https://www.tmwr.org/
# https://juliasilge.com/blog/xgboost-tune-volleyball/

# Clear our workspace: 
rm(list = ls())

# Load hmeq.csv dataset: 
library(tidyverse)
hmeq <- read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
hmeq %>% na.omit() -> hmeq_full

# Relabel for BAD and convert to factor for character columns: 
hmeq_full %>% 
  mutate(BAD = case_when(BAD == 1 ~ "Bad", TRUE ~ "Good")) %>% 
  mutate_if(is.character, as.factor) -> hmeq_full

# Distribution of BAD: 

library(knitr)

hmeq_full %>% 
  group_by(BAD) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(percent = 100*n / sum(n)) %>% 
  mutate(percent = round(percent, 2)) -> df_dis

df_dis %>% 
  kable()


library(tidymodels)

set.seed(123)
members_split <- initial_split(hmeq_full, prop = 0.7, strata = BAD)
members_train <- training(members_split)
members_test <- testing(members_split)

set.seed(123)
members_folds <- vfold_cv(members_train, strata = BAD, v = 5, repeats = 3)

library(recipes)
library(themis)

members_rec <- recipe(BAD ~ ., data = members_train) %>%
  step_dummy(all_nominal(), -BAD) %>%
  step_smote(BAD)

glm_spec <- logistic_reg() %>%
  set_engine("glm")

rf_spec <- rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

members_wf <- workflow() %>%
  add_recipe(members_rec)

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

glm_rs <- members_wf %>%
  add_model(glm_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

rf_rs <- members_wf %>%
  add_model(rf_spec) %>%
  fit_resamples(resamples = members_folds,
                metrics = members_metrics,
                control = control_resamples(save_pred = TRUE))

members_final_glm <- members_wf %>%
  add_model(glm_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_glm) -> glm_pd

members_final_rf <- members_wf %>%
  add_model(rf_spec) %>%
  last_fit(members_split)

collect_predictions(members_final_rf) -> rf_pd

calculate_some_metrics <- function(threshold_selected, pd_selected) {
  
  pd_selected %>% 
    mutate(label_predicted_threshold = case_when(.pred_Bad >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_result
  
  sum(df_result$label_predicted_threshold == df_result$BAD) / nrow(df_result) -> accuracy_threshold
  
  df_result %>% filter(BAD == "Bad") -> actual_is_died
  
  sum(actual_is_died$label_predicted_threshold == "Bad") / nrow(actual_is_died) -> sensitivity_threshold
  
  df_result %>% filter(BAD != "Bad") -> actual_is_survived
  
  # Calculate Specificity: 
  sum(actual_is_survived$label_predicted_threshold != "Bad") / nrow(actual_is_survived) -> spec 
  
  # Report results in form of data frame: 
  data.frame(Accuracy = accuracy_threshold, 
             Sensitivity = sensitivity_threshold, 
             Specificity = spec, 
             FPR = 1 - spec, 
             # AUC = roc_auc, 
             Threshold = threshold_selected) -> df_reporting
  return(df_reporting)
}

range_threshold <- c(0.5, seq(0, 1, length.out = 99))

lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = glm_pd)}) -> list_glm
lapply(range_threshold, function(x) {calculate_some_metrics(x, pd_selected = rf_pd)}) -> list_rf

do.call("bind_rows", list_glm) -> glm_results_thres
do.call("bind_rows", list_rf) -> rf_results_thres

bind_rows(glm_results_thres %>% mutate(Model = "GLM"), 
          rf_results_thres %>% mutate(Model = "RF")) -> df_compare

df_compare %>% 
  gather(Metric, Value, -Threshold, -Model) -> df_long

library(pROC)
glm_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), glm_pd$.pred_Bad)$auc %>% round(3)
rf_auc <- roc(case_when(members_test$BAD == "Bad" ~ 1, TRUE ~ 0), rf_pd$.pred_Bad)$auc %>% round(3)

library(extrafont)
theme_set(theme_minimal())
my_font <- "Roboto Condensed"

df_long %>% 
  ggplot(aes(Threshold, Value, color = Model)) + 
  geom_line() + 
  geom_point(data = df_long %>% filter(Threshold == 0.5)) + 
  facet_wrap(~ Metric) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(subtitle = paste0("AUC for GLM = ", glm_auc, ", AUC for RF = ", rf_auc), 
       title = "Figure 1: Model Performance by Threshold based on Test Data", 
       y = NULL)


#========================

calculate_profit <- function(threshold_selected, pd_selected) {
  
  members_test %>% 
    as_tibble() %>% 
    mutate(pd = pd_selected$.pred_Bad) %>% 
    mutate(predicted = case_when(pd >= threshold_selected ~ "Bad", TRUE ~ "Good")) -> df_selected
  
  # Set conditions for calculating average profit at given threshold: 
  n <- 50
  rate <- 0.07
  profit_space <- NULL
  
  # Calculate net profit for each sample randomly selected from test data:
  
  for (j in 1:n) {
    
    set.seed(j)
    
    df_results <- df_selected %>% 
      sample_frac(0.7)
    
    # Profit from true negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Good") %>% 
      mutate(profit = rate*LOAN) %>% 
      pull(profit) %>% 
      sum() -> total_profit
    
    # Loss causes from false negative cases: 
    
    df_results %>% 
      filter(predicted == "Good", BAD == "Bad") %>% 
      pull(LOAN) %>% 
      sum() -> total_loss
    
    # Net profit: 
    net_profit <- total_profit - total_loss
    profit_space <- c(profit_space, net_profit)
    
  }
  
  # Average net profit at given threshold: 
  data.frame(Profit_avg = mean(profit_space), Threshold = threshold_selected) -> df_prof_thres
  return(df_prof_thres)
  
}

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = rf_pd)}) -> list_prof

do.call("bind_rows", list_prof) -> df_prof

df_prof %>% filter(Threshold == 0.5) -> default_prof

df_prof %>% slice(which.max(Profit_avg)) -> max_prof

df_prof %>% 
  ggplot(aes(Threshold, Profit_avg)) + 
  geom_line(color = "#00006E") + 
  geom_point(data = max_prof, color = "red", size = 3) + 
  geom_point(data = default_prof, color = "blue", size = 3) + 
  geom_text(data = max_prof %>% mutate(Profit_avg = 640000), family = my_font,  
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) + 
  geom_text(data = default_prof %>% mutate(Profit_avg = 550000),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) + 
  scale_y_continuous(labels = scales::dollar_format()) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  labs(y = NULL, title = "Figure 2: Maximum Profit by Threshold for Random Forest")

lapply(range_threshold, function(x) {calculate_profit(threshold_selected = x, pd_selected = glm_pd)}) -> list_prof_glm

do.call("bind_rows", list_prof_glm) -> df_prof_glm

df_prof_glm %>% filter(Threshold == 0.5) -> default_prof_glm

df_prof_glm %>% slice(which.max(Profit_avg)) -> max_prof_glm

df_prof_glm %>% 
  ggplot(aes(Threshold, Profit_avg)) + 
  geom_line(color = "#00006E") + 
  geom_point(data = max_prof_glm, color = "red", size = 3) + 
  geom_point(data = default_prof_glm, color = "blue", size = 3) + 
  geom_text(data = max_prof_glm %>% mutate(Threshold = 0.5), family = my_font,  
            aes(label = "Threshold that\nmaximizes Profit"), size = 3.5) + 
  geom_text(data = default_prof_glm %>% mutate(Profit_avg = 290000),
            aes(label = "Profit at\ndefault threshold"), size = 3.5, family = my_font) + 
  scale_y_continuous(labels = scales::dollar_format()) + 
  theme(text = element_text(family = my_font)) + 
  theme(plot.margin = unit(rep(0.8, 4), "cm")) + 
  labs(y = NULL, title = "Figure 3: Maximum Profit by Threshold for Logistic")


```

