Load Libraries

options(scipen=999)

library(tidyverse)  
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)  
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom        0.7.6      ✓ rsample      0.0.9 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.5      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## ● Use tidymodels_prefer() to resolve common conflicts.
library(janitor)    
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(skimr)      
library(vip)        
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(readr)

Stage

loan <- read_csv("loan_defaults.csv") %>%
  clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Current_Loan_Amount = col_double(),
##   Term = col_character(),
##   Credit_Score = col_double(),
##   Annual_Income = col_double(),
##   Years_in_current_job = col_character(),
##   Home_Ownership = col_character(),
##   Purpose = col_character(),
##   Monthly_Debt = col_double(),
##   Years_of_Credit_History = col_double(),
##   Months_since_last_delinquent = col_double(),
##   Number_of_Open_Accounts = col_double(),
##   Number_of_Credit_Problems = col_double(),
##   Current_Credit_Balance = col_double(),
##   Maximum_Open_Credit = col_double(),
##   Bankruptcies = col_double(),
##   Tax_Liens = col_double(),
##   ID = col_double(),
##   loan_default = col_double()
## )
head(loan,10)

Analyze Target

loan %>%
  count(loan_default) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(loan_default, pct, label=sprintf("%1.1f%%",pct*100))) +
  geom_col() +
  geom_text( size = 3, color = "white", position = position_stack(vjust = 0.5)) + 
  labs(title = "Target %",
       x = "Loan_Default",
       y = "%")

Skim your data

loan %>%
  skim_without_charts() %>%
  select(skim_type, skim_variable, n_missing, numeric.mean)
Data summary
Name Piped data
Number of rows 70478
Number of columns 18
_______________________
Column type frequency:
character 4
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing
term 326
years_in_current_job 326
home_ownership 326
purpose 326

Variable type: numeric

skim_variable n_missing mean
current_loan_amount 326 11653456.40
credit_score 13820 1078.91
annual_income 13820 1378525.31
monthly_debt 326 18454.53
years_of_credit_history 326 18.23
months_since_last_delinquent 37541 34.80
number_of_open_accounts 326 11.15
number_of_credit_problems 326 0.17
current_credit_balance 326 294499.65
maximum_open_credit 328 771826.00
bankruptcies 488 0.12
tax_liens 334 0.03
id 0 502701.01
loan_default 0 0.23

Explore

category_eval <- function(var, title){
loan %>%
    count(!!as.name(var), loan_default) %>%
    ggplot(aes(!!as.name(var), n, fill=as.factor(loan_default))) +
    geom_col() +
      coord_flip()+
    labs(title = paste("catefory analysis for:", var, title))
}

category_eval("term", "")

category_eval("years_in_current_job", "")

category_eval("purpose", "")

Explore Categorical Variables

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(term, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = term, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Term") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'term'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(years_in_current_job, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = years_in_current_job, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Years in Current Job") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'years_in_current_job'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(home_ownership, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = home_ownership, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Home Ownership") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'home_ownership'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(purpose, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = purpose, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Purpose") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'purpose'. You can override using the `.groups` argument.

Numeric Exploration

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(current_loan_amount, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does current loan amount impact a loan default? ") +
  ylab("count")+ xlab("current_loan_amount")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(annual_income, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does annual income impact a loan default? ") +
  ylab("count")+ xlab("annual income")
## Warning: Removed 13820 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(monthly_debt, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does monthly debt impact a loan default? ") +
  ylab("count")+ xlab("monthly debt impact")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(years_of_credit_history, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does years of credit history impact a loan default? ") +
  ylab("count")+ xlab("years of credit history")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(months_since_last_delinquent, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does months since last delinquent impact a loan default? ") +
  ylab("count")+ xlab("months since last delinquent")
## Warning: Removed 37541 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(number_of_open_accounts, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does number of open accounts a impact a loan default? ") +
  ylab("count")+ xlab("number of open accounts")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(number_of_credit_problems, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does number of credit problems impact a loan default? ") +
  ylab("count")+ xlab("number of credit problems")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(current_credit_balance, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does current credit balance impact a loan default? ") +
  ylab("count")+ xlab("current credit balance")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(maximum_open_credit, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does maximum open credit impact a loan default? ") +
  ylab("count")+ xlab("maximum open credit")
## Warning: Removed 328 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(bankruptcies, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does bankruptcies impact a loan default? ") +
  ylab("count")+ xlab("bankruptcies")
## Warning: Removed 488 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(tax_liens, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does tax liens a loan default? ") +
  ylab("count")+ xlab("tax liens")
## Warning: Removed 334 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  mutate(credit_score = if_else(credit_score > 1000, credit_score / 10, credit_score)) %>%
  ggplot(aes(credit_score, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does credit score impact a loan default? ") +
  ylab("count")+ xlab("credit score")
## Warning: Removed 13820 rows containing non-finite values (stat_bin).

Correlation

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
 loan_cor <- loan %>%
  na.omit() %>%
  select_if(is.numeric) %>%
  cor() %>%
  melt()
 
 loan_cor %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Correlation to Loan Default",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

 loan_cor %>%
  filter(Var1 == "loan_default") %>%
  ggplot(aes(x=Var1, y=reorder(Var2,value), fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(title = "Correlation to loan_default",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

Transformation

loan_prep <- loan %>%
  mutate_if(is.character, as.factor) %>%
  mutate(loan_default = as.factor(loan_default))
 
loan_prep %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 70478
Number of columns 18
_______________________
Column type frequency:
factor 5
numeric 13
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
term 326 1 FALSE 2 Sho: 50646, Lon: 19506
years_in_current_job 326 1 FALSE 12 10+: 21710, 2 y: 6451, < 1: 5742, 3 y: 5688
home_ownership 326 1 FALSE 4 Hom: 34003, Ren: 29517, Own: 6484, Hav: 148
purpose 326 1 FALSE 16 Deb: 55097, oth: 4237, Hom: 4085, Oth: 2277
loan_default 0 1 FALSE 2 0: 54503, 1: 15975

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
current_loan_amount 326 1.00 11653456.40 31654399.13 10802.0 179272.50 311751.00 524414.00 99999999.0
credit_score 13820 0.80 1078.91 1479.54 585.0 705.00 724.00 741.00 7510.0
annual_income 13820 0.80 1378525.31 1134102.15 81092.0 848502.00 1171150.50 1649732.00 165557393.0
monthly_debt 326 1.00 18454.53 12098.19 0.0 10185.42 16234.83 23997.71 229057.9
years_of_credit_history 326 1.00 18.23 7.03 3.6 13.50 17.00 21.70 65.0
months_since_last_delinquent 37541 0.47 34.80 21.95 0.0 16.00 32.00 51.00 97.0
number_of_open_accounts 326 1.00 11.15 5.02 0.0 8.00 10.00 14.00 76.0
number_of_credit_problems 326 1.00 0.17 0.48 0.0 0.00 0.00 0.00 12.0
current_credit_balance 326 1.00 294499.65 367014.79 0.0 112423.00 209722.00 368091.75 12986956.0
maximum_open_credit 328 1.00 771826.00 8596997.88 0.0 274230.00 468930.00 783480.50 1539737892.0
bankruptcies 488 0.99 0.12 0.35 0.0 0.00 0.00 0.00 7.0
tax_liens 334 1.00 0.03 0.25 0.0 0.00 0.00 0.00 11.0
id 0 1.00 502701.01 290461.22 20.0 251027.50 503130.00 754697.50 1005120.0

Partition 70/30

set.seed(123)

train_test_split <- initial_split(loan_prep, prop = 0.7)

train <- training(train_test_split)

test <- testing(train_test_split)

sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(loan_prep) * 100)
## [1] "Train PCT : 70.00%"
sprintf("Test  PCT : %1.2f%%", nrow(test)/ nrow(loan_prep) * 100)
## [1] "Test  PCT : 30.00%"

Tidymodel

Recipe

loan_recipe <-recipe(loan_default ~ ., data=train) %>%
  step_rm(id) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_impute_mode(all_nominal(), -all_numeric()) %>%
  step_impute_mean(all_numeric(), -all_nominal()) %>%
  step_nzv( unique_cut = 3 ,all_nominal()) %>%
  step_mutate(credit_score = if_else(credit_score>1000, credit_score/10, credit_score)) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  prep()

Bake Recipe

bake_train <- bake(loan_recipe, train)

bake_test <- bake(loan_recipe, test)

sprintf("Column count before baking : %2d ", ncol(train))
## [1] "Column count before baking : 18 "
sprintf("Column count after  baking : %2d", ncol(bake_test))
## [1] "Column count after  baking : 47"

Create and Fit Model

loan_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(loan_default ~ ., data= bake_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glance(loan_model)
tidy(loan_model) %>%
  mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
  arrange(-estimate)
tidy(loan_model) %>%
  mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
  arrange(-abs(-estimate))

Evaluate Model

train_scored <- predict(loan_model, bake_train, type = "prob") %>%
  bind_cols(predict(loan_model, bake_train, type = "class"))  %>%
  bind_cols(train)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_scored <- predict(loan_model, bake_test, type = "prob") %>%
  bind_cols(predict(loan_model, bake_test, type = "class"))  %>%
  bind_cols(test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_scored %>%
  mutate(.part = "test") %>%
  bind_rows(train_scored %>%
              mutate(.part = "train")) %>%
  group_by(.part) %>%
  precision(loan_default, .pred_class)
test_scored %>%
  mutate(.part = "test") %>%
  bind_rows(train_scored %>%
              mutate(.part = "train")) %>%
  group_by(.part) %>%
  metrics(loan_default, .pred_class) %>%
  filter(.metric == "accuracy")
test_scored %>%
  ggplot(aes(x=.pred_1, fill=loan_default)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.8, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.25, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")

test_scored %>%
  conf_mat(loan_default, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title="Confusion Matrix") 

vip(loan_model, 10) +
  labs(title="Variable Importance")