Final Project

Load Libraries

options(scipen=999)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.6      ✓ rsample      0.0.9 
## ✓ dials        0.0.9      ✓ tune         0.1.5 
## ✓ infer        0.5.4      ✓ workflows    0.2.2 
## ✓ modeldata    0.1.0      ✓ workflowsets 0.0.2 
## ✓ parsnip      0.1.5      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## ● Use tidymodels_prefer() to resolve common conflicts.

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)      
library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(readr)

Stage

loan <- read_csv("loan_defaults.csv") %>%
  clean_names()

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Current_Loan_Amount = col_double(),
##   Term = col_character(),
##   Credit_Score = col_double(),
##   Annual_Income = col_double(),
##   Years_in_current_job = col_character(),
##   Home_Ownership = col_character(),
##   Purpose = col_character(),
##   Monthly_Debt = col_double(),
##   Years_of_Credit_History = col_double(),
##   Months_since_last_delinquent = col_double(),
##   Number_of_Open_Accounts = col_double(),
##   Number_of_Credit_Problems = col_double(),
##   Current_Credit_Balance = col_double(),
##   Maximum_Open_Credit = col_double(),
##   Bankruptcies = col_double(),
##   Tax_Liens = col_double(),
##   ID = col_double(),
##   loan_default = col_double()
## )

head(loan,10)

Analyze Target

loan %>%
  count(loan_default) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(loan_default, pct, label=sprintf("%1.1f%%",pct*100))) +
  geom_col() +
  geom_text( size = 3, color = "white", position = position_stack(vjust = 0.5)) + 
  labs(title = "Target %",
       x = "Loan_Default",
       y = "%")

Skim your data

loan %>%
  skim_without_charts() %>%
  select(skim_type, skim_variable, n_missing, numeric.mean)

Data summary
Name	Piped data
Number of rows	70478
Number of columns	18
_______________________
Column type frequency:
character	4
numeric	14
________________________
Group variables	None

Variable type: character

skim_variable	n_missing
term	326
years_in_current_job	326
home_ownership	326
purpose	326

Variable type: numeric

skim_variable	n_missing	mean
current_loan_amount	326	11653456.40
credit_score	13820	1078.91
annual_income	13820	1378525.31
monthly_debt	326	18454.53
years_of_credit_history	326	18.23
months_since_last_delinquent	37541	34.80
number_of_open_accounts	326	11.15
number_of_credit_problems	326	0.17
current_credit_balance	326	294499.65
maximum_open_credit	328	771826.00
bankruptcies	488	0.12
tax_liens	334	0.03
id	0	502701.01
loan_default	0	0.23

Explore

category_eval <- function(var, title){
loan %>%
    count(!!as.name(var), loan_default) %>%
    ggplot(aes(!!as.name(var), n, fill=as.factor(loan_default))) +
    geom_col() +
      coord_flip()+
    labs(title = paste("catefory analysis for:", var, title))
}

category_eval("term", "")

category_eval("years_in_current_job", "")

category_eval("purpose", "")

Explore Categorical Variables

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(term, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = term, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Term") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'term'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(years_in_current_job, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = years_in_current_job, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Years in Current Job") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'years_in_current_job'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(home_ownership, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = home_ownership, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Home Ownership") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'home_ownership'. You can override using the `.groups` argument.

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  group_by(purpose, loan_default) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(y = pct, x = purpose, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
  geom_col() +
  geom_text( size = 3, position = position_stack(vjust = 0.5)) + 
  labs(title = "Loan by Purpose") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## `summarise()` has grouped output by 'purpose'. You can override using the `.groups` argument.

Numeric Exploration

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(current_loan_amount, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does current loan amount impact a loan default? ") +
  ylab("count")+ xlab("current_loan_amount")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(annual_income, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does annual income impact a loan default? ") +
  ylab("count")+ xlab("annual income")

## Warning: Removed 13820 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(monthly_debt, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does monthly debt impact a loan default? ") +
  ylab("count")+ xlab("monthly debt impact")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(years_of_credit_history, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does years of credit history impact a loan default? ") +
  ylab("count")+ xlab("years of credit history")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(months_since_last_delinquent, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does months since last delinquent impact a loan default? ") +
  ylab("count")+ xlab("months since last delinquent")

## Warning: Removed 37541 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(number_of_open_accounts, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does number of open accounts a impact a loan default? ") +
  ylab("count")+ xlab("number of open accounts")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(number_of_credit_problems, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does number of credit problems impact a loan default? ") +
  ylab("count")+ xlab("number of credit problems")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(current_credit_balance, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does current credit balance impact a loan default? ") +
  ylab("count")+ xlab("current credit balance")

## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(maximum_open_credit, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does maximum open credit impact a loan default? ") +
  ylab("count")+ xlab("maximum open credit")

## Warning: Removed 328 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(bankruptcies, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does bankruptcies impact a loan default? ") +
  ylab("count")+ xlab("bankruptcies")

## Warning: Removed 488 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  ggplot(aes(tax_liens, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does tax liens a loan default? ") +
  ylab("count")+ xlab("tax liens")

## Warning: Removed 334 rows containing non-finite values (stat_bin).

loan %>%
  mutate(loan_default = as.factor(loan_default)) %>%
  mutate(credit_score = if_else(credit_score > 1000, credit_score / 10, credit_score)) %>%
  ggplot(aes(credit_score, fill = loan_default)) +
  geom_histogram( bins=35, ) + 
  labs(title = "No Fill: Does credit score impact a loan default? ") +
  ylab("count")+ xlab("credit score")

## Warning: Removed 13820 rows containing non-finite values (stat_bin).

Correlation

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

 loan_cor <- loan %>%
  na.omit() %>%
  select_if(is.numeric) %>%
  cor() %>%
  melt()
 
 loan_cor %>%
  ggplot(aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Correlation to Loan Default",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

 loan_cor %>%
  filter(Var1 == "loan_default") %>%
  ggplot(aes(x=Var1, y=reorder(Var2,value), fill=value)) +
  geom_tile() +
  scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
  geom_text(aes(label=round(value,2)), color="black") +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(title = "Correlation to loan_default",
       subtitle = "exclude varibles w. abs(correlation) < .2?",
       y = "",
       x = "")

Transformation

loan_prep <- loan %>%
  mutate_if(is.character, as.factor) %>%
  mutate(loan_default = as.factor(loan_default))
 
loan_prep %>%
  skim_without_charts()

Data summary
Name	Piped data
Number of rows	70478
Number of columns	18
_______________________
Column type frequency:
factor	5
numeric	13
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
term	326	1	FALSE	2	Sho: 50646, Lon: 19506
years_in_current_job	326	1	FALSE	12	10+: 21710, 2 y: 6451, < 1: 5742, 3 y: 5688
home_ownership	326	1	FALSE	4	Hom: 34003, Ren: 29517, Own: 6484, Hav: 148
purpose	326	1	FALSE	16	Deb: 55097, oth: 4237, Hom: 4085, Oth: 2277
loan_default	0	1	FALSE	2	0: 54503, 1: 15975

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
current_loan_amount	326	1.00	11653456.40	31654399.13	10802.0	179272.50	311751.00	524414.00	99999999.0
credit_score	13820	0.80	1078.91	1479.54	585.0	705.00	724.00	741.00	7510.0
annual_income	13820	0.80	1378525.31	1134102.15	81092.0	848502.00	1171150.50	1649732.00	165557393.0
monthly_debt	326	1.00	18454.53	12098.19	0.0	10185.42	16234.83	23997.71	229057.9
years_of_credit_history	326	1.00	18.23	7.03	3.6	13.50	17.00	21.70	65.0
months_since_last_delinquent	37541	0.47	34.80	21.95	0.0	16.00	32.00	51.00	97.0
number_of_open_accounts	326	1.00	11.15	5.02	0.0	8.00	10.00	14.00	76.0
number_of_credit_problems	326	1.00	0.17	0.48	0.0	0.00	0.00	0.00	12.0
current_credit_balance	326	1.00	294499.65	367014.79	0.0	112423.00	209722.00	368091.75	12986956.0
maximum_open_credit	328	1.00	771826.00	8596997.88	0.0	274230.00	468930.00	783480.50	1539737892.0
bankruptcies	488	0.99	0.12	0.35	0.0	0.00	0.00	0.00	7.0
tax_liens	334	1.00	0.03	0.25	0.0	0.00	0.00	0.00	11.0
id	0	1.00	502701.01	290461.22	20.0	251027.50	503130.00	754697.50	1005120.0

Partition 70/30

set.seed(123)

train_test_split <- initial_split(loan_prep, prop = 0.7)

train <- training(train_test_split)

test <- testing(train_test_split)

sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(loan_prep) * 100)

## [1] "Train PCT : 70.00%"

sprintf("Test  PCT : %1.2f%%", nrow(test)/ nrow(loan_prep) * 100)

## [1] "Test  PCT : 30.00%"

Tidymodel

Recipe

loan_recipe <-recipe(loan_default ~ ., data=train) %>%
  step_rm(id) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_impute_mode(all_nominal(), -all_numeric()) %>%
  step_impute_mean(all_numeric(), -all_nominal()) %>%
  step_nzv( unique_cut = 3 ,all_nominal()) %>%
  step_mutate(credit_score = if_else(credit_score>1000, credit_score/10, credit_score)) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  prep()

Bake Recipe

bake_train <- bake(loan_recipe, train)

bake_test <- bake(loan_recipe, test)

sprintf("Column count before baking : %2d ", ncol(train))

## [1] "Column count before baking : 18 "

sprintf("Column count after  baking : %2d", ncol(bake_test))

## [1] "Column count after  baking : 47"

Create and Fit Model

loan_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(loan_default ~ ., data= bake_train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

glance(loan_model)

tidy(loan_model) %>%
  mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
  arrange(-estimate)

tidy(loan_model) %>%
  mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
  arrange(-abs(-estimate))

Evaluate Model

train_scored <- predict(loan_model, bake_train, type = "prob") %>%
  bind_cols(predict(loan_model, bake_train, type = "class"))  %>%
  bind_cols(train)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_scored <- predict(loan_model, bake_test, type = "prob") %>%
  bind_cols(predict(loan_model, bake_test, type = "class"))  %>%
  bind_cols(test)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_scored %>%
  mutate(.part = "test") %>%
  bind_rows(train_scored %>%
              mutate(.part = "train")) %>%
  group_by(.part) %>%
  precision(loan_default, .pred_class)

test_scored %>%
  mutate(.part = "test") %>%
  bind_rows(train_scored %>%
              mutate(.part = "train")) %>%
  group_by(.part) %>%
  metrics(loan_default, .pred_class) %>%
  filter(.metric == "accuracy")

test_scored %>%
  ggplot(aes(x=.pred_1, fill=loan_default)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = 0.2) +
  annotate(geom="text", x=0.8, y=1000, label="True Positive & False Positives",
              color="red") +
  annotate(geom="text", x=0.25, y=1000, label="True Negatives & False Negatives",
              color="blue") + 
  labs(title="Test score distribution",
       x = "predicted probability",
       y = "count")

test_scored %>%
  conf_mat(loan_default, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title="Confusion Matrix")

vip(loan_model, 10) +
  labs(title="Variable Importance")

Final Project

Christopher Morris

Load Libraries

Stage

Analyze Target

Skim your data

Explore

Explore Categorical Variables

Numeric Exploration

Correlation

Transformation

Partition 70/30

Tidymodel

Recipe

Bake Recipe

Create and Fit Model

Evaluate Model