Load Libraries
options(scipen=999)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom 0.7.6 ✓ rsample 0.0.9
## ✓ dials 0.0.9 ✓ tune 0.1.5
## ✓ infer 0.5.4 ✓ workflows 0.2.2
## ✓ modeldata 0.1.0 ✓ workflowsets 0.0.2
## ✓ parsnip 0.1.5 ✓ yardstick 0.0.8
## ✓ recipes 0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## ● Use tidymodels_prefer() to resolve common conflicts.
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(readr)
Stage
loan <- read_csv("loan_defaults.csv") %>%
clean_names()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Current_Loan_Amount = col_double(),
## Term = col_character(),
## Credit_Score = col_double(),
## Annual_Income = col_double(),
## Years_in_current_job = col_character(),
## Home_Ownership = col_character(),
## Purpose = col_character(),
## Monthly_Debt = col_double(),
## Years_of_Credit_History = col_double(),
## Months_since_last_delinquent = col_double(),
## Number_of_Open_Accounts = col_double(),
## Number_of_Credit_Problems = col_double(),
## Current_Credit_Balance = col_double(),
## Maximum_Open_Credit = col_double(),
## Bankruptcies = col_double(),
## Tax_Liens = col_double(),
## ID = col_double(),
## loan_default = col_double()
## )
head(loan,10)
Analyze Target
loan %>%
count(loan_default) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(loan_default, pct, label=sprintf("%1.1f%%",pct*100))) +
geom_col() +
geom_text( size = 3, color = "white", position = position_stack(vjust = 0.5)) +
labs(title = "Target %",
x = "Loan_Default",
y = "%")

Skim your data
loan %>%
skim_without_charts() %>%
select(skim_type, skim_variable, n_missing, numeric.mean)
Data summary
| Name |
Piped data |
| Number of rows |
70478 |
| Number of columns |
18 |
| _______________________ |
|
| Column type frequency: |
|
| character |
4 |
| numeric |
14 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| term |
326 |
| years_in_current_job |
326 |
| home_ownership |
326 |
| purpose |
326 |
Variable type: numeric
| current_loan_amount |
326 |
11653456.40 |
| credit_score |
13820 |
1078.91 |
| annual_income |
13820 |
1378525.31 |
| monthly_debt |
326 |
18454.53 |
| years_of_credit_history |
326 |
18.23 |
| months_since_last_delinquent |
37541 |
34.80 |
| number_of_open_accounts |
326 |
11.15 |
| number_of_credit_problems |
326 |
0.17 |
| current_credit_balance |
326 |
294499.65 |
| maximum_open_credit |
328 |
771826.00 |
| bankruptcies |
488 |
0.12 |
| tax_liens |
334 |
0.03 |
| id |
0 |
502701.01 |
| loan_default |
0 |
0.23 |
Explore
category_eval <- function(var, title){
loan %>%
count(!!as.name(var), loan_default) %>%
ggplot(aes(!!as.name(var), n, fill=as.factor(loan_default))) +
geom_col() +
coord_flip()+
labs(title = paste("catefory analysis for:", var, title))
}
category_eval("term", "")

category_eval("years_in_current_job", "")

category_eval("purpose", "")

Explore Categorical Variables
loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
group_by(term, loan_default) %>%
summarise(n = n()) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(y = pct, x = term, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
geom_col() +
geom_text( size = 3, position = position_stack(vjust = 0.5)) +
labs(title = "Loan by Term") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'term'. You can override using the `.groups` argument.

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
group_by(years_in_current_job, loan_default) %>%
summarise(n = n()) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(y = pct, x = years_in_current_job, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
geom_col() +
geom_text( size = 3, position = position_stack(vjust = 0.5)) +
labs(title = "Loan by Years in Current Job") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'years_in_current_job'. You can override using the `.groups` argument.

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
group_by(home_ownership, loan_default) %>%
summarise(n = n()) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(y = pct, x = home_ownership, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
geom_col() +
geom_text( size = 3, position = position_stack(vjust = 0.5)) +
labs(title = "Loan by Home Ownership") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'home_ownership'. You can override using the `.groups` argument.

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
group_by(purpose, loan_default) %>%
summarise(n = n()) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(y = pct, x = purpose, label=sprintf("%1.1f%%",pct*100), fill = loan_default)) +
geom_col() +
geom_text( size = 3, position = position_stack(vjust = 0.5)) +
labs(title = "Loan by Purpose") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'purpose'. You can override using the `.groups` argument.

Numeric Exploration
loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(current_loan_amount, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does current loan amount impact a loan default? ") +
ylab("count")+ xlab("current_loan_amount")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(annual_income, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does annual income impact a loan default? ") +
ylab("count")+ xlab("annual income")
## Warning: Removed 13820 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(monthly_debt, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does monthly debt impact a loan default? ") +
ylab("count")+ xlab("monthly debt impact")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(years_of_credit_history, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does years of credit history impact a loan default? ") +
ylab("count")+ xlab("years of credit history")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(months_since_last_delinquent, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does months since last delinquent impact a loan default? ") +
ylab("count")+ xlab("months since last delinquent")
## Warning: Removed 37541 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(number_of_open_accounts, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does number of open accounts a impact a loan default? ") +
ylab("count")+ xlab("number of open accounts")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(number_of_credit_problems, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does number of credit problems impact a loan default? ") +
ylab("count")+ xlab("number of credit problems")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(current_credit_balance, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does current credit balance impact a loan default? ") +
ylab("count")+ xlab("current credit balance")
## Warning: Removed 326 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(maximum_open_credit, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does maximum open credit impact a loan default? ") +
ylab("count")+ xlab("maximum open credit")
## Warning: Removed 328 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(bankruptcies, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does bankruptcies impact a loan default? ") +
ylab("count")+ xlab("bankruptcies")
## Warning: Removed 488 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
ggplot(aes(tax_liens, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does tax liens a loan default? ") +
ylab("count")+ xlab("tax liens")
## Warning: Removed 334 rows containing non-finite values (stat_bin).

loan %>%
mutate(loan_default = as.factor(loan_default)) %>%
mutate(credit_score = if_else(credit_score > 1000, credit_score / 10, credit_score)) %>%
ggplot(aes(credit_score, fill = loan_default)) +
geom_histogram( bins=35, ) +
labs(title = "No Fill: Does credit score impact a loan default? ") +
ylab("count")+ xlab("credit score")
## Warning: Removed 13820 rows containing non-finite values (stat_bin).

Correlation
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
loan_cor <- loan %>%
na.omit() %>%
select_if(is.numeric) %>%
cor() %>%
melt()
loan_cor %>%
ggplot(aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
geom_text(aes(label=round(value,2)), color="black") +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "Correlation to Loan Default",
subtitle = "exclude varibles w. abs(correlation) < .2?",
y = "",
x = "")

loan_cor %>%
filter(Var1 == "loan_default") %>%
ggplot(aes(x=Var1, y=reorder(Var2,value), fill=value)) +
geom_tile() +
scale_fill_gradient2(mid="#FBFEF9",low="#0C6291",high="#A63446") +
geom_text(aes(label=round(value,2)), color="black") +
theme(axis.text.x = element_text(angle = 0)) +
labs(title = "Correlation to loan_default",
subtitle = "exclude varibles w. abs(correlation) < .2?",
y = "",
x = "")

Partition 70/30
set.seed(123)
train_test_split <- initial_split(loan_prep, prop = 0.7)
train <- training(train_test_split)
test <- testing(train_test_split)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(loan_prep) * 100)
## [1] "Train PCT : 70.00%"
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(loan_prep) * 100)
## [1] "Test PCT : 30.00%"
Tidymodel
Recipe
loan_recipe <-recipe(loan_default ~ ., data=train) %>%
step_rm(id) %>%
step_novel(all_nominal(), -all_outcomes()) %>%
step_impute_mode(all_nominal(), -all_numeric()) %>%
step_impute_mean(all_numeric(), -all_nominal()) %>%
step_nzv( unique_cut = 3 ,all_nominal()) %>%
step_mutate(credit_score = if_else(credit_score>1000, credit_score/10, credit_score)) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
prep()
Bake Recipe
bake_train <- bake(loan_recipe, train)
bake_test <- bake(loan_recipe, test)
sprintf("Column count before baking : %2d ", ncol(train))
## [1] "Column count before baking : 18 "
sprintf("Column count after baking : %2d", ncol(bake_test))
## [1] "Column count after baking : 47"
Create and Fit Model
loan_model <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm") %>%
fit(loan_default ~ ., data= bake_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glance(loan_model)
tidy(loan_model) %>%
mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
arrange(-estimate)
tidy(loan_model) %>%
mutate_at(c("estimate","std.error", "statistic", "p.value"),round, 4) %>%
arrange(-abs(-estimate))
Evaluate Model
train_scored <- predict(loan_model, bake_train, type = "prob") %>%
bind_cols(predict(loan_model, bake_train, type = "class")) %>%
bind_cols(train)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_scored <- predict(loan_model, bake_test, type = "prob") %>%
bind_cols(predict(loan_model, bake_test, type = "class")) %>%
bind_cols(test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_scored %>%
mutate(.part = "test") %>%
bind_rows(train_scored %>%
mutate(.part = "train")) %>%
group_by(.part) %>%
precision(loan_default, .pred_class)
test_scored %>%
mutate(.part = "test") %>%
bind_rows(train_scored %>%
mutate(.part = "train")) %>%
group_by(.part) %>%
metrics(loan_default, .pred_class) %>%
filter(.metric == "accuracy")
test_scored %>%
ggplot(aes(x=.pred_1, fill=loan_default)) +
geom_histogram(bins=100) +
geom_vline(xintercept = 0.2) +
annotate(geom="text", x=0.8, y=1000, label="True Positive & False Positives",
color="red") +
annotate(geom="text", x=0.25, y=1000, label="True Negatives & False Negatives",
color="blue") +
labs(title="Test score distribution",
x = "predicted probability",
y = "count")

test_scored %>%
conf_mat(loan_default, .pred_class) %>%
autoplot(type = "heatmap") +
labs(title="Confusion Matrix")

vip(loan_model, 10) +
labs(title="Variable Importance")
