Overview

In this project the objective is to create a supervised learning model to predict if a loan will default. There are 1,000 loan records and 16 variables to use to predict if the loan will default. The variables include both numeric and character features.

Packages

library(dplyr)
library(caret)
library(gmodels)
library(C50)

Features

#read in data
credit <- read.csv("credit.csv")%>%
  mutate(default = as.factor(default))

colnames(credit)
##  [1] "checking_balance"     "months_loan_duration" "credit_history"      
##  [4] "purpose"              "amount"               "savings_balance"     
##  [7] "employment_duration"  "percent_of_income"    "years_at_residence"  
## [10] "age"                  "other_credit"         "housing"             
## [13] "existing_loans_count" "job"                  "dependents"          
## [16] "phone"                "default"
head(credit)
##   checking_balance months_loan_duration credit_history              purpose
## 1           < 0 DM                    6       critical furniture/appliances
## 2       1 - 200 DM                   48           good furniture/appliances
## 3          unknown                   12       critical            education
## 4           < 0 DM                   42           good furniture/appliances
## 5           < 0 DM                   24           poor                  car
## 6          unknown                   36           good            education
##   amount savings_balance employment_duration percent_of_income
## 1   1169         unknown           > 7 years                 4
## 2   5951        < 100 DM         1 - 4 years                 2
## 3   2096        < 100 DM         4 - 7 years                 2
## 4   7882        < 100 DM         4 - 7 years                 2
## 5   4870        < 100 DM         1 - 4 years                 3
## 6   9055         unknown         1 - 4 years                 2
##   years_at_residence age other_credit housing existing_loans_count       job
## 1                  4  67         none     own                    2   skilled
## 2                  2  22         none     own                    1   skilled
## 3                  3  49         none     own                    1 unskilled
## 4                  4  45         none   other                    1   skilled
## 5                  4  53         none   other                    2   skilled
## 6                  4  35         none   other                    1 unskilled
##   dependents phone default
## 1          1   yes      no
## 2          1    no     yes
## 3          2    no      no
## 4          2    no      no
## 5          2    no     yes
## 6          2   yes      no

Create Train & Test Data

Using 80-20 split for train versus test

set.seed(123)
train_index <- createDataPartition(credit$default, times = 1, p = 0.80, list = FALSE)
credit_train <- credit[train_index,]
credit_test <- credit[-train_index,]

Create error cost to add to model

Cost of error will weight errors, in this case we are weighting a false negative 4 times as much as a false positive. In other words, the cost of default is 4 times the cost of a missed opportunity by not providing a loan.

matrix_dimensions <- list(c("no", "yes"), c("no", "yes"))
names(matrix_dimensions) <- c("predicted", "actual")

error_cost <- matrix(c(0,1,10,0), nrow = 2, dimnames = matrix_dimensions)
error_cost
##          actual
## predicted no yes
##       no   0  10
##       yes  1   0

Create two c.50 Modelโ€™s one with error cost and one without

modelc50 <- C5.0(credit_train[-17], credit_train$default)

modelc50cost <- C5.0(credit_train[-17], credit_train$default, costs = error_cost)

Run both models on test data

credit_pred <- predict(modelc50, credit_test)
credit_cost_pred <- predict(modelc50cost, credit_test)

C50 model results

Accuracy

confusionMatrix(credit_pred, credit_test$default)$overall["Accuracy"] 
## Accuracy 
##     0.75

False Positive Rate %

1 - confusionMatrix(credit_pred, credit_test$default)$byClass["Pos Pred Value"] 
## Pos Pred Value 
##      0.1959459

False Negative Rate %

1 - confusionMatrix(credit_pred, credit_test$default)$byClass["Neg Pred Value"] 
## Neg Pred Value 
##      0.4038462

C50 error cost model results

Accuracy

confusionMatrix(credit_cost_pred, credit_test$default)$overall["Accuracy"] 
## Accuracy 
##     0.52

False Positive Rate %

1 - confusionMatrix(credit_cost_pred, credit_test$default)$byClass["Pos Pred Value"] 
## Pos Pred Value 
##     0.04166667

False Negative Rate %

1 - confusionMatrix(credit_cost_pred, credit_test$default)$byClass["Neg Pred Value"] 
## Neg Pred Value 
##      0.6184211

Above we can see that using a 10:1 cost ratio reduces model performance significantly, because the model is optimizing True positives. By reducing the false positive rate we increase the false negative rate. In other words, the model is reducing the number of defaults by being more selective. From a business application standpoint, this would mean turning away more customers for the purpose of minimizing risk of defaults. The proper error cost ratio would be selected depending on risk management guidelines as well as loan interest rate.

Visualizing the impact of cost of error ratios

#loop results for different cost of errors to determine necessary level of risk mitigation
ec_seq <- seq(2,20)

Results_per_EC=NULL
for (i in ec_seq) {
  error_cost <- matrix(c(0,1,i,0), nrow = 2, dimnames = matrix_dimensions)
  credit_cost <- C5.0(credit_train[-17], credit_train$default, costs = error_cost)
  credit_cost_pred <- predict(credit_cost, credit_test)
  EC <- i
  FALSE_POSITIVE <- prop.table(table(Predicted = credit_cost_pred, Actual = credit_test$default))[2,1]
  FALSE_NEGATIVE <- prop.table(table(Predicted = credit_cost_pred, Actual = credit_test$default))[1,2]
  OVERALL_ACCURACY <- confusionMatrix(credit_cost_pred, credit_test$default)$overall["Accuracy"] 
  Results_per_EC = rbind(Results_per_EC, data.frame(EC, FALSE_POSITIVE, FALSE_NEGATIVE, OVERALL_ACCURACY))
} 

#plot
Results_per_EC %>%
ggplot()+
  geom_line(aes(x = EC, y = FALSE_POSITIVE, color = "blue"), size = 2)+
geom_line(aes(x = EC, y = FALSE_NEGATIVE, color = "orange"), size = 2)+
  geom_line(aes(x = EC, y = OVERALL_ACCURACY, color = "black"), size = 2)+
  geom_hline(yintercept=0, linetype="dashed", color = "black")+
  scale_color_discrete(name = "Model Results", labels = c("OVERALL_ACCURACY", "FALSE_POSITIVE", "FALSE_NEGATIVE"))+
  scale_x_continuous(breaks = c(seq(2,20)))+
  scale_y_continuous(breaks = seq(0,1, by = .05))+
  labs(
    title = "Cost of Loan Risk",
    x = "Cost of Error Ratio (cost of default / cost of missed opportunity",
    y = "Percent"
  )

In the chart above we can see that we could reduce defaults in our test data to 0% by setting cost of error to 14. However, this would not be needed because interest rates are used as a risk mitigation instrument. To optimize profit a lower cost of error would be selected.