Link to the dataset: https://www.kaggle.com/datasets/itssuru/loan-data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
loan <- read.csv("loan_data.csv", stringsAsFactors = T)
head(loan)
  • ‘credit.policy’: 1 if the customer meets the credit underwriting criteria of LendingClub.com, and 0 otherwise.
  • ‘purpose’: The purpose of the loan (takes the values “credit_card”, “debt_consolidation”, “educational”, “home_improvement”, “major_purchase”, “small_business”, and “all_other”).
  • ‘int.rate’: The interest rate of the loan, as a proportion (a rate of 11% would be stored as 0.11). Borrowers judged by LendingClub.com to be more risky are assigned higher interest rates.
  • ‘installment’: The monthly installments owed by the borrower if the loan is funded.
  • ‘log.annual.inc’: The natural log of the self-reported annual income of the borrower.
  • ‘dti’: The debt-to-income ratio of the borrower (amount of debt divided by annual income).
  • ‘fico’: The FICO credit score of the borrower.
  • ‘days.with.cr.line’: The number of days the borrower has had a credit line.
  • ‘revol.bal’: The borrower’s revolving balance (amount unpaid at the end of the credit card billing cycle).
  • ‘revol.util’: The borrower’s revolving line utilization rate (the amount of the credit line used relative to total credit available).
  • ‘inq.last.6mths’: The borrower’s number of inquiries by creditors in the last 6 months.
  • ‘delinq.2yrs’: The number of times the borrower had been 30+ days past due on a payment in the past 2 years.
  • ‘pub.rec’: The borrower’s number of derogatory public records (bankruptcy filings, tax liens, or judgments).

EDA

table(loan$not.fully.paid)
## 
##    0    1 
## 8045 1533

The data seems imbalanced. We will try modelling with both the imbalanced data and a balanced version of it (obtained by downsampling, and later upsampling).
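
Since ggplot2 is already loaded, a quick bar chart makes the imbalance easy to see (a minimal sketch, purely illustrative; it shows the same counts as the table above):

ggplot(loan, aes(x = factor(not.fully.paid))) +
  geom_bar() +
  labs(x = "not.fully.paid", y = "count",
       title = "Class distribution of the target variable")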

anyNA(loan)
## [1] FALSE
sum(duplicated(loan))
## [1] 0

The data has no missing values (NA) and no duplicated rows.
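
For completeness, the same check can be done per column; given the anyNA() result above, this should show zero for every variable (a minimal sketch):

colSums(is.na(loan)) # number of missing values in each column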

Wrangling

Changing the data type of the target column to factor

loan <- loan %>% mutate(not.fully.paid = as.factor(not.fully.paid))

Dividing the data into train and test sets

set.seed(100)
insample <- sample(nrow(loan), nrow(loan) * 0.8)
train_loan <- loan[insample, ]
test_loan <- loan[-insample, ]
prop.table(table(train_loan$not.fully.paid))
## 
##        0        1 
## 0.839598 0.160402
prop.table(table(test_loan$not.fully.paid))
## 
##         0         1 
## 0.8413361 0.1586639
table(train_loan$not.fully.paid)
## 
##    0    1 
## 6433 1229
table(test_loan$not.fully.paid)
## 
##    0    1 
## 1612  304

Modelling

Logistic Regression

Model using all columns as predictor

risk_model_all <- glm(formula = not.fully.paid ~ ., data = train_loan, family = "binomial")

summary(risk_model_all)
## 
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = train_loan)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3842  -0.6237  -0.4977  -0.3581   2.5250  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                9.026e+00  1.451e+00   6.221 4.95e-10 ***
## credit.policy             -2.913e-01  9.384e-02  -3.104 0.001907 ** 
## purposecredit_card        -6.398e-01  1.235e-01  -5.183 2.19e-07 ***
## purposedebt_consolidation -3.755e-01  8.577e-02  -4.378 1.20e-05 ***
## purposeeducational        -5.298e-03  1.744e-01  -0.030 0.975768    
## purposehome_improvement   -5.285e-02  1.436e-01  -0.368 0.712945    
## purposemajor_purchase     -3.555e-01  1.809e-01  -1.966 0.049354 *  
## purposesmall_business      4.340e-01  1.306e-01   3.324 0.000889 ***
## int.rate                   2.031e+00  1.931e+00   1.052 0.292879    
## installment                1.184e-03  1.950e-04   6.073 1.26e-09 ***
## log.annual.inc            -4.091e-01  6.736e-02  -6.073 1.26e-09 ***
## dti                        1.857e-03  5.090e-03   0.365 0.715186    
## fico                      -9.690e-03  1.596e-03  -6.072 1.27e-09 ***
## days.with.cr.line          1.081e-05  1.514e-05   0.714 0.475483    
## revol.bal                  3.322e-06  1.051e-06   3.160 0.001577 ** 
## revol.util                 2.459e-03  1.425e-03   1.726 0.084390 .  
## inq.last.6mths             8.049e-02  1.534e-02   5.248 1.54e-07 ***
## delinq.2yrs               -1.148e-01  6.190e-02  -1.855 0.063644 .  
## pub.rec                    2.179e-01  1.110e-01   1.962 0.049732 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6747.7  on 7661  degrees of freedom
## Residual deviance: 6285.9  on 7643  degrees of freedom
## AIC: 6323.9
## 
## Number of Fisher Scoring iterations: 5

Performing backward stepwise selection (based on AIC) to keep only the most useful predictors

risk_model_backward <- step(object=risk_model_all,
                       direction="backward", 
                       trace = F)

summary(risk_model_backward)
## 
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + installment + 
##     log.annual.inc + fico + revol.bal + revol.util + inq.last.6mths + 
##     delinq.2yrs + pub.rec, family = "binomial", data = train_loan)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4227  -0.6236  -0.4990  -0.3581   2.5455  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                9.779e+00  1.075e+00   9.098  < 2e-16 ***
## credit.policy             -2.918e-01  9.372e-02  -3.114 0.001847 ** 
## purposecredit_card        -6.406e-01  1.228e-01  -5.218 1.81e-07 ***
## purposedebt_consolidation -3.724e-01  8.502e-02  -4.380 1.19e-05 ***
## purposeeducational        -3.180e-03  1.744e-01  -0.018 0.985453    
## purposehome_improvement   -4.548e-02  1.435e-01  -0.317 0.751294    
## purposemajor_purchase     -3.539e-01  1.808e-01  -1.957 0.050333 .  
## purposesmall_business      4.657e-01  1.269e-01   3.670 0.000242 ***
## installment                1.267e-03  1.790e-04   7.075 1.49e-12 ***
## log.annual.inc            -4.016e-01  6.540e-02  -6.141 8.21e-10 ***
## fico                      -1.049e-02  1.232e-03  -8.515  < 2e-16 ***
## revol.bal                  3.488e-06  1.031e-06   3.384 0.000715 ***
## revol.util                 2.861e-03  1.389e-03   2.059 0.039463 *  
## inq.last.6mths             8.217e-02  1.534e-02   5.358 8.42e-08 ***
## delinq.2yrs               -1.061e-01  6.048e-02  -1.755 0.079320 .  
## pub.rec                    2.272e-01  1.099e-01   2.067 0.038700 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6747.7  on 7661  degrees of freedom
## Residual deviance: 6287.8  on 7646  degrees of freedom
## AIC: 6319.8
## 
## Number of Fisher Scoring iterations: 5

Model Evaluation

prediction_prop <- predict(risk_model_backward, newdata = test_loan, type = "response")
threshold <- 0.15
prediction_prop <- ifelse(prediction_prop > threshold, 1, 0)
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
confusionMatrix(data = as.factor(prediction_prop),
                reference = as.factor(test_loan$not.fully.paid),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 967 109
##          1 645 195
##                                           
##                Accuracy : 0.6065          
##                  95% CI : (0.5842, 0.6284)
##     No Information Rate : 0.8413          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1407          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6414          
##             Specificity : 0.5999          
##          Pos Pred Value : 0.2321          
##          Neg Pred Value : 0.8987          
##              Prevalence : 0.1587          
##          Detection Rate : 0.1018          
##    Detection Prevalence : 0.4384          
##       Balanced Accuracy : 0.6207          
##                                           
##        'Positive' Class : 1               
## 

From this model, the metrics to focus on are either accuracy or sensitivity. In my point of view, it depends on the company’s strategy. If the company wants to be more careful with its money, it can increase the sensitivity by decreasing the threshold. This helps prevent lending money to customers who will fail to pay it back. The drawback of this strategy is that the false positive rate will also increase: more people will be predicted as unable to pay back even though they would actually pay back in full. This means fewer people will be given loans than should be, which leads to less income for the company.
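
To make this trade-off concrete, we can sweep a few candidate thresholds and watch sensitivity and accuracy move in opposite directions (a minimal sketch; the threshold values are arbitrary and only for illustration):

probs <- predict(risk_model_backward, newdata = test_loan, type = "response")
for (t in c(0.10, 0.15, 0.20, 0.30, 0.50)) {
  pred <- factor(ifelse(probs > t, 1, 0), levels = c(0, 1))
  cm   <- table(Prediction = pred, Reference = test_loan$not.fully.paid)
  sens <- cm["1", "1"] / sum(cm[, "1"])   # TP / (TP + FN)
  acc  <- sum(diag(cm)) / sum(cm)
  cat(sprintf("threshold = %.2f | sensitivity = %.3f | accuracy = %.3f\n", t, sens, acc))
}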

Let’s try to make another model that might have not only good sensitivity but also good accuracy.

KNN (K-Nearest Neighbor)

head(loan)
length(unique(loan$pub.rec))
## [1] 6

Let’s select only the numeric columns, because KNN cannot work with categorical data.

loan_knn <- loan %>% 
  select(-credit.policy, -purpose)
set.seed(100)
insample <- sample(nrow(loan_knn), nrow(loan_knn) * 0.8)
train_loank <- loan_knn[insample, ]
test_loank <- loan_knn[-insample, ]
head(train_loank)

Because the columns are not on the same scale, we need to normalize the data.

normalize <- function(x){
  return ( 
    (x - min(x))/(max(x) - min(x)) 
  )
}
train_normal <- data.frame(lapply(train_loank[, -12], normalize))
test_normal <- data.frame(lapply(test_loank[, -12], normalize))

head(train_normal)
head(test_normal)
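
One caveat with the code above is that the test set is normalized with its own min and max. A variant that reuses the training-set min/max, so that no information from the test set leaks into the scaling, could look like this (a minimal sketch, not used for the results below):

# min/max computed on the training predictors only
train_min <- sapply(train_loank[, -12], min)
train_max <- sapply(train_loank[, -12], max)

normalize_with <- function(x, lo, hi) (x - lo) / (hi - lo)

train_normal2 <- as.data.frame(Map(normalize_with, train_loank[, -12], train_min, train_max))
test_normal2  <- as.data.frame(Map(normalize_with, test_loank[, -12],  train_min, train_max))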

One way to choose the value of K is to set it to the square root of the number of training rows.

k <- sqrt(nrow(train_normal))
# 87.5
k <- 87
library(class)

pred_knn <- knn(train = train_normal,
                test = test_normal,
                cl = train_loank$not.fully.paid,
                k = 40) # note: K = 40 is used here rather than the square-root value of 87
confusionMatrix(data = pred_knn,
                reference = as.factor(test_loank$not.fully.paid),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1611  304
##          1    1    0
##                                           
##                Accuracy : 0.8408          
##                  95% CI : (0.8237, 0.8569)
##     No Information Rate : 0.8413          
##     P-Value [Acc > NIR] : 0.5402          
##                                           
##                   Kappa : -0.001          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000000       
##             Specificity : 0.9993797       
##          Pos Pred Value : 0.0000000       
##          Neg Pred Value : 0.8412533       
##              Prevalence : 0.1586639       
##          Detection Rate : 0.0000000       
##    Detection Prevalence : 0.0005219       
##       Balanced Accuracy : 0.4996898       
##                                           
##        'Positive' Class : 1               
## 

The KNN model has good accuracy but very bad sensitivity. Adjusting the K value didn’t give any improvement. So logistic regression gives the better model here. But we know that the logistic regression model isn’t really good at prediction either. There is still one way to improve it: balancing the data between the positive and negative classes.
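
For reference, trying several K values can be automated with a short loop; this is a minimal sketch that simply repeats the experiment above for a few arbitrary K values, which is one way to check the claim that changing K does not help:

library(class)
for (k_try in c(5L, 21L, 41L, 87L)) {
  pred_k <- knn(train = train_normal, test = test_normal,
                cl = train_loank$not.fully.paid, k = k_try)
  cm   <- table(Prediction = pred_k, Reference = test_loank$not.fully.paid)
  sens <- cm["1", "1"] / sum(cm[, "1"])
  acc  <- sum(diag(cm)) / sum(cm)
  cat(sprintf("k = %d | accuracy = %.3f | sensitivity = %.3f\n", k_try, acc, sens))
}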

But before that, let’s measure the logistic regression model’s performance using other methods (ROC & AUC).

ROC & AUC

library(ROCR)
## Warning: package 'ROCR' was built under R version 4.2.2
pred_prob <- predict(risk_model_backward, newdata = test_loan, type = "response")

pred_prob <- data.frame(pred_prob)
roc_pred <- prediction(predictions = pred_prob[,1], labels = test_loan$not.fully.paid)

plot(performance(prediction.obj = roc_pred, measure = "tpr", x.measure = "fpr"))
abline(0,1, lty= 2)

loan_auc <- performance(prediction.obj = roc_pred, measure = "auc")


loan_auc@y.values
## [[1]]
## [1] 0.6745176

The area under the curve that we get from the model predictions is 0.67. This confirms that the model is still not good enough.

Optimizing Model by Balancing The Data

Down Sampling

I chose downsampling because the minority class has more than 1,200 observations, which should be enough data to train on.

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
library(caret)
loan_train_down <- downSample(x = train_loan %>% select(-not.fully.paid),
                              y = train_loan$not.fully.paid,
                              list = F,
                              yname = "not.fully.paid") # target column name


head(loan_train_down)
prop.table(table(loan_train_down$not.fully.paid))
## 
##   0   1 
## 0.5 0.5
risk_model_all_new <- glm(formula = not.fully.paid ~ ., data = loan_train_down, family = "binomial")

summary(risk_model_all_new)
## 
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = loan_train_down)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.658  -1.080  -0.167   1.101   1.873  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                1.021e+01  2.007e+00   5.086 3.66e-07 ***
## credit.policy             -2.809e-01  1.342e-01  -2.093 0.036350 *  
## purposecredit_card        -6.017e-01  1.565e-01  -3.844 0.000121 ***
## purposedebt_consolidation -3.470e-01  1.145e-01  -3.032 0.002429 ** 
## purposeeducational        -7.075e-02  2.376e-01  -0.298 0.765932    
## purposehome_improvement    8.664e-02  1.956e-01   0.443 0.657796    
## purposemajor_purchase     -4.077e-01  2.308e-01  -1.767 0.077278 .  
## purposesmall_business      4.416e-01  1.855e-01   2.381 0.017287 *  
## int.rate                   3.829e+00  2.672e+00   1.433 0.151911    
## installment                1.216e-03  2.644e-04   4.600 4.23e-06 ***
## log.annual.inc            -4.802e-01  8.922e-02  -5.382 7.36e-08 ***
## dti                       -1.136e-03  6.905e-03  -0.164 0.869359    
## fico                      -8.336e-03  2.165e-03  -3.849 0.000118 ***
## days.with.cr.line         -2.715e-06  2.071e-05  -0.131 0.895687    
## revol.bal                  4.089e-06  1.653e-06   2.473 0.013381 *  
## revol.util                 3.303e-03  1.897e-03   1.741 0.081669 .  
## inq.last.6mths             1.051e-01  2.461e-02   4.272 1.93e-05 ***
## delinq.2yrs               -3.627e-02  8.800e-02  -0.412 0.680181    
## pub.rec                    3.918e-01  1.645e-01   2.381 0.017253 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3407.5  on 2457  degrees of freedom
## Residual deviance: 3120.9  on 2439  degrees of freedom
## AIC: 3158.9
## 
## Number of Fisher Scoring iterations: 4
risk_model_backward_new <- step(object=risk_model_all_new,
                       direction="backward", 
                       trace = F)

summary(risk_model_backward_new)
## 
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + int.rate + 
##     installment + log.annual.inc + fico + revol.bal + revol.util + 
##     inq.last.6mths + pub.rec, family = "binomial", data = loan_train_down)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6528  -1.0783  -0.1676   1.1008   1.8747  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                1.013e+01  1.904e+00   5.321 1.03e-07 ***
## credit.policy             -2.818e-01  1.339e-01  -2.104 0.035358 *  
## purposecredit_card        -6.056e-01  1.560e-01  -3.883 0.000103 ***
## purposedebt_consolidation -3.488e-01  1.135e-01  -3.072 0.002123 ** 
## purposeeducational        -7.218e-02  2.374e-01  -0.304 0.761122    
## purposehome_improvement    8.765e-02  1.953e-01   0.449 0.653510    
## purposemajor_purchase     -4.084e-01  2.306e-01  -1.771 0.076481 .  
## purposesmall_business      4.424e-01  1.854e-01   2.386 0.017020 *  
## int.rate                   3.776e+00  2.669e+00   1.415 0.157083    
## installment                1.215e-03  2.641e-04   4.600 4.22e-06 ***
## log.annual.inc            -4.826e-01  8.626e-02  -5.595 2.21e-08 ***
## fico                      -8.222e-03  2.062e-03  -3.987 6.68e-05 ***
## revol.bal                  4.013e-06  1.595e-06   2.516 0.011857 *  
## revol.util                 3.358e-03  1.826e-03   1.839 0.065879 .  
## inq.last.6mths             1.053e-01  2.448e-02   4.302 1.69e-05 ***
## pub.rec                    3.933e-01  1.631e-01   2.412 0.015875 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3407.5  on 2457  degrees of freedom
## Residual deviance: 3121.2  on 2442  degrees of freedom
## AIC: 3153.2
## 
## Number of Fisher Scoring iterations: 4
prediction_prop_new <- predict(risk_model_backward_new, newdata = test_loan, type = "response")
threshold <- 0.3
prediction_prop_new <- ifelse(prediction_prop_new > threshold, 1, 0)
library(caret)

confusionMatrix(data = as.factor(prediction_prop_new),
                reference = as.factor(test_loan$not.fully.paid),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  313   23
##          1 1299  281
##                                           
##                Accuracy : 0.31            
##                  95% CI : (0.2894, 0.3313)
##     No Information Rate : 0.8413          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0438          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9243          
##             Specificity : 0.1942          
##          Pos Pred Value : 0.1778          
##          Neg Pred Value : 0.9315          
##              Prevalence : 0.1587          
##          Detection Rate : 0.1467          
##    Detection Prevalence : 0.8246          
##       Balanced Accuracy : 0.5593          
##                                           
##        'Positive' Class : 1               
## 
pred_prob_new <- predict(risk_model_backward_new, newdata = test_loan, type = "response")

pred_prob_new <- data.frame(pred_prob_new)
roc_pred_new <- prediction(predictions = pred_prob_new[,1], labels = test_loan$not.fully.paid)

plot(performance(prediction.obj = roc_pred_new, measure = "tpr", x.measure = "fpr"))
abline(0,1, lty= 2)

loan_auc_new <- performance(prediction.obj = roc_pred_new, measure = "auc")


loan_auc_new@y.values
## [[1]]
## [1] 0.675795

The AUC after downsampling the data is 0.68. This is not any better than the previous model. Let’s try upsampling the data.

Up Sampling

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
library(caret)
loan_train_up <- upSample(x = train_loan %>% select(-not.fully.paid),
                              y = train_loan$not.fully.paid,
                              list = F,
                              yname = "not.fully.paid") # target column name


head(loan_train_up)
table(loan_train_up$not.fully.paid)
## 
##    0    1 
## 6433 6433
risk_model_all_new1 <- glm(formula = not.fully.paid ~ ., data = loan_train_up, family = "binomial")

summary(risk_model_all_new1)
## 
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = loan_train_up)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5768  -1.0815  -0.1357   1.0942   1.8654  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                1.057e+01  8.601e-01  12.284  < 2e-16 ***
## credit.policy             -1.769e-01  5.834e-02  -3.033 0.002424 ** 
## purposecredit_card        -6.288e-01  6.809e-02  -9.236  < 2e-16 ***
## purposedebt_consolidation -3.914e-01  4.980e-02  -7.860 3.85e-15 ***
## purposeeducational        -8.997e-02  1.053e-01  -0.854 0.392906    
## purposehome_improvement   -5.522e-02  8.368e-02  -0.660 0.509364    
## purposemajor_purchase     -3.966e-01  1.024e-01  -3.872 0.000108 ***
## purposesmall_business      3.666e-01  8.150e-02   4.498 6.86e-06 ***
## int.rate                   3.327e+00  1.166e+00   2.854 0.004317 ** 
## installment                1.236e-03  1.145e-04  10.801  < 2e-16 ***
## log.annual.inc            -4.664e-01  3.888e-02 -11.996  < 2e-16 ***
## dti                       -2.469e-03  2.980e-03  -0.829 0.407283    
## fico                      -9.064e-03  9.307e-04  -9.740  < 2e-16 ***
## days.with.cr.line          2.980e-06  9.039e-06   0.330 0.741641    
## revol.bal                  5.489e-06  7.469e-07   7.349 1.99e-13 ***
## revol.util                 3.140e-03  8.265e-04   3.799 0.000145 ***
## inq.last.6mths             1.095e-01  1.067e-02  10.262  < 2e-16 ***
## delinq.2yrs               -6.624e-02  3.527e-02  -1.878 0.060349 .  
## pub.rec                    2.885e-01  7.078e-02   4.075 4.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17836  on 12865  degrees of freedom
## Residual deviance: 16380  on 12847  degrees of freedom
## AIC: 16418
## 
## Number of Fisher Scoring iterations: 4
risk_model_backward_new1 <- step(object=risk_model_all_new1,
                       direction="backward", 
                       trace = F)

summary(risk_model_backward_new1)
## 
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + int.rate + 
##     installment + log.annual.inc + fico + revol.bal + revol.util + 
##     inq.last.6mths + delinq.2yrs + pub.rec, family = "binomial", 
##     data = loan_train_up)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5576  -1.0826  -0.1351   1.0976   1.8577  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                1.042e+01  8.247e-01  12.640  < 2e-16 ***
## credit.policy             -1.761e-01  5.816e-02  -3.028 0.002461 ** 
## purposecredit_card        -6.332e-01  6.772e-02  -9.351  < 2e-16 ***
## purposedebt_consolidation -3.959e-01  4.937e-02  -8.020 1.06e-15 ***
## purposeeducational        -9.087e-02  1.053e-01  -0.863 0.388071    
## purposehome_improvement   -5.349e-02  8.359e-02  -0.640 0.522270    
## purposemajor_purchase     -3.961e-01  1.024e-01  -3.868 0.000110 ***
## purposesmall_business      3.684e-01  8.146e-02   4.522 6.11e-06 ***
## int.rate                   3.290e+00  1.164e+00   2.827 0.004695 ** 
## installment                1.236e-03  1.144e-04  10.804  < 2e-16 ***
## log.annual.inc            -4.600e-01  3.772e-02 -12.193  < 2e-16 ***
## fico                      -8.973e-03  9.024e-04  -9.943  < 2e-16 ***
## revol.bal                  5.414e-06  7.230e-07   7.489 6.96e-14 ***
## revol.util                 3.060e-03  8.113e-04   3.772 0.000162 ***
## inq.last.6mths             1.094e-01  1.063e-02  10.286  < 2e-16 ***
## delinq.2yrs               -6.325e-02  3.468e-02  -1.824 0.068198 .  
## pub.rec                    2.913e-01  7.031e-02   4.143 3.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17836  on 12865  degrees of freedom
## Residual deviance: 16380  on 12849  degrees of freedom
## AIC: 16414
## 
## Number of Fisher Scoring iterations: 4
prediction_prop_new1 <- predict(risk_model_backward_new1, newdata = test_loan, type = "response")
threshold <- 0.4
prediction_prop_new1 <- ifelse(prediction_prop_new1 > threshold, 1, 0)
library(caret)

confusionMatrix(data = as.factor(prediction_prop_new1),
                reference = as.factor(test_loan$not.fully.paid),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 657  63
##          1 955 241
##                                           
##                Accuracy : 0.4687          
##                  95% CI : (0.4461, 0.4913)
##     No Information Rate : 0.8413          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0915          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7928          
##             Specificity : 0.4076          
##          Pos Pred Value : 0.2015          
##          Neg Pred Value : 0.9125          
##              Prevalence : 0.1587          
##          Detection Rate : 0.1258          
##    Detection Prevalence : 0.6242          
##       Balanced Accuracy : 0.6002          
##                                           
##        'Positive' Class : 1               
## 
pred_prob_new1 <- predict(risk_model_backward_new1, newdata = test_loan, type = "response")

pred_prob_new1 <- data.frame(pred_prob_new1)
roc_pred_new1 <- prediction(predictions = pred_prob_new1[,1], labels = test_loan$not.fully.paid)

plot(performance(prediction.obj = roc_pred_new1, measure = "tpr", x.measure = "fpr"))
abline(0,1, lty= 2)

loan_auc_new1 <- performance(prediction.obj = roc_pred_new1, measure = "auc")


loan_auc_new1@y.values
## [[1]]
## [1] 0.6731157

The AUC that we got is 0.67. This is almost the same as the previous models. I think we need to use a better algorithm to tackle this problem, such as naive Bayes, a decision tree, or even a random forest.
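
As a possible next step, a random forest via the randomForest package could be a starting point (a minimal sketch, assuming the package is installed; everything is left at its default parameters, and the class imbalance would likely still need to be handled, e.g. with downSample as above):

library(randomForest)
set.seed(100)
rf_model <- randomForest(not.fully.paid ~ ., data = train_loan, ntree = 500)
pred_rf  <- predict(rf_model, newdata = test_loan)
confusionMatrix(data = pred_rf,
                reference = test_loan$not.fully.paid,
                positive = "1")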