Introduction

The data analyzed in this file were obtained from the following link: https://www.kaggle.com/datasets/saramah/loan-data/data

I have attached the original description of the data - "publicly available data from LendingClub.com. Lending Club connects people who need money (borrowers) with people who have money (investors). Hopefully, as an investor you would want to invest in people who showed a profile of having a high probability of paying you back.

We will use lending data from 2007-2010 and be trying to classify and predict whether or not the borrower paid back their loan in full… "

The definition of each variable is also available via the link.

Load Libraries

library(caret)
library(ggplot2)
library(tidyr)
library(gridExtra)
# Read data from: https://www.kaggle.com/datasets/saramah/loan-data/data
# NOTE(review): the absolute path below is machine-specific. It is kept so the
# working directory is unchanged, but it must be edited on any other machine.
setwd("/Users/robertvargas/Documents/Projects/R/Loan_data")
# read.csv() already returns a data.frame, so no extra conversion is needed.
loan_data <- read.csv("loan_data.csv")
# Visualize the class balance of the target. The column is `not.fully.paid`,
# so 1 marks a loan that was NOT fully paid and 0 a loan that was fully paid.
# NOTE(review): the earlier "about 80%" remark presumably describes the larger
# class (0, fully paid) -- confirm against table(loan_data$not.fully.paid).
barplot(
  table(loan_data$not.fully.paid),
  main = "Histogram of Borrowers Who Paid Back Loan",
  xlab = "0 - Fully Paid, 1 - Not Fully Paid",
  ylab = "Frequency",
  col = "lightblue"
)

# List of columns available in dataset
# Looping over the names themselves (instead of 1:ncol(...)) prints nothing
# for a zero-column data frame rather than indexing with c(1, 0).
for (column_name in colnames(loan_data)) {
  print(column_name)
}
## [1] "credit.policy"
## [1] "purpose"
## [1] "int.rate"
## [1] "installment"
## [1] "log.annual.inc"
## [1] "dti"
## [1] "fico"
## [1] "days.with.cr.line"
## [1] "revol.bal"
## [1] "revol.util"
## [1] "inq.last.6mths"
## [1] "delinq.2yrs"
## [1] "pub.rec"
## [1] "not.fully.paid"

Creating the Model

There were no missing or null values found in the data. Below is a detailed, step-by-step description of my process for creating a binomial (logistic) regression model. This approach was heavily influenced by the “Using R for Analytics” course I completed during my graduate studies at Purdue, taught by Professor Matthew Lanham.

# Move the dependent variable (the last column) to the front and name it "y"
loan_data <- loan_data[c(ncol(loan_data), seq_len(ncol(loan_data) - 1))]
names(loan_data)[1] <- "y"

# One-hot encodings: expand the predictors into dummy columns, then strip the
# dots from the generated column names
dummies <- dummyVars(y ~ ., data = loan_data)
ex <- data.frame(predict(dummies, newdata = loan_data))
names(ex) <- gsub("\\.", "", names(ex))
# Put the response back in front of the encoded predictors
d <- data.frame(y = loan_data$y, ex, check.names = FALSE)
# The encoder and the intermediate frame are not needed past this point
rm(dummies, ex)

# Find correlated variables: pairwise correlations among the predictors
# (every column of d except the first, y)
descrCor <- cor(d[, -1])
# Count the predictor pairs whose absolute correlation exceeds 0.85
highCorr <- sum(abs(descrCor[upper.tri(descrCor)]) > 0.85)
# Distribution of the pairwise correlations; no highly correlated variables
summary(descrCor[upper.tri(descrCor)])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.714821 -0.069111 -0.009318 -0.007477  0.065004  0.464837
# Finding linear combos
# Set y aside and put a column of ones in its place before calling
# findLinearCombos() on the resulting frame.
y <- d$y
d <- cbind(rep(1, nrow(d)), d[2:ncol(d)])
names(d)[1] <- "ones"
comboInfo <- findLinearCombos(d)
# Guard the removal: when there is nothing to remove, `-comboInfo$remove` is a
# zero-length index and d[, integer(0)] would silently drop EVERY column.
if (length(comboInfo$remove) > 0) {
  d <- d[, -comboInfo$remove, drop = FALSE]
}
# Drop the helper column by name (not by position) and restore y in front
d <- d[, names(d) != "ones", drop = FALSE]
d <- cbind(y, d)
rm(y, comboInfo)

# Find predictors with zero variance.
# Only the zeroVar flag is used below, so columns flagged merely as near-zero
# variance (the nzv flag) are kept. The leading TRUE always keeps column 1 (y).
nzv <- nearZeroVar(d, saveMetrics = TRUE)
# drop = FALSE keeps d a data frame even if a single column survives
d <- d[, c(TRUE, !nzv$zeroVar[-1]), drop = FALSE]

# Pre process (opted out): no transformation is applied in this block; the
# histograms of the continuous predictors are only drawn for inspection.
continuous_vars <- c(
  "intrate", "installment", "logannualinc", "dti", "fico",
  "dayswithcrline", "revolbal", "revolutil", "inqlast6mths", "pubrec"
)
continuous_data <- d[, continuous_vars]
# Retained as in the original; it is not populated in this section
plot_list <- list()
# Loop through the continuous variables to create a histogram for each
for (var in continuous_vars) {
  # .data[[var]] looks the column up by its string name; it replaces the
  # deprecated aes_string(). xlab(var) keeps the axis label readable.
  p <- ggplot(continuous_data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
    ggtitle(paste("Histogram of", var)) +
    xlab(var) +
    ylab("Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}

# Creating data partition
set.seed(1234)
# y was taken from the `not.fully.paid` column, so 0 means the loan was fully
# paid and 1 means it was not. The labels follow that meaning; the level order
# (0 first, 1 second) is unchanged.
d$y <- factor(
  d$y,
  levels = c(0, 1),
  labels = c("FullyPaid", "NotFullyPaid")
)
# 70% of the rows go to the training set, sampled on y
inTrain <- createDataPartition(y = d$y, p = 0.7, list = FALSE)
train <- d[inTrain, ]
test <- d[-inTrain, ]

# Train control: 3-fold cross-validation with class probabilities enabled and
# twoClassSummary as the resampling summary function.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)
# Binomial glm fitted through caret on the training set, with "ROC" as the
# selection metric
model <- train(
  y ~ .,
  data = train,
  method = "glm",
  trControl = ctrl,
  family = "binomial",
  metric = "ROC"
)

summary(model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5784  -0.6179  -0.4915  -0.3584   2.6075  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                8.130e+00  1.560e+00   5.210 1.89e-07 ***
## creditpolicy              -2.966e-01  1.007e-01  -2.944 0.003235 ** 
## purposeall_other          -4.976e-01  1.404e-01  -3.543 0.000396 ***
## purposecredit_card        -1.123e+00  1.664e-01  -6.746 1.52e-11 ***
## purposedebt_consolidation -8.427e-01  1.329e-01  -6.343 2.25e-10 ***
## purposeeducational        -4.549e-01  2.078e-01  -2.189 0.028569 *  
## purposehome_improvement   -4.150e-01  1.781e-01  -2.330 0.019811 *  
## purposemajor_purchase     -7.496e-01  2.154e-01  -3.479 0.000503 ***
## intrate                    4.204e+00  2.054e+00   2.047 0.040663 *  
## installment                1.129e-03  2.080e-04   5.430 5.64e-08 ***
## logannualinc              -4.146e-01  7.164e-02  -5.788 7.12e-09 ***
## dti                        2.838e-03  5.439e-03   0.522 0.601806    
## fico                      -8.160e-03  1.702e-03  -4.794 1.63e-06 ***
## dayswithcrline             2.366e-05  1.574e-05   1.504 0.132684    
## revolbal                   3.741e-06  1.171e-06   3.194 0.001402 ** 
## revolutil                  2.550e-03  1.524e-03   1.674 0.094165 .  
## inqlast6mths               7.442e-02  1.537e-02   4.843 1.28e-06 ***
## delinq2yrs                -9.700e-02  6.585e-02  -1.473 0.140768    
## pubrec                     3.288e-01  1.099e-01   2.992 0.002774 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5900.3  on 6705  degrees of freedom
## Residual deviance: 5475.2  on 6687  degrees of freedom
## AIC: 5513.2
## 
## Number of Fisher Scoring iterations: 5
# dti, dayswithcrline, revolutil, and delinq2yrs are not statistically significant at the 5% level

Testing and Analyzing Model

# Class predictions (type = "raw") on the train and test sets, respectively
logit_tr <- predict(model, newdata = train, type = "raw")
logit_te <- predict(model, newdata = test, type = "raw")
# Confusion matrices comparing the predictions with the observed classes
cm <- confusionMatrix(data = logit_tr, reference = train$y)
testcm <- confusionMatrix(data = logit_te, reference = test$y)

## [1] "Training Accuracy:  0.840888756337608"

## [1] "Test Accuracy:  0.841225626740947"