Objective

For this project we will explore publicly available data from LendingClub.com. Lending Club connects people who need money (borrowers) with people who have money (investors). As an investor, you would want to lend to borrowers whose profile indicates a high probability of paying you back. We will try to build a model that helps predict this.

Libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caTools)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3

Data

# Read the loan records from the working directory, then show each column's
# type and first few values.
ldata <- read.csv(file = "loan_data.csv")
str(ldata)
## 'data.frame':    9578 obs. of  14 variables:
##  $ credit.policy    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purpose          : chr  "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : num  19.5 14.3 11.6 8.1 15 ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : num  52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pub.rec          : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...
# Per-column overview: min/quartiles/mean/max for numeric columns, and
# length/class/mode for the character column `purpose`.
summary(ldata)
##  credit.policy     purpose             int.rate       installment    
##  Min.   :0.000   Length:9578        Min.   :0.0600   Min.   : 15.67  
##  1st Qu.:1.000   Class :character   1st Qu.:0.1039   1st Qu.:163.77  
##  Median :1.000   Mode  :character   Median :0.1221   Median :268.95  
##  Mean   :0.805                      Mean   :0.1226   Mean   :319.09  
##  3rd Qu.:1.000                      3rd Qu.:0.1407   3rd Qu.:432.76  
##  Max.   :1.000                      Max.   :0.2164   Max.   :940.14  
##  log.annual.inc        dti              fico       days.with.cr.line
##  Min.   : 7.548   Min.   : 0.000   Min.   :612.0   Min.   :  179    
##  1st Qu.:10.558   1st Qu.: 7.213   1st Qu.:682.0   1st Qu.: 2820    
##  Median :10.929   Median :12.665   Median :707.0   Median : 4140    
##  Mean   :10.932   Mean   :12.607   Mean   :710.8   Mean   : 4561    
##  3rd Qu.:11.291   3rd Qu.:17.950   3rd Qu.:737.0   3rd Qu.: 5730    
##  Max.   :14.528   Max.   :29.960   Max.   :827.0   Max.   :17640    
##    revol.bal         revol.util    inq.last.6mths    delinq.2yrs     
##  Min.   :      0   Min.   :  0.0   Min.   : 0.000   Min.   : 0.0000  
##  1st Qu.:   3187   1st Qu.: 22.6   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Median :   8596   Median : 46.3   Median : 1.000   Median : 0.0000  
##  Mean   :  16914   Mean   : 46.8   Mean   : 1.577   Mean   : 0.1637  
##  3rd Qu.:  18250   3rd Qu.: 70.9   3rd Qu.: 2.000   3rd Qu.: 0.0000  
##  Max.   :1207359   Max.   :119.0   Max.   :33.000   Max.   :13.0000  
##     pub.rec        not.fully.paid  
##  Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000  
##  Mean   :0.06212   Mean   :0.1601  
##  3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :5.00000   Max.   :1.0000

Convert data to categorical values

# Recode the discrete columns as factors in a single pass so that plotting
# and svm() treat them as categories rather than as numbers.
# `purpose` (read in as character) is converted too, so its set of levels is
# fixed once on the full data before any train/test split is taken.
ldata <- ldata %>%
  mutate(
    across(
      c(
        purpose, inq.last.6mths, delinq.2yrs, pub.rec,
        not.fully.paid, credit.policy
      ),
      as.factor
    )
  )

EDA

Let’s use ggplot2 to visualize the data!

Create a histogram of fico scores colored by not.fully.paid

# Histogram of FICO scores, filled by repayment outcome.
# The fico values shown by str()/summary() all end in 2 or 7, which suggests
# scores sit on a 5-point grid; an explicit 5-point bin width then gives one
# score value per bin and avoids the arbitrary 30-bin default (and the
# `stat_bin()` message that comes with it).
ggplot(ldata, aes(x = fico)) +
  geom_histogram(
    aes(fill = not.fully.paid),
    binwidth = 5, color = "black", alpha = 0.7
  )

# Loan counts per purpose, with side-by-side bars for each repayment outcome.
# The x-axis labels are rotated 90 degrees and right-justified.
ggplot(ldata, aes(x = purpose, fill = not.fully.paid)) +
  geom_bar(position = "dodge", colour = "black", alpha = 0.7) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Scatterplot of fico score versus int.rate.

# Interest rate (x) against FICO score (y), one point per loan, coloured by
# repayment outcome; points are semi-transparent to reduce overplotting.
ggplot(ldata, aes(x = int.rate, y = fico, colour = not.fully.paid)) +
  geom_point(alpha = 0.7)

Building the Model

Split train and test datasets

# Fix the RNG seed so the random split below is reproducible.
set.seed(101)

# Logical mask over the rows of ldata: TRUE marks the 70% training share,
# drawn with respect to the not.fully.paid labels.
spl <- sample.split(ldata$not.fully.paid, SplitRatio = 0.7)

# Rows flagged TRUE form the training set; the remainder is held out.
train <- ldata[spl, ]
test <- ldata[!spl, ]

Training the model using svm() function

# Fit an SVM classifier for not.fully.paid on every other column of train,
# using svm()'s default settings.
model <- svm(formula = not.fully.paid ~ ., data = train)

Model summary

# Print the fitted model's call, SVM type, kernel, cost, and the number of
# support vectors per class.
summary(model)
## 
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  2849
## 
##  ( 1776 1073 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Prediction of new values

# Score the held-out rows. The response is dropped by name rather than by
# column position, so the call keeps working if columns are added or
# reordered.
predicted.values <- predict(model, subset(test, select = -not.fully.paid))

# Confusion matrix: rows are predicted classes, columns are the true labels.
table(predicted.values,test$not.fully.paid)
##                 
## predicted.values    0    1
##                0 2413  460
##                1    0    0

Tuning the model

# Grid-search cost and gamma for the radial-kernel SVM on the training set.
tune.results <- tune(svm, train.x = not.fully.paid ~ ., data = train,
                     kernel = "radial",
                     ranges = list(cost = c(1, 10), gamma = c(0.1, 1)))

# Take the model that tune() refit with the winning parameter combination,
# instead of discarding the search result and hard-coding the values.
# NOTE(review): this step previously hard-coded cost = 10, gamma = 0.1,
# presumably the combination the search selected; confirm with
# summary(tune.results).
model <- tune.results$best.model

# Score the held-out rows (response dropped by name, not by column position)
# and tabulate predictions (rows) against the true labels (columns).
predicted.values <- predict(model, subset(test, select = -not.fully.paid))
table(predicted.values,test$not.fully.paid)
##                 
## predicted.values    0    1
##                0 2350  425
##                1   63   35