For this project we will be exploring publicly available data from LendingClub.com. Lending Club connects people who need money (borrowers) with people who have money (investors). As an investor, you would want to invest in borrowers whose profile indicates a high probability of paying you back. We will try to build a model that helps predict this.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
# Load the LendingClub loan records and inspect the column types.
ldata <- read.csv(file = "loan_data.csv")
str(ldata)
## 'data.frame': 9578 obs. of 14 variables:
## $ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purpose : chr "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 19.5 14.3 11.6 8.1 15 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : int 0 0 0 0 1 0 0 0 0 0 ...
## $ pub.rec : int 0 0 0 0 0 0 1 0 0 0 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
# Per-column summary statistics for the raw data.
summary(ldata)
## credit.policy purpose int.rate installment
## Min. :0.000 Length:9578 Min. :0.0600 Min. : 15.67
## 1st Qu.:1.000 Class :character 1st Qu.:0.1039 1st Qu.:163.77
## Median :1.000 Mode :character Median :0.1221 Median :268.95
## Mean :0.805 Mean :0.1226 Mean :319.09
## 3rd Qu.:1.000 3rd Qu.:0.1407 3rd Qu.:432.76
## Max. :1.000 Max. :0.2164 Max. :940.14
## log.annual.inc dti fico days.with.cr.line
## Min. : 7.548 Min. : 0.000 Min. :612.0 Min. : 179
## 1st Qu.:10.558 1st Qu.: 7.213 1st Qu.:682.0 1st Qu.: 2820
## Median :10.929 Median :12.665 Median :707.0 Median : 4140
## Mean :10.932 Mean :12.607 Mean :710.8 Mean : 4561
## 3rd Qu.:11.291 3rd Qu.:17.950 3rd Qu.:737.0 3rd Qu.: 5730
## Max. :14.528 Max. :29.960 Max. :827.0 Max. :17640
## revol.bal revol.util inq.last.6mths delinq.2yrs
## Min. : 0 Min. : 0.0 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3187 1st Qu.: 22.6 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 8596 Median : 46.3 Median : 1.000 Median : 0.0000
## Mean : 16914 Mean : 46.8 Mean : 1.577 Mean : 0.1637
## 3rd Qu.: 18250 3rd Qu.: 70.9 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :1207359 Max. :119.0 Max. :33.000 Max. :13.0000
## pub.rec not.fully.paid
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.06212 Mean :0.1601
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :5.00000 Max. :1.0000
Convert data to categorical values
# Recode the discrete count/flag columns, the response and the loan purpose
# as factors in a single mutate() call.
# purpose is included so that its level set is fixed once on the full data;
# left as character, its categories would be re-derived from whatever rows
# are passed to predict(), which can fail when a category is absent.
ldata <- ldata %>%
  mutate(
    purpose = as.factor(purpose),
    inq.last.6mths = as.factor(inq.last.6mths),
    delinq.2yrs = as.factor(delinq.2yrs),
    pub.rec = as.factor(pub.rec),
    not.fully.paid = as.factor(not.fully.paid),
    credit.policy = as.factor(credit.policy)
  )
Let’s use ggplot2 to visualize the data!
Create a histogram of fico scores colored by not.fully.paid
# Histogram of fico, with bars filled by not.fully.paid.
ggplot(data = ldata, mapping = aes(x = fico, fill = not.fully.paid)) +
  geom_histogram(colour = "black", alpha = 0.7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Loan counts per purpose, with the not.fully.paid levels drawn side by side.
ggplot(data = ldata, mapping = aes(x = purpose, fill = not.fully.paid)) +
  geom_bar(position = "dodge", colour = "black", alpha = 0.7) +
  # Rotate the x-axis labels so the purpose names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Scatterplot of fico score versus int.rate.
# fico against int.rate, one point per loan, coloured by not.fully.paid.
ggplot(
  data = ldata,
  mapping = aes(x = int.rate, y = fico, colour = not.fully.paid)
) +
  geom_point(alpha = 0.7)
## Building the Model
Split train and test datasets
# Fix the RNG seed so the split below is reproducible.
set.seed(101)
# spl is a logical vector over the rows of ldata: TRUE rows (a 0.7 share)
# go to the training set, the remaining rows to the test set.
spl <- sample.split(ldata$not.fully.paid, SplitRatio = 0.7)
train <- ldata[spl, ]
test <- ldata[!spl, ]
Training the model using svm() function
# Fit an SVM for not.fully.paid against every other column of train,
# leaving kernel, cost and gamma at svm()'s defaults.
model <- svm(not.fully.paid ~ ., data = train)
Model summary
# Show the fitted model's SVM type, kernel, cost and support-vector counts.
summary(model)
##
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2849
##
## ( 1776 1073 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
Prediction of new values
# Predict on the held-out rows. The response is dropped by name rather than
# by column position (previously test[1:13]) so the call keeps working if
# columns are added or reordered.
predicted.values <- predict(model, subset(test, select = -not.fully.paid))
# Confusion matrix: rows are predicted labels, columns are actual labels.
table(predicted.values, test$not.fully.paid)
##
## predicted.values 0 1
## 0 2413 460
## 1 0 0
# Grid-search cost and gamma for the radial-kernel SVM on the training set.
# tune() resamples the data at random, so seed the RNG to make the selected
# parameters reproducible.
set.seed(101)
tune.results <- tune(svm, train.x = not.fully.paid ~ ., data = train,
                     kernel = "radial",
                     ranges = list(cost = c(1, 10), gamma = c(0.1, 1)))

# Refit with the combination the search selected instead of hard-coding
# values, so the model always reflects the tuning result.
best.params <- tune.results$best.parameters
model <- svm(not.fully.paid ~ ., data = train,
             cost = best.params$cost, gamma = best.params$gamma)

# Drop the response by name (not by column position) before predicting.
predicted.values <- predict(model, subset(test, select = -not.fully.paid))
# Confusion matrix: rows are predicted labels, columns are actual labels.
table(predicted.values, test$not.fully.paid)
##
## predicted.values 0 1
## 0 2350 425
## 1 63 35