In this project, we will use Support Vector Machines (SVMs) to build a model that helps predict the default risk of people who borrow money.

# Location of the loan data set (a raw CSV file hosted on GitHub).
url <- 'https://raw.githubusercontent.com/jerrytigerxu/SVM-Project/master/data/loan_data.csv'

# Read the CSV directly from the URL.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() no
# longer converts strings to factors by default, so without it `purpose` would
# be loaded as a character column instead of a factor.
loans <- read.csv(url, stringsAsFactors = TRUE)

# Show the structure (column names and types) of the loaded data frame.
str(loans)
## 'data.frame':    9578 obs. of  14 variables:
##  $ credit.policy    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purpose          : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : num  19.5 14.3 11.6 8.1 15 ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : num  52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pub.rec          : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...
# Print per-column summary statistics for the loans data frame.
summary(loans)
##  credit.policy                 purpose        int.rate     
##  Min.   :0.000   all_other         :2331   Min.   :0.0600  
##  1st Qu.:1.000   credit_card       :1262   1st Qu.:0.1039  
##  Median :1.000   debt_consolidation:3957   Median :0.1221  
##  Mean   :0.805   educational       : 343   Mean   :0.1226  
##  3rd Qu.:1.000   home_improvement  : 629   3rd Qu.:0.1407  
##  Max.   :1.000   major_purchase    : 437   Max.   :0.2164  
##                  small_business    : 619                   
##   installment     log.annual.inc        dti              fico      
##  Min.   : 15.67   Min.   : 7.548   Min.   : 0.000   Min.   :612.0  
##  1st Qu.:163.77   1st Qu.:10.558   1st Qu.: 7.213   1st Qu.:682.0  
##  Median :268.95   Median :10.929   Median :12.665   Median :707.0  
##  Mean   :319.09   Mean   :10.932   Mean   :12.607   Mean   :710.8  
##  3rd Qu.:432.76   3rd Qu.:11.291   3rd Qu.:17.950   3rd Qu.:737.0  
##  Max.   :940.14   Max.   :14.528   Max.   :29.960   Max.   :827.0  
##                                                                    
##  days.with.cr.line   revol.bal         revol.util    inq.last.6mths  
##  Min.   :  179     Min.   :      0   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.: 2820     1st Qu.:   3187   1st Qu.: 22.6   1st Qu.: 0.000  
##  Median : 4140     Median :   8596   Median : 46.3   Median : 1.000  
##  Mean   : 4561     Mean   :  16914   Mean   : 46.8   Mean   : 1.577  
##  3rd Qu.: 5730     3rd Qu.:  18250   3rd Qu.: 70.9   3rd Qu.: 2.000  
##  Max.   :17640     Max.   :1207359   Max.   :119.0   Max.   :33.000  
##                                                                      
##   delinq.2yrs         pub.rec        not.fully.paid  
##  Min.   : 0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median : 0.0000   Median :0.00000   Median :0.0000  
##  Mean   : 0.1637   Mean   :0.06212   Mean   :0.1601  
##  3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :13.0000   Max.   :5.00000   Max.   :1.0000  
## 
# Columns that are stored as integers but are treated as categorical from
# here on (including the response, not.fully.paid).
factor_cols <- c(
  "credit.policy",
  "inq.last.6mths",
  "delinq.2yrs",
  "pub.rec",
  "not.fully.paid"
)

# Convert each of those columns to a factor in place.
loans[factor_cols] <- lapply(loans[factor_cols], factor)

Exploratory Data Analysis (EDA)

Let’s use ggplot2 to visualize the data.

library(ggplot2)

# Histogram of FICO scores, with bars filled by repayment status.
p1 <- ggplot(loans, aes(x = fico)) +
  geom_histogram(
    aes(fill = not.fully.paid),
    color = 'black',
    bins = 40,
    alpha = 0.5
  )
p1 +
  scale_fill_manual(values = c('green', 'red')) +
  theme_bw()

# Bar chart of loan counts per purpose, dodged by repayment status.
# The x-axis labels are rotated 90 degrees.
p1 <- ggplot(loans, aes(x = factor(purpose))) +
  geom_bar(aes(fill = not.fully.paid), position = 'dodge')
p1 +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Scatter plot of FICO score against interest rate.
ggplot(loans, aes(int.rate, fico)) +
  geom_point() +
  theme_bw()

# Same scatter plot, with semi-transparent points coloured by repayment status.
ggplot(loans, aes(int.rate, fico)) +
  geom_point(aes(color = not.fully.paid), alpha = 0.3) +
  theme_bw()

Model Building

library(caTools)
library(e1071)

# Fix the RNG seed so the random train/test split is reproducible.
set.seed(101)

# Logical vector marking which rows go to the training set (split ratio 0.7,
# computed from the response column).
sp1 <- sample.split(loans$not.fully.paid, SplitRatio = 0.7)
train <- subset(loans, sp1)
test <- subset(loans, !sp1)

# Fit an SVM with e1071's default settings, using every other column as a
# predictor of not.fully.paid.
model <- svm(not.fully.paid ~ ., data = train)
summary(model)
## 
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01724138 
## 
## Number of Support Vectors:  2849
## 
##  ( 1776 1073 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# Predict on the held-out rows. The response column is dropped by name rather
# than by position (previously test[1:13]), so the call does not depend on
# not.fully.paid being the last of exactly 14 columns.
predicted.values <- predict(
  model,
  test[setdiff(names(test), "not.fully.paid")]
)

# Confusion table: predicted class (rows) against actual class (columns).
table(predicted.values, test$not.fully.paid)
##                 
## predicted.values    0    1
##                0 2413  460
##                1    0    0