In this project, we will use Support Vector Machines (SVMs) to build a model that helps predict the default risk of people who borrow money.
# Download the loan data set and inspect its structure.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, whereas the rest of this analysis
# (and the str()/summary() output recorded below) treats `purpose` as a factor.
url <- "https://raw.githubusercontent.com/jerrytigerxu/SVM-Project/master/data/loan_data.csv"
loans <- read.csv(url, stringsAsFactors = TRUE)
str(loans)
## 'data.frame': 9578 obs. of 14 variables:
## $ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purpose : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 19.5 14.3 11.6 8.1 15 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : int 0 0 0 0 1 0 0 0 0 0 ...
## $ pub.rec : int 0 0 0 0 0 0 1 0 0 0 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
# Print per-column summary statistics for the loans data frame
# (quantiles/mean for numeric columns, level counts for factor columns).
summary(loans)
## credit.policy purpose int.rate
## Min. :0.000 all_other :2331 Min. :0.0600
## 1st Qu.:1.000 credit_card :1262 1st Qu.:0.1039
## Median :1.000 debt_consolidation:3957 Median :0.1221
## Mean :0.805 educational : 343 Mean :0.1226
## 3rd Qu.:1.000 home_improvement : 629 3rd Qu.:0.1407
## Max. :1.000 major_purchase : 437 Max. :0.2164
## small_business : 619
## installment log.annual.inc dti fico
## Min. : 15.67 Min. : 7.548 Min. : 0.000 Min. :612.0
## 1st Qu.:163.77 1st Qu.:10.558 1st Qu.: 7.213 1st Qu.:682.0
## Median :268.95 Median :10.929 Median :12.665 Median :707.0
## Mean :319.09 Mean :10.932 Mean :12.607 Mean :710.8
## 3rd Qu.:432.76 3rd Qu.:11.291 3rd Qu.:17.950 3rd Qu.:737.0
## Max. :940.14 Max. :14.528 Max. :29.960 Max. :827.0
##
## days.with.cr.line revol.bal revol.util inq.last.6mths
## Min. : 179 Min. : 0 Min. : 0.0 Min. : 0.000
## 1st Qu.: 2820 1st Qu.: 3187 1st Qu.: 22.6 1st Qu.: 0.000
## Median : 4140 Median : 8596 Median : 46.3 Median : 1.000
## Mean : 4561 Mean : 16914 Mean : 46.8 Mean : 1.577
## 3rd Qu.: 5730 3rd Qu.: 18250 3rd Qu.: 70.9 3rd Qu.: 2.000
## Max. :17640 Max. :1207359 Max. :119.0 Max. :33.000
##
## delinq.2yrs pub.rec not.fully.paid
## Min. : 0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 0.0000 Median :0.00000 Median :0.0000
## Mean : 0.1637 Mean :0.06212 Mean :0.1601
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :13.0000 Max. :5.00000 Max. :1.0000
##
# Recode the integer-coded columns as factors so they are treated as
# categorical by the plots and by the classifier. The response
# `not.fully.paid` must be a factor for svm() to perform classification.
# NOTE(review): inq.last.6mths, delinq.2yrs and pub.rec are counts; treating
# them as categorical is a modelling choice -- confirm it is intended.
categorical_cols <- c(
  "credit.policy",
  "inq.last.6mths",
  "delinq.2yrs",
  "pub.rec",
  "not.fully.paid"
)
loans[categorical_cols] <- lapply(loans[categorical_cols], factor)
Let’s use ggplot2 to visualize the data.
library(ggplot2)
# Histogram of FICO scores, with bars filled by repayment outcome.
p1 <- ggplot(loans, aes(x = fico)) +
  geom_histogram(
    aes(fill = not.fully.paid),
    color = "black",
    bins = 40,
    alpha = 0.5
  )
p1 +
  scale_fill_manual(values = c("green", "red")) +
  theme_bw()

# Loan counts per purpose, split side by side by repayment outcome.
p1 <- ggplot(loans, aes(x = factor(purpose))) +
  geom_bar(aes(fill = not.fully.paid), position = "dodge")
p1 +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Interest rate against FICO score: first plain, then coloured by outcome
# with transparency to reduce overplotting.
ggplot(loans, aes(x = int.rate, y = fico)) +
  geom_point() +
  theme_bw()
ggplot(loans, aes(x = int.rate, y = fico)) +
  geom_point(aes(color = not.fully.paid), alpha = 0.3) +
  theme_bw()
library(caTools)
# Reproducible 70/30 train/test split on the response column.
# sample.split() returns a logical vector: TRUE marks training rows.
set.seed(101)
sp1 <- sample.split(loans$not.fully.paid, SplitRatio = 0.7)
train <- loans[sp1, ]
test <- loans[!sp1, ]
library(e1071)
# Fit an SVM classifier of not.fully.paid on every other column, using the
# default settings of svm(), then print the fitted model's summary.
model <- svm(formula = not.fully.paid ~ ., data = train)
summary(model)
##
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01724138
##
## Number of Support Vectors: 2849
##
## ( 1776 1073 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Predict the held-out loans and cross-tabulate predictions against the truth.
# The response is dropped by name rather than by position (previously
# test[1:13]), so the call keeps working if columns are added or reordered.
predicted.values <- predict(
  model,
  newdata = test[, names(test) != "not.fully.paid", drop = FALSE]
)
# Rows are predicted classes, columns are actual classes.
# NOTE(review): the recorded output shows every test loan predicted as class 0;
# the default cost/gamma probably need tuning (e.g. with e1071::tune()).
table(predicted.values, test$not.fully.paid)
##
## predicted.values 0 1
## 0 2413 460
## 1 0 0