# Import the data
# The working directory is switched to the project folder so the CSV below
# can be referenced by its bare file name.
setwd(dir = "C:/Users/Avner/MarketMaker")
mydata <- read.csv(file = "cs-training.csv")
# Compact listing of every column with its type and first few values.
str(object = mydata)
## 'data.frame': 150000 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SeriousDlqin2yrs : int 1 0 0 0 0 0 0 0 0 0 ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.766 0.957 0.658 0.234 0.907 ...
## $ age : int 45 40 38 30 49 74 57 39 27 57 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: int 2 0 1 0 1 0 0 0 0 0 ...
## $ DebtRatio : num 0.803 0.1219 0.0851 0.036 0.0249 ...
## $ MonthlyIncome : int 9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
## $ NumberOfOpenCreditLinesAndLoans : int 13 4 2 5 7 3 8 8 2 9 ...
## $ NumberOfTimes90DaysLate : int 0 0 1 0 0 0 0 0 0 0 ...
## $ NumberRealEstateLoansOrLines : int 6 0 0 0 1 1 3 0 0 4 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: int 0 0 0 0 0 0 0 0 0 0 ...
## $ NumberOfDependents : int 2 1 0 0 0 1 0 0 NA 2 ...
# Per-column summary statistics (min, quartiles, mean, max) plus the NA count
# for any column that contains missing values.
summary(mydata)
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 1 Min. :0.00000 Min. : 0.00
## 1st Qu.: 37501 1st Qu.:0.00000 1st Qu.: 0.03
## Median : 75001 Median :0.00000 Median : 0.15
## Mean : 75001 Mean :0.06684 Mean : 6.05
## 3rd Qu.:112500 3rd Qu.:0.00000 3rd Qu.: 0.56
## Max. :150000 Max. :1.00000 Max. :50708.00
##
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 41.0 1st Qu.: 0.000 1st Qu.: 0.2
## Median : 52.0 Median : 0.000 Median : 0.4
## Mean : 52.3 Mean : 0.421 Mean : 353.0
## 3rd Qu.: 63.0 3rd Qu.: 0.000 3rd Qu.: 0.9
## Max. :109.0 Max. :98.000 Max. :329664.0
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3400 1st Qu.: 5.000 1st Qu.: 0.000
## Median : 5400 Median : 8.000 Median : 0.000
## Mean : 6670 Mean : 8.453 Mean : 0.266
## 3rd Qu.: 8249 3rd Qu.:11.000 3rd Qu.: 0.000
## Max. :3008750 Max. :58.000 Max. :98.000
## NA's :29731
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 1.018 Mean : 0.2404
## 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :54.000 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.757
## 3rd Qu.: 1.000
## Max. :20.000
## NA's :3924
library(caTools) # install it first in the console
# Fix the RNG seed so the random train/test split performed further down
# (sample.split) gives the same partition on every run.
set.seed(123)

# Replace missing values with the column median. summary(mydata) reports NAs
# only in MonthlyIncome and NumberOfDependents.
# NOTE(review): the medians are computed on the full data set, before the
# train/test split, so test-set rows influence the imputed values.
mydata$MonthlyIncome[is.na(mydata$MonthlyIncome)] <-
  median(mydata$MonthlyIncome, na.rm = TRUE)
mydata$NumberOfDependents[is.na(mydata$NumberOfDependents)] <-
  median(mydata$NumberOfDependents, na.rm = TRUE)

# Sanity check: number of remaining NAs per column (expected to be all zero).
# vapply() pins the result to one integer per column.
vapply(mydata, function(x) sum(is.na(x)), integer(1))
## X SeriousDlqin2yrs
## 0 0
## RevolvingUtilizationOfUnsecuredLines age
## 0 0
## NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## 0 0
## MonthlyIncome NumberOfOpenCreditLinesAndLoans
## 0 0
## NumberOfTimes90DaysLate NumberRealEstateLoansOrLines
## 0 0
## NumberOfTime60.89DaysPastDueNotWorse NumberOfDependents
## 0 0
# Store the response as an integer 0/1 vector.
mydata$SeriousDlqin2yrs <- as.integer(mydata$SeriousDlqin2yrs)

# Random train/test partition. The seed set earlier with set.seed(123) makes
# the draw reproducible. `split` is a logical vector: TRUE marks the 70% of
# rows used for training, FALSE the remaining 30% held out for testing.
split <- sample.split(mydata$SeriousDlqin2yrs, SplitRatio = 0.70)
training_set <- subset(mydata, split)
test_set <- subset(mydata, !split)

# First model: logistic regression of SeriousDlqin2yrs on every other column
# except X.
classifier <- glm(
  SeriousDlqin2yrs ~ . - X,
  family = binomial,
  data = training_set
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
classifier$family
##
## Family: binomial
## Link function: logit
summary(classifier)
##
## Call:
## glm(formula = SeriousDlqin2yrs ~ . - X, family = binomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3011 -0.3910 -0.3138 -0.2523 4.7280
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.315e+00 5.013e-02 -26.239
## RevolvingUtilizationOfUnsecuredLines -1.888e-05 6.419e-05 -0.294
## age -2.874e-02 9.924e-04 -28.955
## NumberOfTime30.59DaysPastDueNotWorse 5.162e-01 1.328e-02 38.870
## DebtRatio -2.505e-05 1.250e-05 -2.005
## MonthlyIncome -3.229e-05 3.694e-06 -8.740
## NumberOfOpenCreditLinesAndLoans -8.466e-03 3.020e-03 -2.803
## NumberOfTimes90DaysLate 4.678e-01 1.798e-02 26.018
## NumberRealEstateLoansOrLines 5.920e-02 1.266e-02 4.678
## NumberOfTime60.89DaysPastDueNotWorse -9.528e-01 2.109e-02 -45.186
## NumberOfDependents 9.602e-02 1.082e-02 8.870
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## RevolvingUtilizationOfUnsecuredLines 0.76866
## age < 2e-16 ***
## NumberOfTime30.59DaysPastDueNotWorse < 2e-16 ***
## DebtRatio 0.04500 *
## MonthlyIncome < 2e-16 ***
## NumberOfOpenCreditLinesAndLoans 0.00506 **
## NumberOfTimes90DaysLate < 2e-16 ***
## NumberRealEstateLoansOrLines 2.9e-06 ***
## NumberOfTime60.89DaysPastDueNotWorse < 2e-16 ***
## NumberOfDependents < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 51530 on 104999 degrees of freedom
## Residual deviance: 47188 on 104989 degrees of freedom
## AIC: 47210
##
## Number of Fisher Scoring iterations: 6
# glm stands for generalized linear model
# Our model is not accurate enough, so we remove three variables
# (RevolvingUtilizationOfUnsecuredLines, DebtRatio and
# NumberOfOpenCreditLinesAndLoans) to try to gain accuracy.
# Refit the logistic regression without X and without the three predictors
# RevolvingUtilizationOfUnsecuredLines, DebtRatio and
# NumberOfOpenCreditLinesAndLoans. This overwrites the previous `classifier`.
classifier <- glm(
  SeriousDlqin2yrs ~ . - X - RevolvingUtilizationOfUnsecuredLines -
    DebtRatio - NumberOfOpenCreditLinesAndLoans,
  family = binomial,
  data = training_set
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Error family and link function used by the fit.
classifier$family
##
## Family: binomial
## Link function: logit
# Coefficient table, deviances and AIC of the reduced model.
summary(classifier)
##
## Call:
## glm(formula = SeriousDlqin2yrs ~ . - X - RevolvingUtilizationOfUnsecuredLines -
## DebtRatio - NumberOfOpenCreditLinesAndLoans, family = binomial,
## data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3185 -0.3913 -0.3146 -0.2524 4.7346
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.341e+00 4.957e-02 -27.061
## age -2.937e-02 9.773e-04 -30.053
## NumberOfTime30.59DaysPastDueNotWorse 5.107e-01 1.314e-02 38.866
## MonthlyIncome -3.251e-05 3.653e-06 -8.900
## NumberOfTimes90DaysLate 4.739e-01 1.791e-02 26.454
## NumberRealEstateLoansOrLines 4.043e-02 1.163e-02 3.477
## NumberOfTime60.89DaysPastDueNotWorse -9.529e-01 2.112e-02 -45.113
## NumberOfDependents 9.671e-02 1.078e-02 8.971
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## NumberOfTime30.59DaysPastDueNotWorse < 2e-16 ***
## MonthlyIncome < 2e-16 ***
## NumberOfTimes90DaysLate < 2e-16 ***
## NumberRealEstateLoansOrLines 0.000507 ***
## NumberOfTime60.89DaysPastDueNotWorse < 2e-16 ***
## NumberOfDependents < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 51530 on 104999 degrees of freedom
## Residual deviance: 47201 on 104992 degrees of freedom
## AIC: 47217
##
## Number of Fisher Scoring iterations: 6
# We fit a logistic regression model of SeriousDlqin2yrs without the
# uninformative variables removed above.
# The final model is the reduced logistic regression already fitted above as
# `classifier` (same formula, family and training data). Reuse that fit rather
# than running an identical glm() call a second time; this also keeps the two
# objects from drifting apart if the formula is edited later.
model <- classifier
summary(model)
##
## Call:
## glm(formula = SeriousDlqin2yrs ~ . - X - RevolvingUtilizationOfUnsecuredLines -
## DebtRatio - NumberOfOpenCreditLinesAndLoans, family = binomial,
## data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3185 -0.3913 -0.3146 -0.2524 4.7346
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.341e+00 4.957e-02 -27.061
## age -2.937e-02 9.773e-04 -30.053
## NumberOfTime30.59DaysPastDueNotWorse 5.107e-01 1.314e-02 38.866
## MonthlyIncome -3.251e-05 3.653e-06 -8.900
## NumberOfTimes90DaysLate 4.739e-01 1.791e-02 26.454
## NumberRealEstateLoansOrLines 4.043e-02 1.163e-02 3.477
## NumberOfTime60.89DaysPastDueNotWorse -9.529e-01 2.112e-02 -45.113
## NumberOfDependents 9.671e-02 1.078e-02 8.971
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## NumberOfTime30.59DaysPastDueNotWorse < 2e-16 ***
## MonthlyIncome < 2e-16 ***
## NumberOfTimes90DaysLate < 2e-16 ***
## NumberRealEstateLoansOrLines 0.000507 ***
## NumberOfTime60.89DaysPastDueNotWorse < 2e-16 ***
## NumberOfDependents < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 51530 on 104999 degrees of freedom
## Residual deviance: 47201 on 104992 degrees of freedom
## AIC: 47217
##
## Number of Fisher Scoring iterations: 6
# On the test set, we use the fitted model to predict the following
# probabilities.
# type = "response" returns predictions on the probability scale (one value
# per row of test_set) rather than on the scale of the linear predictor.
prediction_test <- predict(model, newdata = test_set, type = "response")
# Uncomment to print the predicted probabilities:
# prediction_test
# We predicted the probability that each customer will be unable to repay
# their credit. To compare our results with the true outcomes, we now convert
# the predicted probabilities to 0 or 1 (1 if > 0.5, otherwise 0).
# Turn the predicted probabilities into hard 0/1 labels at a 0.5 cut-off.
# Note that this overwrites the probabilities held in prediction_test.
prediction_test <- ifelse(prediction_test > 0.5, 1, 0)

# Copy of the test set without the four columns excluded from the model
# formula (X and the three dropped predictors).
test_set2 <- test_set[
  ,
  setdiff(
    names(test_set),
    c(
      "X",
      "RevolvingUtilizationOfUnsecuredLines",
      "DebtRatio",
      "NumberOfOpenCreditLinesAndLoans"
    )
  ),
  drop = FALSE
]

# Confusion matrix: rows are the predicted labels, columns the actual values
# of SeriousDlqin2yrs in the test set.
matrix <- table(prediction_test, test_set$SeriousDlqin2yrs)
matrix
##
## prediction_test 0 1
## 0 41890 2879
## 1 102 129
# Confusion-matrix cells. The table was built as table(predicted, actual), so
# rows index the predicted label and columns the actual label:
#   matrix[1, 1]  predicted 0 / actual 0  -> true negative
#   matrix[2, 2]  predicted 1 / actual 1  -> true positive
#   matrix[2, 1]  predicted 1 / actual 0  -> false positive
#   matrix[1, 2]  predicted 0 / actual 1  -> false negative
# (FP and FN were previously read from each other's cell, which turned the
# reported "specificity" into the negative predictive value and the reported
# "sensitivity" into the precision.)
TP <- matrix[2, 2]
TN <- matrix[1, 1]
FP <- matrix[2, 1]
FN <- matrix[1, 2]
# We calculate the accuracy, specificity, sensitivity and precision of the
# model.
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# NOTE(review): prediction_test holds the thresholded 0/1 labels at this
# point, not the predicted probabilities, so any ROC curve derived from `pred`
# would have a single operating point. `pred` is not used by the metrics below.
pred <- prediction(prediction_test, test_set$SeriousDlqin2yrs)
accuracy <- (TN + TP) / (TN + TP + FN + FP) # share of correct predictions
specificity <- TN / (TN + FP)               # true negative rate
sensitivity <- TP / (TP + FN)               # true positive rate (recall)
precision <- TP / (TP + FP)                 # positive predictive value
# The recorded values below are recomputed from the confusion matrix printed
# above (TN = 41890, FN = 2879, FP = 102, TP = 129).
accuracy
## [1] 0.9337556
specificity
## [1] 0.997571
sensitivity
## [1] 0.04288564
precision
## [1] 0.5584416