library(readxl)
defaulter<-read_excel("d:/ds/credit card defaulters.xlsx")
credit<-defaulter
This research aimed at the case of customers’ default payments in Taiwan and compares the predictive accuracy of probability of default.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
credit<-rename(credit,default='default payment next month')
credit<-credit[,-1]
credit$default<-as.factor(credit$default)
split<-sample(nrow(credit),nrow(credit)*0.8)
train<-credit[split,]
test<-credit[-split,]
library(ggplot2)
ggplot(credit,aes(default))+geom_bar()
ggplot(credit,aes(SEX))+geom_bar(aes(fill=default))+ggtitle("credit card distribution among sex")
ggplot(credit,aes(EDUCATION))+geom_bar(aes(fill=default))+ggtitle("credit card defaulter by education ")
Clients who completed education in University has high trend of default payment for next month.
ggplot(credit,aes(AGE))+geom_bar(aes(fill=default))+ggtitle("Default payment next month distribution by Age")
Clients between age of 20-40 have tend to make higher transactions.
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
model<-glm(default ~ .,data = train,family=binomial(link='logit'))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = default ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1427 -0.6953 -0.5461 -0.2901 3.2102
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.168e-01 1.339e-01 -5.352 8.71e-08 ***
## LIMIT_BAL -5.772e-07 1.748e-07 -3.301 0.000962 ***
## SEX -1.092e-01 3.444e-02 -3.170 0.001523 **
## EDUCATION -1.013e-01 2.358e-02 -4.296 1.74e-05 ***
## MARRIAGE -1.327e-01 3.555e-02 -3.732 0.000190 ***
## AGE 6.337e-03 2.007e-03 3.157 0.001593 **
## PAY_0 5.763e-01 1.979e-02 29.126 < 2e-16 ***
## PAY_2 9.091e-02 2.262e-02 4.018 5.86e-05 ***
## PAY_3 8.042e-02 2.544e-02 3.162 0.001569 **
## PAY_4 4.407e-03 2.827e-02 0.156 0.876134
## PAY_5 2.437e-02 3.059e-02 0.797 0.425591
## PAY_6 2.545e-02 2.511e-02 1.014 0.310691
## BILL_AMT1 -6.804e-06 1.317e-06 -5.167 2.38e-07 ***
## BILL_AMT2 3.305e-06 1.712e-06 1.930 0.053548 .
## BILL_AMT3 2.376e-06 1.485e-06 1.600 0.109590
## BILL_AMT4 -3.506e-07 1.483e-06 -0.236 0.813150
## BILL_AMT5 -2.076e-08 1.698e-06 -0.012 0.990243
## BILL_AMT6 6.729e-07 1.360e-06 0.495 0.620870
## PAY_AMT1 -1.799e-05 2.795e-06 -6.438 1.21e-10 ***
## PAY_AMT2 -8.728e-06 2.271e-06 -3.843 0.000121 ***
## PAY_AMT3 -3.484e-07 1.791e-06 -0.195 0.845741
## PAY_AMT4 -4.030e-06 2.042e-06 -1.973 0.048442 *
## PAY_AMT5 -5.466e-06 2.142e-06 -2.551 0.010734 *
## PAY_AMT6 -1.843e-06 1.427e-06 -1.291 0.196575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25228 on 23999 degrees of freedom
## Residual deviance: 22179 on 23976 degrees of freedom
## AIC: 22227
##
## Number of Fisher Scoring iterations: 6
pred<-predict(model,test,type = "response")
pred<-ifelse(pred <= 0.5,0,1)
confusionMatrix(as.factor(test$default),as.factor(pred))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4493 126
## 1 1068 313
##
## Accuracy : 0.801
## 95% CI : (0.7907, 0.811)
## No Information Rate : 0.9268
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.262
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8079
## Specificity : 0.7130
## Pos Pred Value : 0.9727
## Neg Pred Value : 0.2266
## Prevalence : 0.9268
## Detection Rate : 0.7488
## Detection Prevalence : 0.7698
## Balanced Accuracy : 0.7605
##
## 'Positive' Class : 0
##
This model has 80% of accuracy with the test data.So with the future data we can trust this model.