# Load the raw credit-scoring training data into a data frame.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs on a machine where that exact file exists.
cs_training <- read.csv("C:\\Users\\venka\\Downloads\\cs-training.csv")
# Number of rows and columns.
dim(cs_training)
## [1] 150000 12
# Column names of the loaded data frame.
names(cs_training)
## [1] "X"
## [2] "SeriousDlqin2yrs"
## [3] "RevolvingUtilizationOfUnsecuredLines"
## [4] "age"
## [5] "NumberOfTime30.59DaysPastDueNotWorse"
## [6] "DebtRatio"
## [7] "MonthlyIncome"
## [8] "NumberOfOpenCreditLinesAndLoans"
## [9] "NumberOfTimes90DaysLate"
## [10] "NumberRealEstateLoansOrLines"
## [11] "NumberOfTime60.89DaysPastDueNotWorse"
## [12] "NumberOfDependents"
# Per-column summary statistics (quartiles, mean, and NA counts where present).
summary(cs_training)
## X SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines
## Min. : 1 Min. :0.00000 Min. : 0.00
## 1st Qu.: 37501 1st Qu.:0.00000 1st Qu.: 0.03
## Median : 75001 Median :0.00000 Median : 0.15
## Mean : 75001 Mean :0.06684 Mean : 6.05
## 3rd Qu.:112500 3rd Qu.:0.00000 3rd Qu.: 0.56
## Max. :150000 Max. :1.00000 Max. :50708.00
##
## age NumberOfTime30.59DaysPastDueNotWorse DebtRatio
## Min. : 0.0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 41.0 1st Qu.: 0.000 1st Qu.: 0.2
## Median : 52.0 Median : 0.000 Median : 0.4
## Mean : 52.3 Mean : 0.421 Mean : 353.0
## 3rd Qu.: 63.0 3rd Qu.: 0.000 3rd Qu.: 0.9
## Max. :109.0 Max. :98.000 Max. :329664.0
##
## MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## Min. : 0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3400 1st Qu.: 5.000 1st Qu.: 0.000
## Median : 5400 Median : 8.000 Median : 0.000
## Mean : 6670 Mean : 8.453 Mean : 0.266
## 3rd Qu.: 8249 3rd Qu.:11.000 3rd Qu.: 0.000
## Max. :3008750 Max. :58.000 Max. :98.000
## NA's :29731
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 1.000 Median : 0.0000
## Mean : 1.018 Mean : 0.2404
## 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :54.000 Max. :98.0000
##
## NumberOfDependents
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 0.757
## 3rd Qu.: 1.000
## Max. :20.000
## NA's :3924
# Compact view of each column's storage type and its first few values.
str(cs_training)
## 'data.frame': 150000 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SeriousDlqin2yrs : int 1 0 0 0 0 0 0 0 0 0 ...
## $ RevolvingUtilizationOfUnsecuredLines: num 0.766 0.957 0.658 0.234 0.907 ...
## $ age : int 45 40 38 30 49 74 57 39 27 57 ...
## $ NumberOfTime30.59DaysPastDueNotWorse: int 2 0 1 0 1 0 0 0 0 0 ...
## $ DebtRatio : num 0.803 0.1219 0.0851 0.036 0.0249 ...
## $ MonthlyIncome : int 9120 2600 3042 3300 63588 3500 NA 3500 NA 23684 ...
## $ NumberOfOpenCreditLinesAndLoans : int 13 4 2 5 7 3 8 8 2 9 ...
## $ NumberOfTimes90DaysLate : int 0 0 1 0 0 0 0 0 0 0 ...
## $ NumberRealEstateLoansOrLines : int 6 0 0 0 1 1 3 0 0 4 ...
## $ NumberOfTime60.89DaysPastDueNotWorse: int 0 0 0 0 0 0 0 0 0 0 ...
## $ NumberOfDependents : int 2 1 0 0 0 1 0 0 NA 2 ...
# Count missing (TRUE) versus present (FALSE) cells over the whole data frame.
table(is.na(cs_training))
##
## FALSE TRUE
## 1766345 33655
# Work on a copy so the raw data frame stays untouched.
cs_training1 <- cs_training
# Impute missing MonthlyIncome with the observed median. The value is computed
# from the data instead of being hard-coded; for this file it is 5400, the
# median reported by summary(cs_training).
cs_training1$MonthlyIncome[is.na(cs_training1$MonthlyIncome)] <-
  median(cs_training1$MonthlyIncome, na.rm = TRUE)
# Confirm that no NA values remain in the column.
table(is.na(cs_training1$MonthlyIncome))
##
## FALSE
## 150000
#cs_training$NumberOfDependents
# Count missing NumberOfDependents values in the raw data.
table(is.na(cs_training$NumberOfDependents))
##
## FALSE TRUE
## 146076 3924
# Impute missing NumberOfDependents with the observed mean, rounded to three
# decimals. The value is computed from the data instead of being hard-coded;
# for this file it is 0.757, the mean reported by summary(cs_training).
# NOTE(review): mean imputation puts a fractional value into a count column;
# consider the median or a separate "missing" indicator instead.
cs_training1$NumberOfDependents[is.na(cs_training1$NumberOfDependents)] <-
  round(mean(cs_training1$NumberOfDependents, na.rm = TRUE), 3)
# Confirm that no NA values remain in the column.
table(is.na(cs_training1$NumberOfDependents))
##
## FALSE
## 150000
##Variable: age Description: Age of borrower in years
“Initial Observations: The data looks fairly well distributed, and its shape resembles a normal (or Poisson) distribution.”
library("ggplot2")
# Box plot of age for each outcome class. The 0/1 target is wrapped in
# factor() because a numeric x makes geom_boxplot() draw a single box for all
# rows and emit the "Continuous x aesthetic" warning.
ggplot(cs_training1, aes(x = factor(SeriousDlqin2yrs), y = age)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
# NOTE(review): the minimum age reported below is 0, which is not a plausible
# borrower age.
summary(cs_training1$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 41.0 52.0 52.3 63.0 109.0
table(is.na(cs_training1$age))
##
## FALSE
## 150000
quantile(
  cs_training1$age,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 91% 92% 93% 94% 95% 100%
## 33 39 44 48 52 56 61 65 72 73 74 75 76 78 109
# NOTE(review): unlike most other sections, this filter KEEPS only the extreme
# rows (age > 101) instead of dropping them - confirm that is intended.
cs_0.hi <- subset(cs_training1, age > 101)
ggplot(cs_0.hi, aes(x = factor(SeriousDlqin2yrs), y = age)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
# NOTE(review): qplot() is deprecated in ggplot2 >= 3.4.0; it still draws the
# histogram but warns on newer versions.
qplot(cs_0.hi$age, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Variable: MonthlyIncome Description: Monthly income
“Initial Observations: The data appears to have some outliers at the top end that need to be removed. After this, we may be able to get a better distribution. The frequency of the outliers is very low.”
# Histogram of MonthlyIncome over all rows of cs_training1.
qplot(cs_training1$MonthlyIncome, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#outlier treatment for “MonthlyIncome”
library("ggplot2")
# Box plot of MonthlyIncome for each outcome class. The 0/1 target is wrapped
# in factor() because a numeric x makes geom_boxplot() draw a single box for
# all rows and emit the "Continuous x aesthetic" warning.
ggplot(cs_training1, aes(x = factor(SeriousDlqin2yrs), y = MonthlyIncome)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3903 5400 6418 7400 3008750
table(is.na(cs_training1$MonthlyIncome))
##
## FALSE
## 150000
quantile(
  cs_training1$MonthlyIncome,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50% 60%
## 2325.00 3400.00 4333.00 5400.00 5400.00 5400.00
## 70% 80% 90% 91% 92% 93%
## 6613.00 8250.00 10750.00 11105.18 11666.00 12120.00
## 94% 95% 100%
## 12744.18 13500.00 3008750.00
# Keep rows with MonthlyIncome below 7500 and re-plot the trimmed data.
# NOTE(review): per the quantiles above, this cutoff sits between the 70% and
# 80% quantiles, so it drops far more than the extreme tail.
cs_3.hi <- subset(cs_training1, MonthlyIncome < 7500)
ggplot(cs_3.hi, aes(x = factor(SeriousDlqin2yrs), y = MonthlyIncome)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_3.hi$MonthlyIncome, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###RevolvingUtilizationOfUnsecuredLines “Description: Total balance on credit cards and personal lines of credit (excluding real estate and installment debt such as car loans) divided by the sum of credit limits”
“Initial Observations: This variable contains some outliers. Since the variable is the fraction of unsecured credit that is being utilized, values between 0 and 1 should make up most of the observations.
Some values might be higher than 1 if a line of credit was closed while a balance still exists.” #outlier treatment for “RevolvingUtilizationOfUnsecuredLines”
library("ggplot2")
# Box plot of RevolvingUtilizationOfUnsecuredLines for each outcome class. The
# 0/1 target is wrapped in factor() because a numeric x makes geom_boxplot()
# draw a single box for all rows and emit the "Continuous x aesthetic" warning.
ggplot(
  cs_training1,
  aes(x = factor(SeriousDlqin2yrs), y = RevolvingUtilizationOfUnsecuredLines)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$RevolvingUtilizationOfUnsecuredLines)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.03 0.15 6.05 0.56 50708.00
table(is.na(cs_training1$RevolvingUtilizationOfUnsecuredLines))
##
## FALSE
## 150000
quantile(
  cs_training1$RevolvingUtilizationOfUnsecuredLines,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50%
## 2.968979e-03 1.922217e-02 4.346136e-02 8.318149e-02 1.541807e-01
## 60% 70% 80% 90% 91%
## 2.714925e-01 4.451364e-01 6.988571e-01 9.812777e-01 9.999999e-01
## 92% 93% 94% 95% 100%
## 9.999999e-01 9.999999e-01 9.999999e-01 9.999999e-01 5.070800e+04
# Keep rows with utilization below 1.3 and re-plot the trimmed data.
cs_4.hi <- subset(cs_training1, RevolvingUtilizationOfUnsecuredLines < 1.3)
ggplot(
  cs_4.hi,
  aes(x = factor(SeriousDlqin2yrs), y = RevolvingUtilizationOfUnsecuredLines)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(
  cs_4.hi$RevolvingUtilizationOfUnsecuredLines,
  colour = I("pink"), size = I(5)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###NumberOfOpenCreditLinesAndLoans “Description: Number of open loans (installment loans such as a car loan or mortgage) and lines of credit (e.g. credit cards)”
“Initial Observations: The data looks fairly well distributed, but there seem to be a few outliers.” #outlier treatment for “NumberOfOpenCreditLinesAndLoans”
library("ggplot2")
# Box plot of NumberOfOpenCreditLinesAndLoans for each outcome class. The 0/1
# target is wrapped in factor() because a numeric x makes geom_boxplot() draw
# a single box for all rows and emit the "Continuous x aesthetic" warning.
ggplot(
  cs_training1,
  aes(x = factor(SeriousDlqin2yrs), y = NumberOfOpenCreditLinesAndLoans)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$NumberOfOpenCreditLinesAndLoans)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.000 8.000 8.453 11.000 58.000
table(is.na(cs_training1$NumberOfOpenCreditLinesAndLoans))
##
## FALSE
## 150000
quantile(
  cs_training1$NumberOfOpenCreditLinesAndLoans,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 91% 92% 93% 94% 95% 100%
## 3 4 5 6 8 9 10 12 15 16 16 17 17 18 58
# Keep rows with fewer than 21 open lines/loans and re-plot the trimmed data.
cs_5.hi <- subset(cs_training1, NumberOfOpenCreditLinesAndLoans < 21)
ggplot(
  cs_5.hi,
  aes(x = factor(SeriousDlqin2yrs), y = NumberOfOpenCreditLinesAndLoans)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_5.hi$NumberOfOpenCreditLinesAndLoans, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###NumberRealEstateLoansOrLines
“Description: Number of mortgage and real estate loans, including home equity lines of credit”
“Initial Observations: The data looks fairly well distributed, but there seems to be a few outliers with low frequency.” #outlier treatment for “NumberRealEstateLoansOrLines”
library("ggplot2")
# Box plot of NumberRealEstateLoansOrLines for each outcome class. The 0/1
# target is wrapped in factor() because a numeric x makes geom_boxplot() draw
# a single box for all rows and emit the "Continuous x aesthetic" warning.
ggplot(
  cs_training1,
  aes(x = factor(SeriousDlqin2yrs), y = NumberRealEstateLoansOrLines)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$NumberRealEstateLoansOrLines)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.018 2.000 54.000
table(is.na(cs_training1$NumberRealEstateLoansOrLines))
##
## FALSE
## 150000
quantile(
  cs_training1$NumberRealEstateLoansOrLines,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 91% 92% 93% 94% 95% 100%
## 0 0 0 1 1 1 1 2 2 2 2 2 3 3 54
# Keep rows with fewer than 3 real-estate loans/lines and re-plot.
cs_6.hi <- subset(cs_training1, NumberRealEstateLoansOrLines < 3)
ggplot(
  cs_6.hi,
  aes(x = factor(SeriousDlqin2yrs), y = NumberRealEstateLoansOrLines)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_6.hi$NumberRealEstateLoansOrLines, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###NumberOfDependents “Description: Number of dependents in family, excluding themselves (spouse, children, etc.)”
“Initial Observations: The number of dependents had some missing values (N/A in the initial dataset); these were imputed earlier with the column mean (0.757), so the minimum is 0 and no NA values remain. There are also some outliers that will need to be dropped to improve the chart. Again, the frequency of these outliers is very low.” #outlier treatment for “NumberOfDependents”
library("ggplot2")
# Box plot of NumberOfDependents for each outcome class. The 0/1 target is
# wrapped in factor() because a numeric x makes geom_boxplot() draw a single
# box for all rows and emit the "Continuous x aesthetic" warning.
ggplot(
  cs_training1,
  aes(x = factor(SeriousDlqin2yrs), y = NumberOfDependents)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$NumberOfDependents)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7572 1.0000 20.0000
table(is.na(cs_training1$NumberOfDependents))
##
## FALSE
## 150000
# The NA re-imputation that used to follow was removed: the table above shows
# the column has no NA values left, so it was a no-op.
quantile(
  cs_training1$NumberOfDependents,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 91%
## 0.000 0.000 0.000 0.000 0.000 0.757 1.000 2.000 2.000 2.000
## 92% 93% 94% 95% 100%
## 3.000 3.000 3.000 3.000 20.000
# Keep rows with fewer than 3 dependents and re-plot the trimmed data.
cs_7.hi <- subset(cs_training1, NumberOfDependents < 3)
ggplot(cs_7.hi, aes(x = factor(SeriousDlqin2yrs), y = NumberOfDependents)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_7.hi$NumberOfDependents, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###DebtRatio + Debt ratio = total debt / total assets #outlier treatment for “DebtRatio”
library("ggplot2")
# Box plot of DebtRatio for each outcome class. The 0/1 target is wrapped in
# factor() because a numeric x makes geom_boxplot() draw a single box for all
# rows and emit the "Continuous x aesthetic" warning.
ggplot(cs_training1, aes(x = factor(SeriousDlqin2yrs), y = DebtRatio)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$DebtRatio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.2 0.4 353.0 0.9 329664.0
table(is.na(cs_training1$DebtRatio))
##
## FALSE
## 150000
quantile(
  cs_training1$DebtRatio,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 1)
)
## 10% 20% 30% 40% 50%
## 3.087398e-02 1.337729e-01 2.136969e-01 2.874603e-01 3.665078e-01
## 60% 70% 80% 90% 91%
## 4.675064e-01 6.491891e-01 4.000000e+00 1.267000e+03 1.462000e+03
## 92% 93% 94% 95% 100%
## 1.685000e+03 1.917070e+03 2.172060e+03 2.449000e+03 3.296640e+05
# Keep rows with DebtRatio below 0.4 and re-plot the trimmed data.
# NOTE(review): per the quantiles above, 0.4 lies between the 50% and 60%
# quantiles, so this drops roughly half of the rows, not just the tail.
cs_8.hi <- subset(cs_training1, DebtRatio < 0.4)
ggplot(cs_8.hi, aes(x = factor(SeriousDlqin2yrs), y = DebtRatio)) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_8.hi$DebtRatio, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###NumberOfTimes90DaysLate “Description: Number of times borrower has been 90 days or more past due.” #outlier treatment for “NumberOfTimes90DaysLate”
library("ggplot2")
# Box plot of NumberOfTimes90DaysLate for each outcome class. The 0/1 target
# is wrapped in factor() because a numeric x makes geom_boxplot() draw a
# single box for all rows and emit the "Continuous x aesthetic" warning.
ggplot(
  cs_training1,
  aes(x = factor(SeriousDlqin2yrs), y = NumberOfTimes90DaysLate)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
summary(cs_training1$NumberOfTimes90DaysLate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.266 0.000 98.000
table(is.na(cs_training1$NumberOfTimes90DaysLate))
##
## FALSE
## 150000
quantile(
  cs_training1$NumberOfTimes90DaysLate,
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, .98, .99, 1)
)
## 10% 20% 30% 40% 50% 60% 70% 80% 90% 91% 92% 93% 94% 95% 96%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## 97% 98% 99% 100%
## 1 2 3 98
# NOTE(review): this filter KEEPS only the extreme rows (> 95) instead of
# dropping them - confirm whether the intent is to inspect or remove them.
cs_9.hi <- subset(cs_training1, NumberOfTimes90DaysLate > 95)
ggplot(
  cs_9.hi,
  aes(x = factor(SeriousDlqin2yrs), y = NumberOfTimes90DaysLate)
) +
  geom_boxplot() +
  labs(x = "SeriousDlqin2yrs")
qplot(cs_9.hi$NumberOfTimes90DaysLate, colour = I("pink"), size = I(5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Multiple logistic Regression Line
# Fit a logistic regression of SeriousDlqin2yrs on every other column of
# cs_training1.
# NOTE(review): "~ ." also includes X, which str() shows as 1, 2, 3, ... and
# looks like a row index rather than a real predictor; consider "~ . - X".
# NOTE(review): the model is fitted on the full cs_training1, not on any of
# the trimmed cs_*.hi subsets built during outlier exploration.
model_2_cs_training1<-glm(SeriousDlqin2yrs~.,family=binomial(),data=cs_training1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances and AIC for the fitted model.
summary(model_2_cs_training1)
##
## Call:
## glm(formula = SeriousDlqin2yrs ~ ., family = binomial(), data = cs_training1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2921 -0.3917 -0.3149 -0.2535 4.9464
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.341e+00 4.583e-02 -29.253
## X 3.079e-07 2.472e-07 1.246
## RevolvingUtilizationOfUnsecuredLines -3.713e-05 6.115e-05 -0.607
## age -2.849e-02 8.286e-04 -34.382
## NumberOfTime30.59DaysPastDueNotWorse 5.036e-01 1.110e-02 45.366
## DebtRatio -2.886e-05 1.045e-05 -2.761
## MonthlyIncome -3.590e-05 3.148e-06 -11.406
## NumberOfOpenCreditLinesAndLoans -7.499e-03 2.520e-03 -2.976
## NumberOfTimes90DaysLate 4.685e-01 1.522e-02 30.786
## NumberRealEstateLoansOrLines 6.890e-02 1.057e-02 6.516
## NumberOfTime60.89DaysPastDueNotWorse -9.401e-01 1.772e-02 -53.062
## NumberOfDependents 9.364e-02 9.113e-03 10.276
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## X 0.21288
## RevolvingUtilizationOfUnsecuredLines 0.54373
## age < 2e-16 ***
## NumberOfTime30.59DaysPastDueNotWorse < 2e-16 ***
## DebtRatio 0.00577 **
## MonthlyIncome < 2e-16 ***
## NumberOfOpenCreditLinesAndLoans 0.00292 **
## NumberOfTimes90DaysLate < 2e-16 ***
## NumberRealEstateLoansOrLines 7.23e-11 ***
## NumberOfTime60.89DaysPastDueNotWorse < 2e-16 ***
## NumberOfDependents < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 73616 on 149999 degrees of freedom
## Residual deviance: 67519 on 149988 degrees of freedom
## AIC: 67543
##
## Number of Fisher Scoring iterations: 6
#Individual Impact of Variables
# caret supplies varImp(), used here to rank the predictors of the fitted glm.
library(caret)
## Loading required package: lattice
# Unscaled importance per predictor; the values printed below match the
# absolute z values in the model summary.
varImp(model_2_cs_training1, scale = FALSE)
## Overall
## X 1.2456907
## RevolvingUtilizationOfUnsecuredLines 0.6071808
## age 34.3823493
## NumberOfTime30.59DaysPastDueNotWorse 45.3660227
## DebtRatio 2.7606152
## MonthlyIncome 11.4058498
## NumberOfOpenCreditLinesAndLoans 2.9757971
## NumberOfTimes90DaysLate 30.7861837
## NumberRealEstateLoansOrLines 6.5157535
## NumberOfTime60.89DaysPastDueNotWorse 53.0619158
## NumberOfDependents 10.2757933
#Multicollinearity
# Multicollinearity check: variance inflation factors from the car package.
cat("Need car package")
## Need car package
library(car)
cat("use VIF for identifying the Multicollinearity")
## use VIF for identifying the Multicollinearity
# A bare top-level call auto-prints, so no wrapping parentheses are needed.
# In the printed result the three past-due count columns have very large VIFs
# (about 68, 127 and 171) while every other predictor is below 1.5.
vif(model_2_cs_training1)
## X RevolvingUtilizationOfUnsecuredLines
## 1.000078 1.000502
## age NumberOfTime30.59DaysPastDueNotWorse
## 1.104773 67.707499
## DebtRatio MonthlyIncome
## 1.065480 1.213763
## NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate
## 1.401747 126.970168
## NumberRealEstateLoansOrLines NumberOfTime60.89DaysPastDueNotWorse
## 1.420294 171.264185
## NumberOfDependents
## 1.064611
#classification table
# Fitted default probabilities for the rows the model was trained on, so every
# metric below is in-sample.
predicted_values <- predict(model_2_cs_training1, type = "response")
cat("Predcited Values")
## Predcited Values
predicted_values[1:10]
## 1 2 3 4 5 6
## 0.19156317 0.07515536 0.17143194 0.08695387 0.01098934 0.03123077
## 7 8 9 10
## 0.04005588 0.06676225 0.09541236 0.03168820
cat("Lets convert them to classes using a threshold")
## Lets convert them to classes using a threshold
threshold <- 0.5
threshold
## [1] 0.5
# Reuse the probabilities computed above instead of calling predict() again.
predicted_class <- ifelse(predicted_values > threshold, 1, 0)
cat("Predcited Classes")
## Predcited Classes
actual_values <- cs_training1$SeriousDlqin2yrs
# Pin both factors to levels 0/1 so the table is always 2 x 2; otherwise
# conf_matrix[2, 2] fails with "subscript out of bounds" whenever one class
# is never predicted.
conf_matrix <- table(
  predicted_class = factor(predicted_class, levels = c(0, 1)),
  actual_values = factor(actual_values, levels = c(0, 1))
)
cat("Confusion Matrix")
## Confusion Matrix
conf_matrix
## actual_values
## predicted_class 0 1
## 0 139649 9590
## 1 325 436
# Share of rows on the diagonal (correct predictions).
# NOTE(review): always predicting 0 would already score
# (139649 + 325) / 150000 = 0.93316, and only 436 of the 10026 actual
# defaulters are caught, so accuracy alone says little here.
accuracy <- (conf_matrix[1, 1] + conf_matrix[2, 2]) / sum(conf_matrix)
cat("Accuracy")
## Accuracy
accuracy
## [1] 0.9339
###AIC and BIC
# Information criteria for the first model (lower is better when comparing
# models fitted to the same data).
library(stats)
AIC(model_2_cs_training1)
## [1] 67542.51
BIC(model_2_cs_training1)
## [1] 67661.53
#model2
# Second logistic regression on a reduced predictor set: X, age and the three
# past-due count columns are left out of the formula.
model_3_cs_training1 <- glm(
  SeriousDlqin2yrs ~ RevolvingUtilizationOfUnsecuredLines +
    DebtRatio +
    MonthlyIncome +
    NumberOfOpenCreditLinesAndLoans +
    NumberRealEstateLoansOrLines +
    NumberOfDependents,
  family = binomial(),
  data = cs_training1
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances and AIC for the reduced model.
summary(model_3_cs_training1)
##
## Call:
## glm(formula = SeriousDlqin2yrs ~ RevolvingUtilizationOfUnsecuredLines +
## DebtRatio + MonthlyIncome + NumberOfOpenCreditLinesAndLoans +
## NumberRealEstateLoansOrLines + NumberOfDependents, family = binomial(),
## data = cs_training1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4661 -0.3924 -0.3620 -0.3311 5.6731
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.325e+00 2.457e-02 -94.627
## RevolvingUtilizationOfUnsecuredLines -3.882e-05 6.150e-05 -0.631
## DebtRatio -5.051e-05 1.094e-05 -4.616
## MonthlyIncome -5.422e-05 3.242e-06 -16.723
## NumberOfOpenCreditLinesAndLoans -2.296e-02 2.429e-03 -9.453
## NumberRealEstateLoansOrLines 6.604e-02 1.082e-02 6.101
## NumberOfDependents 1.755e-01 8.615e-03 20.375
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## RevolvingUtilizationOfUnsecuredLines 0.528
## DebtRatio 3.90e-06 ***
## MonthlyIncome < 2e-16 ***
## NumberOfOpenCreditLinesAndLoans < 2e-16 ***
## NumberRealEstateLoansOrLines 1.05e-09 ***
## NumberOfDependents < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 73616 on 149999 degrees of freedom
## Residual deviance: 72807 on 149993 degrees of freedom
## AIC: 72821
##
## Number of Fisher Scoring iterations: 6
#classification table2
# In-sample classification accuracy of the reduced model at a 0.5 cutoff.
threshold <- 0.5
# predicted_values holds 0/1 class labels here, not probabilities.
predicted_values <- ifelse(
  predict(model_3_cs_training1, type = "response") > threshold, 1, 0
)
actual_values <- cs_training1$SeriousDlqin2yrs
# Only this model's predictions are tabulated. The earlier table built from
# predicted_class (the first model's labels) was immediately overwritten and
# has been removed. Levels are pinned to 0/1 so the table is always 2 x 2 and
# conf_matrix[2, 2] cannot go out of bounds.
conf_matrix <- table(
  predicted_values = factor(predicted_values, levels = c(0, 1)),
  actual_values = factor(actual_values, levels = c(0, 1))
)
# Share of rows on the diagonal (correct predictions).
accuracy2 <- (conf_matrix[1, 1] + conf_matrix[2, 2]) / sum(conf_matrix)
accuracy2
## [1] 0.9331533
Conclusion: Banks can use these models to understand and manage their loan risks better.