library(ggplot2)
## Registered S3 methods overwritten by 'tibble':
## method from
## format.tbl pillar
## print.tbl pillar
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS) # for stepAIC() model selection
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret) # for the confusion matrix
## Loading required package: lattice
defaultData <- read.csv("defaultData.csv", sep = ";")
head(defaultData)
## ID limitBal sex education marriage age pay1 pay2 pay3 pay4 pay5 pay6 billAmt1
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913
## 2 2 120000 2 2 2 26 -1 2 0 0 0 2 2682
## 3 3 90000 2 2 2 34 0 0 0 0 0 0 29239
## 4 4 50000 2 2 1 37 0 0 0 0 0 0 46990
## 5 5 50000 1 2 1 57 -1 0 -1 0 0 0 8617
## 6 6 50000 1 1 2 37 0 0 0 0 0 0 64400
## billAmt2 billAmt3 billAmt4 billAmt5 billAmt6 payAmt1 payAmt2 payAmt3 payAmt4
## 1 3102 689 0 0 0 0 689 0 0
## 2 1725 2682 3272 3455 3261 0 1000 1000 1000
## 3 14027 13559 14331 14948 15549 1518 1500 1000 1000
## 4 48233 49291 28314 28959 29547 2000 2019 1200 1100
## 5 5670 35835 20940 19146 19131 2000 36681 10000 9000
## 6 57069 57608 19394 19619 20024 2500 1815 657 1000
## payAmt5 payAmt6 PaymentDefault
## 1 0 0 1
## 2 0 2000 1
## 3 1000 5000 0
## 4 1069 1000 0
## 5 689 679 0
## 6 1000 800 0
colnames(defaultData)
## [1] "ID" "limitBal" "sex" "education"
## [5] "marriage" "age" "pay1" "pay2"
## [9] "pay3" "pay4" "pay5" "pay6"
## [13] "billAmt1" "billAmt2" "billAmt3" "billAmt4"
## [17] "billAmt5" "billAmt6" "payAmt1" "payAmt2"
## [21] "payAmt3" "payAmt4" "payAmt5" "payAmt6"
## [25] "PaymentDefault"
### Table of values for p, odds, and log-odds

|     p |     odds |  log-odds |
|------:|---------:|----------:|
| .001  | .001001  | -6.906755 |
| .01   | .010101  | -4.59512  |
| .15   | .1764706 | -1.734601 |
| .4    | .6666667 | -.4054651 |
| .45   | .8181818 | -.2006707 |
| .5    | 1        |  0        |
| .55   | 1.222222 |  .2006707 |
| .8    | 4        |  1.386294 |
| .85   | 5.666667 |  1.734601 |
| .9    | 9        |  2.197225 |
| .999  | 999      |  6.906755 |
| .9999 | 9999     |  9.21024  |
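The table above can be reproduced in a couple of lines of R (a minimal sketch; the column names are illustrative):
# Reproduce the p / odds / log-odds table
p <- c(.001, .01, .15, .4, .45, .5, .55, .8, .85, .9, .999, .9999)
data.frame(p, odds = p / (1 - p), logodds = log(p / (1 - p)))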
\[ \mathrm{logit}(p) = \log\!\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k \]
\[ \frac{1-p}{p} = \frac{1}{\exp(\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k)} \]
and finally
\[ p = \frac{\exp(\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k)}{1 + \exp(\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k)} \]

#### The same model can also be presented more compactly by writing
\[ Z = \beta_0 + \sum_p \beta_p x_p \]
so that
\[ P(Y = 1) = \frac{e^Z}{1 + e^Z} \]
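In base R these two mappings are available directly as qlogis() (probability to log-odds) and plogis() (log-odds to probability); a quick check against the table above:
qlogis(0.85)     # log-odds for p = 0.85, i.e. 1.734601
plogis(1.386294) # probability for log-odds 1.386294, i.e. 0.8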
# Build logistic regression model
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family ='binomial', data = defaultData)
# Take a look at the model
summary(logitModelFull)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
## billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
## payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6, family = "binomial",
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0893 -0.7116 -0.5615 -0.2794 4.2501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.711e-01 1.505e-01 -3.795 0.000148 ***
## limitBal -4.825e-07 1.985e-07 -2.431 0.015052 *
## sex -8.251e-02 3.880e-02 -2.127 0.033457 *
## education -1.217e-01 2.745e-02 -4.434 9.23e-06 ***
## marriage -1.711e-01 4.016e-02 -4.259 2.05e-05 ***
## age 4.824e-03 2.257e-03 2.137 0.032570 *
## pay1 5.743e-01 2.221e-02 25.864 < 2e-16 ***
## pay2 5.156e-02 2.552e-02 2.020 0.043336 *
## pay3 7.811e-02 2.863e-02 2.728 0.006375 **
## pay4 -1.191e-02 3.285e-02 -0.363 0.716838
## pay5 1.080e-01 3.381e-02 3.193 0.001406 **
## pay6 -1.956e-02 2.750e-02 -0.711 0.476852
## billAmt1 -7.948e-06 1.582e-06 -5.023 5.09e-07 ***
## billAmt2 4.911e-06 2.006e-06 2.448 0.014350 *
## billAmt3 4.203e-07 1.698e-06 0.247 0.804572
## billAmt4 -1.587e-08 1.872e-06 -0.008 0.993234
## billAmt5 9.703e-07 2.154e-06 0.451 0.652293
## billAmt6 6.758e-07 1.591e-06 0.425 0.670955
## payAmt1 -1.878e-05 3.252e-06 -5.777 7.61e-09 ***
## payAmt2 -6.406e-06 2.364e-06 -2.710 0.006731 **
## payAmt3 -3.325e-06 2.401e-06 -1.385 0.166153
## payAmt4 -3.922e-06 2.342e-06 -1.675 0.093970 .
## payAmt5 -2.383e-06 2.168e-06 -1.099 0.271635
## payAmt6 -1.916e-06 1.618e-06 -1.184 0.236521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17216 on 17976 degrees of freedom
## AIC: 17264
##
## Number of Fisher Scoring iterations: 5
# Take a look at the odds ratios: exponentiate the coefficients, then round them
coefsexp <- coef(logitModelFull) %>% exp() %>% round(2)
coefsexp
## (Intercept) limitBal sex education marriage age
## 0.56 1.00 0.92 0.89 0.84 1.00
## pay1 pay2 pay3 pay4 pay5 pay6
## 1.78 1.05 1.08 0.99 1.11 0.98
## billAmt1 billAmt2 billAmt3 billAmt4 billAmt5 billAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
## payAmt1 payAmt2 payAmt3 payAmt4 payAmt5 payAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
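An odds ratio is multiplicative: for example, pay1's odds ratio of 1.78 means a one-unit increase in pay1 multiplies the odds of default by about 1.78, holding the other predictors fixed. The ratios are easier to judge with confidence intervals; a minimal sketch using Wald intervals from base R:
# Wald 95% confidence intervals for the odds ratios
round(exp(confint.default(logitModelFull)), 2)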
# The old (full) model
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family = binomial, defaultData)
The Akaike Information Criterion, or AIC for short, is a method for scoring and selecting a model. The AIC statistic is defined for logistic regression as follows (taken from "The Elements of Statistical Learning"):
\[ AIC = -\frac{2}{N}\,LL + \frac{2k}{N} \]
where N is the number of examples in the training dataset, LL is the log-likelihood of the model on the training dataset, and k is the number of parameters in the model. (R's AIC() reports the unnormalized form \(-2\,LL + 2k\), which ranks models identically.)
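A quick sanity check of this formula against the fitted model (a sketch using base R's logLik(); its degrees-of-freedom attribute holds k):
# AIC by hand: -2*LL + 2*k
ll <- logLik(logitModelFull)
-2 * as.numeric(ll) + 2 * attr(ll, "df") # should match AIC(logitModelFull), i.e. 17264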
To use AIC for model selection, we simply choose the model with the smallest AIC over the set of models considered. In stepAIC(), the direction argument sets the mode of stepwise search and can be one of "both", "backward", or "forward", with a default of "both". If the scope argument is missing, the default for direction is "backward".
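The call below relies on those defaults; written out explicitly it would be equivalent to:
# logitModelNew <- stepAIC(logitModelFull, direction = "backward", trace = 0)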
#Build the new model
logitModelNew <- stepAIC(logitModelFull,trace=0)
#Look at the model
summary(logitModelNew)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4, family = binomial,
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0839 -0.7119 -0.5611 -0.2839 4.1800
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.699e-01 1.504e-01 -3.790 0.000151 ***
## limitBal -5.201e-07 1.954e-07 -2.661 0.007791 **
## sex -8.206e-02 3.878e-02 -2.116 0.034338 *
## education -1.212e-01 2.744e-02 -4.418 9.96e-06 ***
## marriage -1.724e-01 4.014e-02 -4.295 1.75e-05 ***
## age 4.863e-03 2.256e-03 2.156 0.031092 *
## pay1 5.740e-01 2.218e-02 25.882 < 2e-16 ***
## pay2 4.979e-02 2.552e-02 1.951 0.051048 .
## pay3 7.197e-02 2.573e-02 2.798 0.005146 **
## pay5 8.859e-02 2.249e-02 3.938 8.20e-05 ***
## billAmt1 -8.130e-06 1.580e-06 -5.144 2.69e-07 ***
## billAmt2 5.238e-06 1.775e-06 2.951 0.003165 **
## billAmt5 1.790e-06 8.782e-07 2.038 0.041554 *
## payAmt1 -1.931e-05 3.258e-06 -5.928 3.06e-09 ***
## payAmt2 -6.572e-06 2.092e-06 -3.142 0.001681 **
## payAmt3 -3.693e-06 2.187e-06 -1.689 0.091241 .
## payAmt4 -4.611e-06 2.062e-06 -2.237 0.025306 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17220 on 17983 degrees of freedom
## AIC: 17254
##
## Number of Fisher Scoring iterations: 5
# Save the formula of the new model (it will be needed for the out-of-sample part)
formulaLogit <- as.formula(summary(logitModelNew)$call)
formulaLogit
## PaymentDefault ~ limitBal + sex + education + marriage + age +
## pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4
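Base R also offers a more direct way to recover the same formula object:
# Equivalent one-liner:
# formula(logitModelNew)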
Like the R-squared statistic in simple linear regression, logistic regression has pseudo-R-squared measures, such as McFadden's and Nagelkerke's, to check the goodness of fit.
### Interpretation: the fit is good if the pseudo-R-squared is above 0.4 and reasonable above 0.2
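McFadden's pseudo-R-squared can be computed straight from the fitted model's deviances (a minimal sketch: one minus the ratio of residual to null deviance):
# McFadden's pseudo R-squared = 1 - (residual deviance / null deviance)
1 - logitModelNew$deviance / logitModelNew$null.deviance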
### Accuracy (all correct / all) = (TP + TN) / (TP + TN + FP + FN)
# Calculate the accuracy for the reduced model, i.e. 'logitModelNew'
# First we make predictions
defaultData$predNew <- predict(logitModelNew, type = "response", na.action = na.exclude)
# Classify with a 0.5 threshold and compare against the observed outcomes
predict_fact <- as.factor(as.numeric(defaultData$predNew > 0.5))
reference_factor <- as.factor(defaultData$PaymentDefault)
confMatrixModelNew <- confusionMatrix(data = predict_fact, reference = reference_factor)
confMatrixModelNew
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13443 3152
## 1 407 998
##
## Accuracy : 0.8023
## 95% CI : (0.7964, 0.8081)
## No Information Rate : 0.7694
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2747
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9706
## Specificity : 0.2405
## Pos Pred Value : 0.8101
## Neg Pred Value : 0.7103
## Prevalence : 0.7694
## Detection Rate : 0.7468
## Detection Prevalence : 0.9219
## Balanced Accuracy : 0.6055
##
## 'Positive' Class : 0
##
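As a sanity check, the reported accuracy follows directly from the table: correct predictions over all predictions.
# (TN + TP) / total
(13443 + 998) / 18000 # 0.8023, matching the Accuracy above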
# Calculate the accuracy for the full model, i.e. 'logitModelFull'
# First we make predictions
defaultData$predFull <- predict(logitModelFull, type = "response", na.action = na.exclude)
fact <- as.factor(as.numeric(defaultData$predFull > 0.5))
dataf <- as.factor(defaultData$PaymentDefault)
confMatrixModelFull <- confusionMatrix(data = fact, reference = dataf)
confMatrixModelFull
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13441 3154
## 1 409 996
##
## Accuracy : 0.8021
## 95% CI : (0.7962, 0.8079)
## No Information Rate : 0.7694
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2739
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9705
## Specificity : 0.2400
## Pos Pred Value : 0.8099
## Neg Pred Value : 0.7089
## Prevalence : 0.7694
## Detection Rate : 0.7467
## Detection Prevalence : 0.9219
## Balanced Accuracy : 0.6052
##
## 'Positive' Class : 0
##
confMatrixModelNew[2]
## $table
## Reference
## Prediction 0 1
## 0 13443 3152
## 1 407 998
confMatrixModelNew$table["1","1"]
## [1] 998
The confusion matrix is:

|             | Reference 0 | Reference 1 |
|-------------|------------:|------------:|
| Predicted 0 |       13443 |        3152 |
| Predicted 1 |         407 |         998 |

Here the (0, 1) in the column names are the true values and the (0, 1) in the rows are the predicted values. Taking 1 (default) as the positive class:
#### TP = 998
#### FP = 407
# Calculate the payoff at the 0.5 threshold, assuming each correctly flagged
# defaulter (TP) is worth 1000 and each wrongly flagged customer (FP) costs 250
payoff <- 1000*confMatrixModelNew$table["1","1"] - 250*confMatrixModelNew$table["1","0"]
payoff
## [1] 896250
# Make a data frame of candidate threshold values
thresh <- c(0.1, 0.2, 0.3, 0.4, 0.5)
payoffFrame <- data.frame(Threshold = thresh)
# The reference labels do not depend on the threshold; only the predictions do
reference_factor <- as.factor(defaultData$PaymentDefault)
for (i in 1:5) {
  pred <- as.factor(as.numeric(defaultData$predNew > thresh[i]))
  confMatrix <- confusionMatrix(data = pred, reference = reference_factor)
  payoffFrame$payoff[i] <- 1000 * confMatrix$table["1", "1"] - 250 * confMatrix$table["1", "0"]
}
payoffFrame
## Threshold payoff
## 1 0.1 994250
## 2 0.2 1440250
## 3 0.3 1575500
## 4 0.4 1328750
## 5 0.5 896250
# The payoff peaks at a threshold of 0.3, so 0.3 is used for the out-of-sample check below
set.seed(1) # for a reproducible split (added here; any seed works)
# Randomly assign roughly 66% of the rows to the training set
defaultData$isTrain <- rbinom(nrow(defaultData), 1, 0.66)
train <- subset(defaultData, isTrain == 1) # note '==', not '='
test <- subset(defaultData, isTrain == 0)
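Since caret is already loaded, a stratified alternative is createDataPartition(), which keeps the class proportions similar in both sets (shown as a sketch only; the rbinom() split above is what the rest of this section uses):
# idx <- createDataPartition(factor(defaultData$PaymentDefault), p = 0.66, list = FALSE)
# train <- defaultData[idx, ]
# test <- defaultData[-idx, ]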
logitTrainNew2 <- glm(formulaLogit, family = binomial, data = train) # Modeling
test$predNew <- predict(logitTrainNew2, type = "response", newdata = test) # Predictions
predict_fact2 <- as.factor(as.numeric(test$predNew > 0.3))
reference_factor2 <- as.factor(test$PaymentDefault) # compare against the test set, not the full data
confMatrixModelNew2<-confusionMatrix(data =predict_fact2, reference =reference_factor2)
confMatrixModelNew2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12180 2157
## 1 1670 1993
##
## Accuracy : 0.7874
## 95% CI : (0.7813, 0.7933)
## No Information Rate : 0.7694
## P-Value [Acc > NIR] : 4.221e-09
##
## Kappa : 0.3751
##
## Mcnemar's Test P-Value : 3.963e-15
##
## Sensitivity : 0.8794
## Specificity : 0.4802
## Pos Pred Value : 0.8496
## Neg Pred Value : 0.5441
## Prevalence : 0.7694
## Detection Rate : 0.6767
## Detection Prevalence : 0.7965
## Balanced Accuracy : 0.6798
##
## 'Positive' Class : 0
##
Cross-validation is a clever method to avoid overfitting, as you will see. In this exercise we calculate the cross-validated accuracy.
### We already have the logistic regression model logitModelNew
### We can test the accuracy of this model using 6-fold cross-validation, keeping the threshold at 0.3
confMatrixModelNew$overall[['Accuracy']]
## [1] 0.8022778
# Accuracy cost function for cv.glm: cv.glm calls cost(observed responses, predicted responses)
costa <- function(actual = defaultData$PaymentDefault, prediction = defaultData$predNew, threshold = 0.3) {
  predict_fact <- as.factor(as.numeric(prediction > threshold)) # classify the held-out predictions
  reference_factor <- as.factor(as.numeric(actual > threshold)) # the 0/1 outcome is unchanged by thresholding
  confMatrix <- confusionMatrix(data = predict_fact, reference = reference_factor)
  accuracy <- confMatrix$overall[['Accuracy']]
  return(accuracy)
}
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
cv.glm(data=defaultData,glmfit=logitModelNew,cost=costa,K=6)$delta[1]
## [1] 0.7850556
The accuracy obtained by 6-fold cross-validation is about 78.5%, close to the in-sample accuracy of the reduced model (about 80%), so the model does not appear to be overfitting.
References: "How do I interpret odds ratios in logistic regression?", UCLA IDRE Statistical Consulting, https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/