Example

We use logistic regression to find a good predictive model on the credit data to predict whether credit applicants are good credit risks or not

Credit data source: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)

1. Data Exploratory

In the credit data, we have 13 categorical and 7 numeric variables. Interpretation of the variables can be found the link given above.

# Load libraries and data
library(readxl)
library(corrplot)

## corrplot 0.84 loaded

credit <- read_xlsx("germancredit.xlsx" , col_names = T)
str(credit)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  21 variables:
##  $ A1 : chr  "A11" "A12" "A14" "A11" ...
##  $ A2 : num  6 48 12 42 24 36 24 36 12 30 ...
##  $ A3 : chr  "A34" "A32" "A34" "A32" ...
##  $ A4 : chr  "A43" "A43" "A46" "A42" ...
##  $ A5 : num  1169 5951 2096 7882 4870 ...
##  $ A6 : chr  "A65" "A61" "A61" "A61" ...
##  $ A7 : chr  "A75" "A73" "A74" "A74" ...
##  $ A8 : num  4 2 2 2 3 2 3 2 2 4 ...
##  $ A9 : chr  "A93" "A92" "A93" "A93" ...
##  $ A10: chr  "A101" "A101" "A101" "A103" ...
##  $ A11: num  4 2 3 4 4 4 4 2 4 2 ...
##  $ A12: chr  "A121" "A121" "A121" "A122" ...
##  $ A13: num  67 22 49 45 53 35 53 35 61 28 ...
##  $ A14: chr  "A143" "A143" "A143" "A143" ...
##  $ A15: chr  "A152" "A152" "A152" "A153" ...
##  $ A16: num  2 1 1 1 2 1 1 1 1 2 ...
##  $ A17: chr  "A173" "A173" "A172" "A173" ...
##  $ A18: num  1 1 2 2 2 2 1 1 1 1 ...
##  $ A19: chr  "A192" "A191" "A191" "A191" ...
##  $ A20: chr  "A201" "A201" "A201" "A201" ...
##  $ R1 : num  1 2 1 1 2 1 1 1 1 2 ...

# Change Response from "2= bad, 1 = Good"  to "1=Bad, 0= Good""
credit$R1 <- with(credit,ifelse(R1 == 2, 1,
                                ifelse(R1 == 1, 0, "NA")))
# Change columns to factor
credit[,c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)] <- lapply(credit[,c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)], as.factor)

Using a correlation plot, we can study the variables correlation. There seems little correlation among the variables except A2 and A5 variable.

# Check for correlation between numeric variables
corrplot(cor(credit[,c(2,5,8,11,13,16,18)]))

cor((credit[,c(2,5,8,11,13,16,18)]))

##              A2          A5          A8        A11         A13         A16
## A2   1.00000000  0.62498420  0.07474882 0.03406720 -0.03613637 -0.01128360
## A5   0.62498420  1.00000000 -0.27131570 0.02892632  0.03271642  0.02079455
## A8   0.07474882 -0.27131570  1.00000000 0.04930237  0.05826568  0.02166874
## A11  0.03406720  0.02892632  0.04930237 1.00000000  0.26641918  0.08962523
## A13 -0.03613637  0.03271642  0.05826568 0.26641918  1.00000000  0.14925358
## A16 -0.01128360  0.02079455  0.02166874 0.08962523  0.14925358  1.00000000
## A18 -0.02383448  0.01714215 -0.07120694 0.04264343  0.11820083  0.10966670
##             A18
## A2  -0.02383448
## A5   0.01714215
## A8  -0.07120694
## A11  0.04264343
## A13  0.11820083
## A16  0.10966670
## A18  1.00000000

3. Model the dataset and interpret output

Before modelling the data, I did a 80/20 ratio split so I got a train set (80%) and a test set (20%).

# Split data to Train/Validation (80%) and Test set (20%)
library(caTools)
set.seed(11)
split <- sample.split(credit$R1, SplitRatio = 0.8)
credit_train <- subset(credit, split == TRUE)
credit_test <- subset(credit, split == FALSE)

# Fit all variables in train dataset to logistic regression
fit1 <- glm(R1~., data = credit_train, family = binomial(link="logit"))

In the model output is given below.

# Results of fitted model
summary(fit1)

## 
## Call:
## glm(formula = R1 ~ ., family = binomial(link = "logit"), data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5012  -0.6785  -0.3532   0.6234   2.7116  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.784e+00  1.264e+00   1.412 0.158046    
## A1A12       -5.116e-01  2.515e-01  -2.034 0.041952 *  
## A1A13       -1.275e+00  4.330e-01  -2.944 0.003240 ** 
## A1A14       -1.774e+00  2.666e-01  -6.655 2.83e-11 ***
## A2           3.117e-02  1.108e-02   2.813 0.004901 ** 
## A3A31       -2.878e-01  6.581e-01  -0.437 0.661926    
## A3A32       -1.045e+00  5.304e-01  -1.971 0.048759 *  
## A3A33       -1.206e+00  5.666e-01  -2.129 0.033251 *  
## A3A34       -1.986e+00  5.359e-01  -3.705 0.000211 ***
## A4A41       -1.926e+00  4.591e-01  -4.195 2.73e-05 ***
## A4A410      -2.141e+00  9.316e-01  -2.298 0.021537 *  
## A4A42       -8.416e-01  2.928e-01  -2.874 0.004054 ** 
## A4A43       -9.894e-01  2.773e-01  -3.568 0.000360 ***
## A4A44       -2.695e-01  9.058e-01  -0.298 0.766074    
## A4A45       -2.690e-01  7.812e-01  -0.344 0.730546    
## A4A46       -2.344e-01  4.631e-01  -0.506 0.612733    
## A4A48       -1.586e+01  4.676e+02  -0.034 0.972949    
## A4A49       -8.618e-01  3.803e-01  -2.266 0.023429 *  
## A5           1.180e-04  5.115e-05   2.308 0.021015 *  
## A6A62       -3.814e-01  3.257e-01  -1.171 0.241551    
## A6A63       -1.019e-01  4.344e-01  -0.235 0.814576    
## A6A64       -1.389e+00  6.291e-01  -2.208 0.027217 *  
## A6A65       -7.095e-01  2.907e-01  -2.441 0.014666 *  
## A7A72       -3.995e-01  4.934e-01  -0.810 0.418100    
## A7A73       -3.106e-01  4.779e-01  -0.650 0.515735    
## A7A74       -1.202e+00  5.148e-01  -2.335 0.019570 *  
## A7A75       -4.146e-01  4.709e-01  -0.881 0.378584    
## A8           4.340e-01  1.026e-01   4.229 2.34e-05 ***
## A9A92       -5.221e-01  4.387e-01  -1.190 0.234079    
## A9A93       -1.294e+00  4.369e-01  -2.961 0.003062 ** 
## A9A94       -9.273e-01  5.212e-01  -1.779 0.075230 .  
## A10A102      6.121e-01  4.920e-01   1.244 0.213430    
## A10A103     -8.593e-01  4.988e-01  -1.723 0.084960 .  
## A11          4.331e-02  9.926e-02   0.436 0.662577    
## A12A122      2.841e-01  2.902e-01   0.979 0.327487    
## A12A123      3.347e-01  2.700e-01   1.239 0.215237    
## A12A124      9.550e-01  4.981e-01   1.917 0.055183 .  
## A13         -1.273e-02  1.080e-02  -1.179 0.238575    
## A14A142     -2.179e-01  4.641e-01  -0.469 0.638781    
## A14A143     -8.163e-01  2.757e-01  -2.961 0.003066 ** 
## A15A152     -2.151e-01  2.787e-01  -0.772 0.440309    
## A15A153     -8.587e-01  5.682e-01  -1.511 0.130762    
## A16          3.917e-01  2.159e-01   1.814 0.069636 .  
## A17A172     -2.100e-01  7.613e-01  -0.276 0.782680    
## A17A173     -3.219e-01  7.348e-01  -0.438 0.661305    
## A17A174     -3.566e-01  7.382e-01  -0.483 0.629053    
## A18          1.129e-01  2.945e-01   0.384 0.701316    
## A19A192     -3.226e-01  2.336e-01  -1.381 0.167419    
## A20A202     -1.225e+00  6.679e-01  -1.834 0.066709 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 691.83  on 751  degrees of freedom
## AIC: 789.83
## 
## Number of Fisher Scoring iterations: 14

There are predictors with p-value less than the 0.05 significance level.
For example, predictors A1 and A2 which we can reject null hypothesis and conclude that there is an association between these predictors and probabilty of classifying customer as “bad”.

Looking at the coefficients of the variables, we can make some interpretations. For example,

The negative coefficient of A4A2 and A4A3 implies that customers who borrowed for furniture and radio purchases were less likely to be a bad customer keeping the other predictors constant.
Coefficient of A2 (0.0311) implies that a unit increase in loan duration (months), increases the log odds of classifying it as “bad customer” by 0.0311 keeping the other predictors constant.

4. Predict with test set

Next, let’s make a prediction with the test set using a threshold of 0.5.

# Make prediction on test set and see confusion matrix
yhat_test <- predict(fit1, credit_test, type = "response")
cm <- table( credit_test$R1,yhat_test > 0.5) 
cm

##    
##     FALSE TRUE
##   0   113   27
##   1    33   27

We can also check on the sensitivity, specificity and accuracy of the model.

# Sensitivity (TP/TP+FN)
cm[4]/(cm[3]+cm[2])

## [1] 0.45

# Specificity (TN/TN+FP)
cm[1]/(cm[1]+cm[3])

## [1] 0.8071429

# Accuracy (TP+TN)/(no.of dataset)
(cm[1]+cm[4])/nrow(credit_test)

## [1] 0.7

Even though there is an accuracy of 70% on the test data, we see that the sensitivity is rather low at 0.45 which means only 45% of the “bad customers” are correctly classified.

5. Fit model with significant variable.

I decided to run another logistic regression model with selected variables that are found signifcant, predict with test set.

# fit model again with variables found significant

fit2 <- glm(R1~ A1+A2+A3+A4+A5+A6+A7+A8+A9+A14, data = credit_train, family = binomial(link = "logit"))
summary(fit2)

## 
## Call:
## glm(formula = R1 ~ A1 + A2 + A3 + A4 + A5 + A6 + A7 + A8 + A9 + 
##     A14, family = binomial(link = "logit"), data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3134  -0.7002  -0.3864   0.6773   2.6938  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.643e+00  8.306e-01   1.979 0.047871 *  
## A1A12       -5.631e-01  2.400e-01  -2.347 0.018942 *  
## A1A13       -1.287e+00  4.179e-01  -3.081 0.002062 ** 
## A1A14       -1.783e+00  2.585e-01  -6.896 5.34e-12 ***
## A2           3.341e-02  1.044e-02   3.200 0.001374 ** 
## A3A31       -5.585e-01  6.222e-01  -0.898 0.369393    
## A3A32       -1.294e+00  4.994e-01  -2.591 0.009560 ** 
## A3A33       -1.231e+00  5.540e-01  -2.223 0.026239 *  
## A3A34       -1.987e+00  5.196e-01  -3.824 0.000131 ***
## A4A41       -1.947e+00  4.492e-01  -4.334 1.46e-05 ***
## A4A410      -2.469e+00  9.762e-01  -2.529 0.011425 *  
## A4A42       -7.026e-01  2.765e-01  -2.541 0.011056 *  
## A4A43       -9.604e-01  2.652e-01  -3.622 0.000292 ***
## A4A44       -3.495e-01  8.938e-01  -0.391 0.695787    
## A4A45       -1.515e-01  7.159e-01  -0.212 0.832369    
## A4A46       -9.065e-02  4.456e-01  -0.203 0.838810    
## A4A48       -1.564e+01  4.624e+02  -0.034 0.973021    
## A4A49       -7.954e-01  3.653e-01  -2.178 0.029430 *  
## A5           1.091e-04  4.725e-05   2.310 0.020912 *  
## A6A62       -1.495e-01  3.056e-01  -0.489 0.624810    
## A6A63       -7.628e-02  4.302e-01  -0.177 0.859256    
## A6A64       -1.294e+00  6.029e-01  -2.146 0.031867 *  
## A6A65       -6.665e-01  2.799e-01  -2.381 0.017244 *  
## A7A72       -3.554e-01  4.321e-01  -0.822 0.410847    
## A7A73       -3.462e-01  4.042e-01  -0.856 0.391750    
## A7A74       -1.194e+00  4.505e-01  -2.650 0.008048 ** 
## A7A75       -4.391e-01  4.178e-01  -1.051 0.293230    
## A8           4.264e-01  9.704e-02   4.394 1.11e-05 ***
## A9A92       -2.930e-01  4.210e-01  -0.696 0.486455    
## A9A93       -1.108e+00  4.156e-01  -2.666 0.007678 ** 
## A9A94       -8.419e-01  5.043e-01  -1.669 0.095056 .  
## A14A142     -2.017e-01  4.530e-01  -0.445 0.656088    
## A14A143     -7.799e-01  2.671e-01  -2.920 0.003505 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 715.98  on 767  degrees of freedom
## AIC: 781.98
## 
## Number of Fisher Scoring iterations: 14

anova(fit1, fit2, test = "LRT")

In the new model, AIC has dropped from 789 to 781 but there is no change in accuracy.

# Make prediction on test set
yhat_test <- predict(fit2, credit_test, type = "response")
cm <- table( credit_test$R1,yhat_test > 0.5) 
cm

##    
##     FALSE TRUE
##   0   113   27
##   1    33   27

# Sensitivity (TP/TP+FN)
cm[4]/(cm[3]+cm[2])

## [1] 0.45

# Specificity (TN/TN+FP)
cm[1]/(cm[1]+cm[3])

## [1] 0.8071429

# Accuracy (TP+TN)/(no.of dataset)
(cm[1]+cm[4])/nrow(credit_test)

## [1] 0.7

Using ROC to determine the quality of fit, the area under curve (AUC) is 62.8%. This means that there is 62.8% chance that model is able to classify positive/True (1) and negative/False class (0) correctly.

# Develop ROC curve to determine the quality of fit

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc <- roc(credit_test$R1, as.integer(yhat_test >0.5))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc,main="ROC curve")

roc

## 
## Call:
## roc.default(response = credit_test$R1, predictor = as.integer(yhat_test >     0.5))
## 
## Data: as.integer(yhat_test > 0.5) in 140 controls (credit_test$R1 0) < 60 cases (credit_test$R1 1).
## Area under the curve: 0.6286

6. Try out a range of Threshold

Hence, I decided to try out a range of the threshold to check on the accuracy and AUC of model because 0.5 threshold may not be right.
If we are looking to achieve the highest accuracy then a threshold of 0.7 is good while for highest AUC, the threshold is 0.1 to 0.3.

outputLG <- data.frame(
  threshold = seq(0.1,0.9,0.1),
  accuracy = rep(0,9),
  auc = rep(0,9))


for (i in 1:nrow(outputLG)) {
  yhat_int <- as.integer(yhat_test > outputLG$threshold[i])
  cm <- table(credit_test$R1,yhat_int)
  outputLG$accuracy[i] <- sum(diag(cm))/nrow(credit_test)
  r<-roc(credit_test$R1,yhat_int)
  outputLG$auc[i] <- as.numeric(r$auc)
}

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

outputLG

#sensitivity (TP/TP+FN)
cm[4]/(cm[3]+cm[4])

## [1] 1

#specificity (TN/TN+FP)
cm[1]/(cm[1]+cm[2])

## [1] 0.7035176

#accuracy (TP+TN)/(no.of dataset)
(cm[1]+cm[4])/nrow(credit)

## [1] 0.141

7. Determine a threshold with cost involved.

Assuming incorrectly identifying a bad customer as good, is 5 times worse than incorrectly classifying a good customer as bad, we shall determine a good threshold probability.

I created a range of threshold from 0.01 to 0.99.
Following, I let the cost of misclassifying a good customer as bad to be 1 and cost of misclassfying a bad customer as 5, i.e. 5 times of the number of misclassifying a bad customer as good (c2) to get the “cost” of FN misclassification.

# The cost of incorrectly classfying a "bad" customer is 5 times the loss of incorrectly classifying a "good" customer. 
# Calulating loss for the value of thresholds ranging from 0.01 to 1.

cost <- c()
threshold <- seq(0.01,0.99, 0.01)
for(i in 1:length(threshold)){
  yhat_int <- as.integer(yhat_test > threshold[i]) # calculate threshold predictions
  cm <-as.matrix(table(credit_test$R1,yhat_int))
  c1 <- ifelse((ncol(cm)>1), cm[1,2],  0 ) #c1 is misclassify good customers as bad customer 
  c2 <- ifelse((nrow(cm)>1), cm[2,1],  0 ) #c2 is misclassify bad customers as good customer 
  cost <- c(cost, c2*5 + c1)
}

plot(threshold,cost,xlab = "Threshold",ylab = "Cost",main = "Cost vs Threshold")

Hence, we see the threshold with the lowest “cost” is 0.1. In that case, it misclassified 4 bad customer as good and forgo 87 good customers as they are classified as bad.

# Lowest cost
# Which is threshold gives the lowest cost
min(cost)

## [1] 107

threshold[which.min(cost)]

## [1] 0.1

yhat_int <- as.integer(yhat_test > threshold[10]) # calculate threshold predictions
table(credit_test$R1,yhat_int)

##    yhat_int
##      0  1
##   0 53 87
##   1  4 56

Classifiction with Logistic Regression on Credit data

Brenda Koo

9 October 2019

Logistic Regression Overview