German Credit

Logistic Regression

Exploratory Analysis

Exploratory Analysis

Logit

rows <- sample(nrow(german_credit))
german_credit_randomized <- german_credit[rows, ]

split_german_credit <- round(nrow(german_credit_randomized)*0.75)
train_german_credit <- german_credit_randomized[1:split_german_credit, ]
test_german_credit <- german_credit_randomized[(split_german_credit + 1):nrow(german_credit_randomized), ]

For logit model, AIC came out to be 791.727. For probit model, AIC came out to be 790.26. And, for cloglog model, AIC came out to be 789.07. The significant parameters are similar for all the three link functions.

For logit model, BIC came out to be 1017.82 For probit model, AIC came out to be 1016.63 And, for cloglog model, AIC came out to be 1015.46 The significant parameters are similar for all the three link functions.

Logit Model
null_log <- glm(formula = response~1, family = "binomial", data = train_german_credit)
full_log <- glm(formula = response ~ . , family = "binomial", 
                data = train_german_credit)

summary(full_log)
## 
## Call:
## glm(formula = response ~ ., family = "binomial", data = train_german_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0764  -0.7138  -0.3931   0.7203   2.4910  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -7.711e-01  1.233e+00  -0.625  0.53180    
## chk_acctA12       -4.136e-01  2.548e-01  -1.623  0.10459    
## chk_acctA13       -9.493e-01  4.092e-01  -2.320  0.02035 *  
## chk_acctA14       -1.695e+00  2.626e-01  -6.454 1.09e-10 ***
## duration           2.576e-02  1.102e-02   2.338  0.01939 *  
## credit_hisA31      4.822e-01  6.214e-01   0.776  0.43771    
## credit_hisA32     -4.181e-01  4.735e-01  -0.883  0.37721    
## credit_hisA33     -1.099e+00  5.431e-01  -2.024  0.04298 *  
## credit_hisA34     -1.440e+00  4.831e-01  -2.980  0.00288 ** 
## purposeA41        -2.021e+00  4.548e-01  -4.444 8.82e-06 ***
## purposeA410       -1.912e+00  1.011e+00  -1.891  0.05859 .  
## purposeA42        -7.281e-01  2.949e-01  -2.469  0.01353 *  
## purposeA43        -6.425e-01  2.774e-01  -2.316  0.02053 *  
## purposeA44        -7.444e-02  8.880e-01  -0.084  0.93319    
## purposeA45        -6.122e-02  6.009e-01  -0.102  0.91886    
## purposeA46         3.119e-01  4.478e-01   0.696  0.48615    
## purposeA48        -2.125e+00  1.224e+00  -1.736  0.08264 .  
## purposeA49        -6.586e-01  3.869e-01  -1.702  0.08875 .  
## amount             1.141e-04  5.274e-05   2.163  0.03050 *  
## saving_acctA62     1.497e-02  3.298e-01   0.045  0.96380    
## saving_acctA63    -2.506e-01  4.375e-01  -0.573  0.56671    
## saving_acctA64    -7.828e-01  5.740e-01  -1.364  0.17266    
## saving_acctA65    -5.563e-01  2.973e-01  -1.871  0.06134 .  
## present_empA72    -3.733e-01  5.062e-01  -0.737  0.46089    
## present_empA73    -3.436e-01  4.819e-01  -0.713  0.47578    
## present_empA74    -1.129e+00  5.255e-01  -2.147  0.03176 *  
## present_empA75    -6.814e-01  5.007e-01  -1.361  0.17356    
## installment_rate   2.908e-01  9.883e-02   2.942  0.00326 ** 
## sexA92            -1.267e-01  4.368e-01  -0.290  0.77182    
## sexA93            -6.055e-01  4.342e-01  -1.394  0.16319    
## sexA94             2.986e-02  5.096e-01   0.059  0.95328    
## other_debtorA102   8.926e-01  4.749e-01   1.880  0.06017 .  
## other_debtorA103  -1.148e+00  5.004e-01  -2.294  0.02180 *  
## present_resid     -2.243e-03  9.911e-02  -0.023  0.98194    
## propertyA122       4.166e-01  2.878e-01   1.447  0.14784    
## propertyA123       3.315e-01  2.706e-01   1.225  0.22049    
## propertyA124       5.784e-01  4.672e-01   1.238  0.21567    
## age               -3.334e-03  1.020e-02  -0.327  0.74387    
## other_installA142  8.858e-02  4.813e-01   0.184  0.85398    
## other_installA143 -5.101e-01  2.770e-01  -1.842  0.06553 .  
## housingA152       -4.642e-01  2.744e-01  -1.692  0.09070 .  
## housingA153       -4.187e-01  5.361e-01  -0.781  0.43485    
## n_credits          4.202e-01  2.145e-01   1.959  0.05010 .  
## jobA172            7.151e-01  7.902e-01   0.905  0.36546    
## jobA173            8.205e-01  7.662e-01   1.071  0.28427    
## jobA174            8.639e-01  7.846e-01   1.101  0.27089    
## n_people           4.322e-01  2.843e-01   1.520  0.12840    
## telephoneA192     -3.449e-01  2.258e-01  -1.528  0.12658    
## foreignA202       -1.214e+00  6.325e-01  -1.920  0.05486 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 932.61  on 749  degrees of freedom
## Residual deviance: 693.44  on 701  degrees of freedom
## AIC: 791.44
## 
## Number of Fisher Scoring iterations: 5
AIC(full_log)
## [1] 791.4415
BIC(full_log)
## [1] 1017.825

Insample Prediction

Stepwise Selection and Insample Prediction
Stepwise Selection
stepwise_AICmodel <- step(full_log,direction = "both")

stepwise_BICmodel <- step(full_log,direction = "both", k = log(nrow(train_german_credit)))

Variables selected after applying Stepwise AIC are chk_acct, duration, credit_his, purpose, amount, present_emp, installment_rate, sex, other_debtor, other_install, n_credits, n_people and foreign

AIC is 778.2989039

Variables selected after applying Stepwise AIC are chk_acct and duration

BIC is 852.5304001

LASSO Selection
dummy <- model.matrix(~., data = german_credit) #converting data to numeric matrix
dummy <- dummy[,-1] 
nrow(german_credit)
## [1] 1000
rows_dummy <- sample(nrow(dummy))
dummy_randomized <- dummy[rows_dummy, ]

split_dummy <- round(nrow(dummy_randomized)*0.75)
train_dummy <- dummy_randomized[1:split_dummy, ]
test_dummy <- dummy_randomized[(split_dummy + 1):nrow(dummy_randomized), ]


credit_lasso <- cv.glmnet(x = as.matrix(train_dummy[,-49]), y = train_dummy[,49], 
                          family = "binomial", type.measure = "class", alpha = 1)




plot(credit_lasso)

coef(credit_lasso, credit_lasso$lambda.min)
## 49 x 1 sparse Matrix of class "dgCMatrix"
##                               1
## (Intercept)       -0.5613264252
## chk_acctA12       -0.1420174360
## chk_acctA13       -0.6622696525
## chk_acctA14       -1.4872929361
## duration           0.0206773121
## credit_hisA31      0.5067554283
## credit_hisA32     -0.1705706879
## credit_hisA33     -0.3736114895
## credit_hisA34     -1.2091923212
## purposeA41        -1.0111826105
## purposeA410       -0.8213943124
## purposeA42        -0.4842731717
## purposeA43        -0.6815762862
## purposeA44         0.0020436481
## purposeA45         .           
## purposeA46         0.0819945789
## purposeA48        -1.0366689351
## purposeA49        -0.2324811942
## amount             0.0001184591
## saving_acctA62    -0.5223704470
## saving_acctA63    -0.6220660805
## saving_acctA64    -1.0229609816
## saving_acctA65    -0.7526951004
## present_empA72     0.2154133169
## present_empA73     .           
## present_empA74    -0.3740548135
## present_empA75     .           
## installment_rate   0.2376622322
## sexA92             .           
## sexA93            -0.4153892160
## sexA94             0.0078418519
## other_debtorA102   0.3058138572
## other_debtorA103  -0.8905784569
## present_resid      .           
## propertyA122       0.1741980172
## propertyA123       0.0828950145
## propertyA124       0.4542421897
## age               -0.0103108439
## other_installA142  .           
## other_installA143 -0.0786072726
## housingA152       -0.2152687022
## housingA153       -0.3380414897
## n_credits          0.2165851270
## jobA172            .           
## jobA173            0.0941302125
## jobA174            .           
## n_people           0.1511730774
## telephoneA192     -0.1738092064
## foreignA202       -0.7650523339

Variables selected for in sample prediction are based on Stepwise AIC model.

In Sample Selection

The logistic model has been fitted based on the variable selected from the above the step, The ROC curve is plotted and the area under the curve was found to be 0.8182. The misclassification rate (false negative rate) is calculated to be 0.221.

best_insample <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor +  other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)

insample_pred <- predict(best_insample, 
                      type = "response")


insample_ROC <- roc(train_german_credit$response,insample_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(insample_ROC, col = "blue")

auc(insample_ROC)
## Area under the curve: 0.8182
insamplecutoff_prob <- coords(insample_ROC,"best",ret = "threshold")
## Warning in coords.roc(insample_ROC, "best", ret = "threshold"): An
## upcoming version of pROC will set the 'transpose' argument to FALSE by
## default. Set transpose = TRUE explicitly to keep the current behavior,
## or transpose = FALSE to adopt the new one and silence this warning. Type
## help(coords_transpose) for additional information.
insample_class <- ifelse(insample_pred > insamplecutoff_prob, 1, 0)
table(insample_class,train_german_credit$response,
      dnn = list("predicted", "actual"))
##          actual
## predicted   0   1
##         0 359  52
##         1 156 183

Out of Sample Prediction

Out of Sample Prediction

The ROC curve is plotted and the area under the curve was found to be 0.138 The misclassification rate (false negative rate) is calculated to be

best_log <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor +  other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)

train_prob <- predict(best_log, type = "response")

german_credit_prob <- predict(best_log, newdata = test_german_credit[,-21], type = "response")

ROC <- roc(test_german_credit$response,german_credit_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC, col = "blue")

auc(ROC)
## Area under the curve: 0.8124
cutoff_prob <- coords(ROC,"best",ret = "threshold")
## Warning in coords.roc(ROC, "best", ret = "threshold"): An upcoming
## version of pROC will set the 'transpose' argument to FALSE by default.
## Set transpose = TRUE explicitly to keep the current behavior, or
## transpose = FALSE to adopt the new one and silence this warning. Type
## help(coords_transpose) for additional information.
german_credit_pred <- ifelse(german_credit_prob > cutoff_prob, 1, 0)
table(german_credit_pred,test_german_credit$response,
      dnn = list("predicted", "actual"))
##          actual
## predicted   0   1
##         0 123   9
##         1  62  56

Cross-Validation

Cross-Validation

Based on the cross validation function, the misclassification rate is found out to be 0.535.

pcut <- optimal.pcut.glm0

costfunc2 <- function(obs, pred.p){
  weight1 <- 5   # define the weight for "true=1 but pred=0" (FN)
  weight0 <- 1    # define the weight for "true=0 but pred=1" (FP)
  c1 <- (obs == 1) & (pred.p < pcut)    # count for "true=1 but pred=0"   (FN)
  c0 <- (obs == 0) & (pred.p >= pcut)   # count for "true=0 but pred=1"   (FP)
  cost <- mean(weight1 * c1 + weight0 * c0)  # misclassification with weight
  return(cost) # you have to return to a value when you write R functions
}

credit_glm1 <- glm(response~. , family = binomial, data = german_credit)
cv.result <-  cv.glm(data = german_credit, glmfit = credit_glm1, cost = costfunc2, K = 4) 
cv.result$delta[2]
## [1] 0.535

Classification Tree

Out of Sample Prediction

Out of Sample Prediction

The misclassification rate (false negative rate) is 0.076 and is higher than that of the logistic regression model.

tree_model <- rpart(formula = response ~ chk_acct + duration + credit_his + purpose +            amount + present_emp + installment_rate + sex + other_debtor +  other_install+            foreign + n_credits + n_people, data = train_german_credit, method = "class", parms = list(loss = matrix(c(0,5,1,0), nrow = 2)))



prp(tree_model, extra = 1)

tree_predict <- predict(tree_model, test_german_credit[,-21], type = "class")


table(test_german_credit$response, tree_predict, dnn = c("Actual","Predicted"))
##       Predicted
## Actual   0   1
##      0  82 103
##      1   5  60

Cross-Validation

Cross-Validation

The misclassification rate is 0.061 and is same as that of the logistic regression model.

credit_rpart <- rpart(formula = response ~ . , data = train_german_credit, method = "class", 
                       parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.0001)



plotcp(credit_rpart)

prp(prune(credit_rpart, cp = 0.012)) # Pruning the tree

credit_rpartbest <- rpart(formula = response ~ . , data = train_german_credit, method = "class", 
                      parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.012)

prp(credit_rpartbest, extra = 1)

credit_testpredict <- predict(credit_rpartbest, test_german_credit[,-21], type = "class")


table(test_german_credit$response, credit_testpredict, dnn = c("Actual","Predicted"))
##       Predicted
## Actual   0   1
##      0  83 102
##      1   4  61