German Credit Risk Data Modeling

Introduction

The German credit scoring data is a dataset provided by Prof. Hogmann in the file german.data. The data set has information of 1000 individuals, on the basis of which they have been classified as risky or not. The variable response in the dataset corresponds to the risk label, 1 has been classified as good and 2 has been classified as bad. In our analysis, we have changed these labels to 0 and 1, 0 corresponding to a good credit record and 1 corresponding to a bad one.

Packages & Input

library(ROCR)
library(ggplot2)
library(tibble)
library(class)
library(dplyr)
library(readxl)
library(tibble)
library(boot)
library(glmnet)
library(MASS)
library(rpart)
library(rpart.plot)

# Input Data

german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")

colnames(german_credit) <- c("chk_acct", "duration", "credit_his", "purpose", 
                             "amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor", 
                             "present_resid", "property", "age", "other_install", "housing", "n_credits", 
                             "job", "n_people", "telephone", "foreign", "response")
german_credit$response <- german_credit$response - 1
german_credit$response <- as.factor(german_credit$response)

# Splitting data into train and test
set.seed(13472138)
index <- sample(nrow(german_credit),nrow(german_credit)*0.70)
german.train = german_credit[index,]
german.test = german_credit[-index,]

Exploratory Data Analysis

# Summary of the data

summary(german_credit)

##  chk_acct     duration    credit_his    purpose        amount      saving_acct
##  A11:274   Min.   : 4.0   A30: 40    A43    :280   Min.   :  250   A61:603    
##  A12:269   1st Qu.:12.0   A31: 49    A40    :234   1st Qu.: 1366   A62:103    
##  A13: 63   Median :18.0   A32:530    A42    :181   Median : 2320   A63: 63    
##  A14:394   Mean   :20.9   A33: 88    A41    :103   Mean   : 3271   A64: 48    
##            3rd Qu.:24.0   A34:293    A49    : 97   3rd Qu.: 3972   A65:183    
##            Max.   :72.0              A46    : 50   Max.   :18424              
##                                      (Other): 55                              
##  present_emp installment_rate  sex      other_debtor present_resid   property  
##  A71: 62     Min.   :1.000    A91: 50   A101:907     Min.   :1.000   A121:282  
##  A72:172     1st Qu.:2.000    A92:310   A102: 41     1st Qu.:2.000   A122:232  
##  A73:339     Median :3.000    A93:548   A103: 52     Median :3.000   A123:332  
##  A74:174     Mean   :2.973    A94: 92                Mean   :2.845   A124:154  
##  A75:253     3rd Qu.:4.000                           3rd Qu.:4.000             
##              Max.   :4.000                           Max.   :4.000             
##                                                                                
##       age        other_install housing      n_credits       job     
##  Min.   :19.00   A141:139      A151:179   Min.   :1.000   A171: 22  
##  1st Qu.:27.00   A142: 47      A152:713   1st Qu.:1.000   A172:200  
##  Median :33.00   A143:814      A153:108   Median :1.000   A173:630  
##  Mean   :35.55                            Mean   :1.407   A174:148  
##  3rd Qu.:42.00                            3rd Qu.:2.000             
##  Max.   :75.00                            Max.   :4.000             
##                                                                     
##     n_people     telephone  foreign    response
##  Min.   :1.000   A191:596   A201:963   0:700   
##  1st Qu.:1.000   A192:404   A202: 37   1:300   
##  Median :1.000                                 
##  Mean   :1.155                                 
##  3rd Qu.:1.000                                 
##  Max.   :2.000                                 
##

# Visualizing distribution of data

ggplot(german_credit, aes(x=response, y=amount, fill=response)) + 
  geom_boxplot()

ggplot(german_credit, aes(x=response, y=duration, fill=response)) + 
  geom_boxplot()

ggplot(german_credit, aes(x=response, y=installment_rate, fill=response)) + 
  geom_boxplot()

ggplot(german_credit, aes(x=response, y=age, fill=response)) + 
  geom_boxplot()

ggplot(german_credit, aes(x=response, y=n_credits, fill=response)) + 
  geom_boxplot()

ggplot(german_credit, aes(x=response, y=n_people, fill=response)) + 
  geom_boxplot()

# Visualization of Categorical Data

ggplot(german_credit, aes(chk_acct, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(credit_his, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(purpose, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(other_debtor, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(sex, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(other_install, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

ggplot(german_credit, aes(foreign, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge")

Logistic Regression

Running Logistic Regression on all variables. Using stepwise AIC to select most important variables

german.glm0<- glm(response~., family=binomial, data=german.train)
summary(german.glm0)

## 
## Call:
## glm(formula = response ~ ., family = binomial, data = german.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6528  -0.6763  -0.3246   0.6780   2.6199  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.860e-01  1.290e+00  -0.454  0.64966    
## chk_acctA12       -3.226e-01  2.676e-01  -1.206  0.22795    
## chk_acctA13       -1.079e+00  4.550e-01  -2.372  0.01768 *  
## chk_acctA14       -1.963e+00  2.967e-01  -6.617 3.67e-11 ***
## duration           3.533e-02  1.213e-02   2.913  0.00358 ** 
## credit_hisA31      9.005e-01  7.062e-01   1.275  0.20228    
## credit_hisA32     -5.349e-02  5.848e-01  -0.091  0.92712    
## credit_hisA33     -4.204e-01  6.421e-01  -0.655  0.51265    
## credit_hisA34     -8.857e-01  5.965e-01  -1.485  0.13761    
## purposeA41        -1.441e+00  4.456e-01  -3.233  0.00122 ** 
## purposeA410        4.492e-02  1.052e+00   0.043  0.96594    
## purposeA42        -5.103e-01  3.127e-01  -1.632  0.10266    
## purposeA43        -8.464e-01  3.042e-01  -2.783  0.00539 ** 
## purposeA44        -3.723e-01  8.555e-01  -0.435  0.66337    
## purposeA45        -3.261e-01  6.654e-01  -0.490  0.62401    
## purposeA46         3.105e-02  4.998e-01   0.062  0.95046    
## purposeA48        -1.973e+00  1.317e+00  -1.498  0.13424    
## purposeA49        -3.633e-01  4.105e-01  -0.885  0.37625    
## amount             1.113e-04  5.657e-05   1.968  0.04912 *  
## saving_acctA62    -3.958e-01  3.455e-01  -1.146  0.25197    
## saving_acctA63    -1.329e-01  4.531e-01  -0.293  0.76933    
## saving_acctA64    -1.686e+00  6.726e-01  -2.506  0.01221 *  
## saving_acctA65    -1.040e+00  3.355e-01  -3.099  0.00194 ** 
## present_empA72    -4.061e-02  5.032e-01  -0.081  0.93568    
## present_empA73    -3.969e-01  4.907e-01  -0.809  0.41860    
## present_empA74    -1.260e+00  5.357e-01  -2.353  0.01863 *  
## present_empA75    -4.144e-01  5.111e-01  -0.811  0.41748    
## installment_rate   3.658e-01  1.118e-01   3.273  0.00106 ** 
## sexA92            -1.018e-01  4.918e-01  -0.207  0.83603    
## sexA93            -8.231e-01  4.931e-01  -1.669  0.09507 .  
## sexA94            -2.326e-01  5.577e-01  -0.417  0.67670    
## other_debtorA102   6.796e-01  5.376e-01   1.264  0.20613    
## other_debtorA103  -1.159e+00  5.059e-01  -2.292  0.02193 *  
## present_resid     -9.671e-02  1.057e-01  -0.915  0.36011    
## propertyA122       2.609e-01  3.038e-01   0.859  0.39047    
## propertyA123       1.866e-01  2.865e-01   0.651  0.51492    
## propertyA124       4.593e-01  5.321e-01   0.863  0.38809    
## age               -5.643e-03  1.158e-02  -0.487  0.62601    
## other_installA142  2.069e-02  5.332e-01   0.039  0.96905    
## other_installA143 -5.441e-01  3.098e-01  -1.756  0.07903 .  
## housingA152       -6.978e-01  2.881e-01  -2.422  0.01543 *  
## housingA153       -3.018e-01  5.923e-01  -0.510  0.61030    
## n_credits          4.074e-01  2.352e-01   1.732  0.08327 .  
## jobA172            6.604e-01  7.594e-01   0.870  0.38444    
## jobA173            7.706e-01  7.275e-01   1.059  0.28948    
## jobA174            7.519e-01  7.533e-01   0.998  0.31824    
## n_people           2.974e-01  3.287e-01   0.905  0.36557    
## telephoneA192     -3.137e-01  2.496e-01  -1.257  0.20877    
## foreignA202       -1.460e+00  7.754e-01  -1.883  0.05971 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 871.48  on 699  degrees of freedom
## Residual deviance: 606.27  on 651  degrees of freedom
## AIC: 704.27
## 
## Number of Fisher Scoring iterations: 5

german.glm1 <- step(german.glm0)

## Start:  AIC=704.27
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - property          3   607.40 699.40
## - job               3   607.50 699.50
## - age               1   606.51 702.51
## - n_people          1   607.08 703.08
## - present_resid     1   607.11 703.11
## - telephone         1   607.86 703.86
## - other_install     2   610.18 704.18
## <none>                  606.27 704.27
## - purpose           9   624.62 704.62
## - n_credits         1   609.29 705.29
## - amount            1   610.16 706.16
## - housing           2   612.22 706.22
## - foreign           1   610.66 706.66
## - sex               3   615.08 707.08
## - other_debtor      2   614.17 708.17
## - present_emp       4   618.79 708.79
## - credit_his        4   619.80 709.80
## - duration          1   614.87 710.87
## - saving_acct       4   622.96 712.96
## - installment_rate  1   617.38 713.38
## - chk_acct          3   663.36 755.36
## 
## Step:  AIC=699.4
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + age + other_install + housing + n_credits + 
##     job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - job               3   608.73 694.73
## - age               1   607.69 697.69
## - present_resid     1   608.18 698.18
## - n_people          1   608.21 698.21
## - telephone         1   608.80 698.80
## <none>                  607.40 699.40
## - other_install     2   611.53 699.53
## - purpose           9   626.00 700.00
## - n_credits         1   610.26 700.26
## - amount            1   611.61 701.61
## - foreign           1   611.80 701.80
## - sex               3   615.83 701.83
## - housing           2   614.97 702.97
## - other_debtor      2   615.67 703.67
## - present_emp       4   620.70 704.70
## - credit_his        4   620.84 704.84
## - duration          1   616.72 706.72
## - saving_acct       4   624.11 708.11
## - installment_rate  1   618.84 708.84
## - chk_acct          3   666.23 752.23
## 
## Step:  AIC=694.73
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + age + other_install + housing + n_credits + 
##     n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - age               1   609.10 693.10
## - n_people          1   609.39 693.39
## - present_resid     1   609.45 693.45
## - telephone         1   609.88 693.88
## - purpose           9   626.63 694.63
## <none>                  608.73 694.73
## - other_install     2   613.04 695.04
## - n_credits         1   611.45 695.45
## - sex               3   616.97 696.97
## - amount            1   613.07 697.07
## - foreign           1   613.16 697.16
## - housing           2   616.64 698.64
## - other_debtor      2   617.13 699.13
## - present_emp       4   621.21 699.21
## - credit_his        4   621.88 699.88
## - duration          1   618.51 702.51
## - saving_acct       4   625.82 703.82
## - installment_rate  1   620.72 704.72
## - chk_acct          3   666.74 746.74
## 
## Step:  AIC=693.1
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + other_install + housing + n_credits + n_people + 
##     telephone + foreign
## 
##                    Df Deviance    AIC
## - n_people          1   609.65 691.65
## - present_resid     1   609.99 691.99
## - telephone         1   610.41 692.41
## - purpose           9   627.05 693.05
## <none>                  609.10 693.10
## - other_install     2   613.34 693.34
## - n_credits         1   611.85 693.85
## - sex               3   617.44 695.44
## - amount            1   613.53 695.53
## - foreign           1   613.65 695.65
## - housing           2   617.38 697.38
## - present_emp       4   621.54 697.54
## - other_debtor      2   617.59 697.59
## - credit_his        4   622.35 698.35
## - duration          1   618.98 700.98
## - saving_acct       4   626.23 702.23
## - installment_rate  1   620.99 702.99
## - chk_acct          3   667.48 745.48
## 
## Step:  AIC=691.65
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + other_install + housing + n_credits + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - present_resid     1   610.48 690.48
## - telephone         1   610.92 690.92
## <none>                  609.65 691.65
## - other_install     2   613.78 691.78
## - purpose           9   627.89 691.89
## - n_credits         1   612.69 692.69
## - sex               3   617.44 693.44
## - foreign           1   614.07 694.07
## - amount            1   614.11 694.11
## - housing           2   617.90 695.90
## - other_debtor      2   617.96 695.96
## - present_emp       4   622.34 696.34
## - credit_his        4   623.96 697.96
## - duration          1   619.26 699.26
## - saving_acct       4   626.64 700.64
## - installment_rate  1   621.21 701.21
## - chk_acct          3   668.08 744.08
## 
## Step:  AIC=690.48
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     other_install + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - telephone         1   611.88 689.88
## <none>                  610.48 690.48
## - other_install     2   614.74 690.74
## - purpose           9   629.19 691.19
## - n_credits         1   613.35 691.35
## - sex               3   618.18 692.18
## - foreign           1   614.77 692.77
## - amount            1   615.08 693.08
## - housing           2   617.91 693.91
## - other_debtor      2   618.89 694.89
## - present_emp       4   623.75 695.75
## - credit_his        4   625.25 697.25
## - duration          1   619.81 697.81
## - installment_rate  1   621.84 699.84
## - saving_acct       4   627.99 699.99
## - chk_acct          3   668.17 742.17
## 
## Step:  AIC=689.88
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     other_install + housing + n_credits + foreign
## 
##                    Df Deviance    AIC
## <none>                  611.88 689.88
## - other_install     2   616.02 690.02
## - purpose           9   630.35 690.35
## - n_credits         1   614.56 690.56
## - amount            1   615.56 691.56
## - sex               3   619.68 691.68
## - foreign           1   615.78 691.78
## - housing           2   619.28 693.28
## - other_debtor      2   620.36 694.36
## - present_emp       4   625.57 695.57
## - credit_his        4   626.57 696.57
## - duration          1   622.02 698.02
## - installment_rate  1   622.68 698.68
## - saving_acct       4   629.08 699.08
## - chk_acct          3   671.93 743.93

Finding the optimal probability cutoff value

# predicting on train set
predict_logit_train <- predict(german.glm1, type="response")

# define a cost function with input "obs" being observed response 
# and "pi" being predicted probability, and "pcut" being the threshold.
costfunc = function(obs, pred.p, pcut){
  weight1 = 5   # define the weight for "true=1 but pred=0" (FN)
  weight0 = 1    # define the weight for "true=0 but pred=1" (FP)
  c1 = (obs==1)&(pred.p<pcut)    # count for "true=1 but pred=0"   (FN)
  c0 = (obs==0)&(pred.p>=pcut)   # count for "true=0 but pred=1"   (FP)
  cost = mean(weight1*c1 + weight0*c0)  # misclassification with weight
  return(cost) # you have to return to a value when you write R functions
} # end of the function

# define a sequence from 0.01 to 1 by 0.01
p.seq = seq(0.01, 1, 0.01) 

# write a loop for all p-cut to see which one provides the smallest cost
# first, need to define a 0 vector in order to save the value of cost from all pcut
cost = rep(0, length(p.seq))  
for(i in 1:length(p.seq)){ 
  cost[i] = costfunc(obs = german.train$response, pred.p = predict_logit_train, pcut = p.seq[i])  
} # end of the loop

optimal.pcut = p.seq[which(cost==min(cost))][1]

optimal.pcut

## [1] 0.14

plot(p.seq, cost)

Train and Test Predictions

pred.glm.gtrain.glm0 <- predict(german.glm1, type = "response")
pred.glm.gtest.glm0 <- predict(german.glm1, newdata=german.test,type = "response")

pred.train <- as.numeric(pred.glm.gtrain.glm0 > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.glm0 > optimal.pcut)


confusion_matrix_train <- table(german.train$response, pred.train)
confusion_matrix_train

##    pred.train
##       0   1
##   0 251 229
##   1  15 205

confusion_matrix_test <- table(german.test$response, pred.test)
confusion_matrix_test

##    pred.test
##       0   1
##   0 105 115
##   1  11  69

misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_train

## [1] 0.35

misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
misclassification_rate_test

## [1] 0.42

cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)

## train misclassfication rate: 0.35 | test misclassfication rate: 0.42

ROC Curve and AUC

#Train
#ROC
pred <- prediction(pred.glm.gtrain.glm0, german.train$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#AUC
unlist(slot(performance(pred, "auc"), "y.values"))

## [1] 0.8515436

#Test
#ROC
pred <- prediction(pred.glm.gtest.glm0, german.test$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#AUC
unlist(slot(performance(pred, "auc"), "y.values"))

## [1] 0.7554545

3 Fold Cross Validation

# Performing Cross Validation Using Asymmetric Cost Function

costfunc = function(obs, pred.p){
  weight1 = 5   # define the weight for "true=1 but pred=0" (FN)
  weight0 = 1    # define the weight for "true=0 but pred=1" (FP)
  c1 = (obs==1)&(pred.p<optimal.pcut)    # count for "true=1 but pred=0"   (FN)
  c0 = (obs==0)&(pred.p>=optimal.pcut)   # count for "true=0 but pred=1"   (FP)
  cost = mean(weight1*c1 + weight0*c0)  # misclassification with weight
  return(cost) # you have to return to a value when you write R functions
} # end of the function

#pcut <- 1/6

german_credit.glm1 <- glm(response ~ chk_acct + duration + credit_his + purpose +
                                 amount + saving_acct + installment_rate + sex + other_debtor + other_install +
                                 housing + telephone + foreign, family = binomial, german_credit)
cv.result <- cv.glm(data = german_credit,glmfit = german_credit.glm1,cost = costfunc ,K=3)

# the assymetrical classification rate 
cv.result$delta[2]

## [1] 0.53198

# Cross Validation with AUC as cost function

auccost = function(obs,pred.p)
{ #cost needs observed first then

  pred <- prediction(pred.p,obs) #prediction needs predicted first then
  cost = unlist(slot(performance(pred,"auc"),"y.values")) # AUC
  return(cost)
}

cv.result <- cv.glm(data = german_credit,glmfit = german_credit.glm1,cost = auccost ,K=3)

# the assymetrical classification rate 
cv.result$delta[2]

## [1] 0.7932289

Decision Tree

Getting a lengthy tree by using all the variables and cp very low

german_largetree <- rpart(formula = response~., data = german.train, 
                              parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))

prp(german_largetree, extra = 1, nn.font=40,box.palette = "red")

Pruning the model

plotcp(german_largetree)

printcp(german_largetree)

## 
## Classification tree:
## rpart(formula = response ~ ., data = german.train, parms = list(loss = matrix(c(0, 
##     5, 1, 0), nrow = 2)))
## 
## Variables actually used in tree construction:
## [1] age           chk_acct      credit_his    duration      other_install
## [6] property      purpose       saving_acct   sex          
## 
## Root node error: 480/700 = 0.68571
## 
## n= 700 
## 
##         CP nsplit rel error xerror    xstd
## 1 0.206250      0   1.00000 5.0000 0.12794
## 2 0.066667      1   0.79375 2.3417 0.12602
## 3 0.023958      2   0.72708 3.1313 0.13463
## 4 0.018750      6   0.63125 2.9229 0.13293
## 5 0.016667      7   0.61250 2.8083 0.13163
## 6 0.014583      8   0.59583 2.7354 0.13094
## 7 0.010000     11   0.55208 2.5562 0.12870

Pruning the tree using optimal complexity parameter and then plotting the optimal tree

german.prunedtree <- rpart(response~., data = german.train, method = "class",
                     parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)),cp=0.018750)
prp(german.prunedtree, extra = 1, nn.font=500,box.palette = "red")

Predictions

pred0<- predict(german.prunedtree, type="class")
table(german.train$response, pred0, dnn = c("True", "Pred"))

##     Pred
## True   0   1
##    0 242 238
##    1  13 207

pred1<- predict(german.prunedtree, newdata=german.test, type="class")
table(german.test$response, pred1, dnn = c("True", "Pred"))

##     Pred
## True   0   1
##    0 101 119
##    1  12  68

Train and Test Asymmetric Misclassfication Rate or Asymmetric Misclassification Cost

pred.glm.gtrain.tree <- predict(german.prunedtree, type = "prob")[,2]
pred.glm.gtest.tree <- predict(german.prunedtree, newdata=german.test,type = "prob")[,2]

pred.train <- as.numeric(pred.glm.gtrain.tree > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.tree > optimal.pcut)


confusion_matrix_train <- table(german.train$response, pred.train)
confusion_matrix_train

##    pred.train
##       0   1
##   0 242 238
##   1  13 207

confusion_matrix_test <- table(german.test$response, pred.test)
confusion_matrix_test

##    pred.test
##       0   1
##   0 101 119
##   1  12  68

misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_train

## [1] 0.36

misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
misclassification_rate_test

## [1] 0.44

cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)

## train misclassfication rate: 0.36 | test misclassfication rate: 0.44

Test ROC and AUC

#ROC
pred_tree = prediction(pred.glm.gtest.tree, german.test$response)
perf_tree = performance(pred_tree, "tpr", "fpr")
plot(perf_tree, colorize=TRUE)

#AUC
unlist(slot(performance(pred_tree, "auc"), "y.values"))

## [1] 0.6640909

Model Comparison

MAJOR FINDING: Misclassication Rate and AUC of Logistic Regression is better than Decision Tree. Therefore we can conclude prediction power of Logistic Regression > Decision Tree