Data Preparation

fileUrl1 <- "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
german.data <- read.table(fileUrl1)

dim(german.data)
## [1] 1000   21
str(german.data)
## 'data.frame':    1000 obs. of  21 variables:
##  $ V1 : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
##  $ V2 : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ V3 : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ V4 : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
##  $ V5 : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ V6 : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
##  $ V7 : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
##  $ V8 : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ V9 : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
##  $ V10: Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
##  $ V11: int  4 2 3 4 4 4 4 2 4 2 ...
##  $ V12: Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
##  $ V13: int  67 22 49 45 53 35 53 35 61 28 ...
##  $ V14: Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ V15: Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
##  $ V16: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ V17: Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
##  $ V18: int  1 1 2 2 2 2 1 1 1 1 ...
##  $ V19: Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
##  $ V20: Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V21: int  1 2 1 1 2 1 1 1 1 2 ...
names <- c("chk_acct", "duration", "credit_his", "purpose","amount",
           "saving_acct","present_emp","installment_rate","marital_sex", "other_debtors","present_resid",
           "property","age","other_install","housing","n_credits","job","n_people",
           "telephone","foreign","response")
colnames(german.data)<-names
german.data$response <- german.data$response-1
german.data$response <- as.factor(german.data$response)

Split the Data into Training and Testing Set

set.seed(3976527)
trainrows <- sample(1:nrow(german.data), nrow(german.data) * 0.8)
germandata.train <- german.data[trainrows, ]
germandata.test <- german.data[-trainrows,]

Exploratory Data Analysis

Density plot of Continuous Numerical Variables

Ordinal Variables

Categorical Variables

Modeling with All Predictor Variables

Comparison

Deviance

rbind(c("logit", "probit", "cloglog"), c(germandata.train.glm0$deviance, germandata.train.glm.probit$deviance, germandata.train.glm.cloglog$deviance))
##      [,1]               [,2]               [,3]              
## [1,] "logit"            "probit"           "cloglog"         
## [2,] "694.031349331065" "693.721758804378" "693.044063965851"

Misclassification Rate

pred.glm.gtrain.glm0 <- predict(germandata.train.glm0, type = "response")
pred.glm.gtrain.glm.probit <- predict(germandata.train.glm.probit, type = "response")
pred.glm.gtrain.glm.cloglog <- predict(germandata.train.glm.cloglog, type = "response")
#+++++++++++++++++++++ Misclassification Rate, using pcut=1/6
        #+++++logit link
pcut <- 1/6
class.pred.train.glm0 <- (pred.glm.gtrain.glm0 > pcut)*1
MR.glm0 <- mean(germandata.train$response!=class.pred.train.glm0)
MR.glm0
## [1] 0.335
table(germandata.train$response, class.pred.train.glm0, dnn = c("True", "Predicted"))
##     Predicted
## True   0   1
##    0 325 238
##    1  30 207
        #+++++probit link
class.pred.train.probit <- (pred.glm.gtrain.glm.probit > pcut)*1
MR.glm.probit <- mean(germandata.train$response!=class.pred.train.probit)
MR.glm.probit
## [1] 0.34875
table(germandata.train$response, class.pred.train.probit, dnn = c("True", "Predicted"))
##     Predicted
## True   0   1
##    0 312 251
##    1  28 209
        #+++++cloglog link
class.pred.train.cloglog <- (pred.glm.gtrain.glm.cloglog > pcut)*1
MR.glm.cloglog <- mean(germandata.train$response!=class.pred.train.cloglog)
MR.glm.cloglog
## [1] 0.33625
table(germandata.train$response, class.pred.train.probit, dnn = c("True", "Predicted"))
##     Predicted
## True   0   1
##    0 312 251
##    1  28 209

Area under the ROC Curve (AUC)

     # +++++ area under the curve
library(verification)
roc.logit <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm0)

roc.logit$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.8429151 2.355037e-53          NA
roc.probit <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm.probit)

roc.probit$roc.vol
##      Model      Area    p.value binorm.area
## 1 Model  1 0.8422031 3.8426e-53          NA
roc.cloglog <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm.cloglog)

roc.cloglog$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.8391453 3.110441e-52          NA

Variable Selection

Variable selection: AIC, BIC, LASSO

AIC

germandata.glm.AIC <- step(germandata.train.glm0)
## Start:  AIC=792.03
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - job               3   695.22 787.22
## - n_people          1   694.04 790.04
## - housing           2   696.44 790.44
## - other_debtors     2   696.94 790.94
## - present_resid     1   695.59 791.59
## - property          3   700.01 792.01
## <none>                  694.03 792.03
## - telephone         1   696.23 792.23
## - n_credits         1   696.42 792.42
## - marital_sex       3   700.54 792.54
## - age               1   697.85 793.85
## - other_install     2   700.62 794.62
## - foreign           1   699.36 795.36
## - saving_acct       4   706.21 796.21
## - duration          1   700.27 796.27
## - present_emp       4   707.99 797.99
## - amount            1   703.85 799.85
## - credit_his        4   711.43 801.43
## - installment_rate  1   706.04 802.04
## - purpose           9   729.28 809.28
## - chk_acct          3   743.44 835.44
## 
## Step:  AIC=787.22
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - n_people          1   695.25 785.25
## - housing           2   697.42 785.42
## - other_debtors     2   698.00 786.00
## - property          3   700.78 786.78
## - present_resid     1   697.20 787.20
## <none>                  695.22 787.22
## - n_credits         1   697.44 787.44
## - marital_sex       3   701.65 787.65
## - telephone         1   698.77 788.77
## - age               1   698.88 788.88
## - other_install     2   702.08 790.08
## - foreign           1   700.51 790.51
## - saving_acct       4   707.66 791.66
## - duration          1   701.81 791.81
## - present_emp       4   708.88 792.88
## - amount            1   704.69 794.69
## - credit_his        4   712.43 796.43
## - installment_rate  1   706.94 796.94
## - purpose           9   730.75 804.75
## - chk_acct          3   744.80 830.80
## 
## Step:  AIC=785.25
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - housing           2   697.43 783.43
## - other_debtors     2   698.00 784.00
## - property          3   700.80 784.80
## - present_resid     1   697.23 785.23
## <none>                  695.25 785.25
## - n_credits         1   697.54 785.54
## - marital_sex       3   701.73 785.73
## - telephone         1   698.84 786.84
## - age               1   698.89 786.89
## - other_install     2   702.13 788.13
## - foreign           1   700.53 788.53
## - saving_acct       4   707.66 789.66
## - duration          1   701.82 789.82
## - present_emp       4   708.90 790.90
## - amount            1   704.70 792.70
## - credit_his        4   712.52 794.52
## - installment_rate  1   706.95 794.95
## - purpose           9   730.76 802.76
## - chk_acct          3   745.00 829.00
## 
## Step:  AIC=783.43
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - other_debtors     2   699.97 781.97
## <none>                  697.43 783.43
## - property          3   703.65 783.65
## - n_credits         1   699.67 783.67
## - marital_sex       3   704.65 784.65
## - present_resid     1   700.74 784.74
## - telephone         1   700.92 784.92
## - other_install     2   704.11 786.11
## - age               1   702.16 786.16
## - foreign           1   702.42 786.42
## - saving_acct       4   709.53 787.53
## - duration          1   703.61 787.61
## - present_emp       4   711.37 789.37
## - amount            1   707.34 791.34
## - credit_his        4   715.17 793.17
## - installment_rate  1   709.24 793.24
## - purpose           9   733.61 801.61
## - chk_acct          3   749.26 829.26
## 
## Step:  AIC=781.97
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     present_resid + property + age + other_install + n_credits + 
##     telephone + foreign
## 
##                    Df Deviance    AIC
## <none>                  699.97 781.97
## - n_credits         1   702.37 782.37
## - property          3   706.41 782.41
## - marital_sex       3   707.25 783.25
## - telephone         1   703.29 783.29
## - present_resid     1   703.35 783.35
## - other_install     2   706.21 784.21
## - age               1   705.08 785.08
## - saving_acct       4   711.37 785.37
## - foreign           1   705.61 785.61
## - duration          1   705.89 785.89
## - present_emp       4   714.06 788.06
## - amount            1   710.64 790.64
## - credit_his        4   717.21 791.21
## - installment_rate  1   712.15 792.15
## - purpose           9   737.36 801.36
## - chk_acct          3   750.98 826.98
AIC(germandata.glm.AIC)
## [1] 781.9713
BIC(germandata.glm.AIC)
## [1] 974.0404
summary(germandata.glm.AIC)
## 
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose + 
##     amount + saving_acct + present_emp + installment_rate + marital_sex + 
##     present_resid + property + age + other_install + n_credits + 
##     telephone + foreign, family = binomial, data = germandata.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1489  -0.6862  -0.3577   0.6559   2.5428  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        8.359e-01  1.026e+00   0.815 0.415334    
## chk_acctA12       -3.623e-01  2.473e-01  -1.465 0.142939    
## chk_acctA13       -1.061e+00  4.174e-01  -2.542 0.011026 *  
## chk_acctA14       -1.699e+00  2.639e-01  -6.436 1.23e-10 ***
## duration           2.498e-02  1.027e-02   2.433 0.014984 *  
## credit_hisA31     -8.397e-02  6.096e-01  -0.138 0.890433    
## credit_hisA32     -5.320e-01  4.607e-01  -1.155 0.248163    
## credit_hisA33     -8.832e-01  5.073e-01  -1.741 0.081702 .  
## credit_hisA34     -1.497e+00  4.792e-01  -3.124 0.001786 ** 
## purposeA41        -2.033e+00  4.478e-01  -4.540 5.62e-06 ***
## purposeA410       -2.194e+00  8.841e-01  -2.481 0.013095 *  
## purposeA42        -9.264e-01  2.946e-01  -3.145 0.001663 ** 
## purposeA43        -9.950e-01  2.812e-01  -3.538 0.000403 ***
## purposeA44        -9.784e-01  8.607e-01  -1.137 0.255672    
## purposeA45        -9.408e-02  5.960e-01  -0.158 0.874559    
## purposeA46        -1.514e-02  4.719e-01  -0.032 0.974405    
## purposeA48        -1.768e+00  1.206e+00  -1.467 0.142505    
## purposeA49        -7.603e-01  3.769e-01  -2.017 0.043656 *  
## amount             1.571e-04  4.903e-05   3.204 0.001356 ** 
## saving_acctA62    -3.185e-01  3.260e-01  -0.977 0.328537    
## saving_acctA63    -2.794e-01  4.635e-01  -0.603 0.546622    
## saving_acctA64    -9.372e-01  5.192e-01  -1.805 0.071030 .  
## saving_acctA65    -8.643e-01  3.005e-01  -2.876 0.004022 ** 
## present_empA72     1.009e-01  4.286e-01   0.235 0.813843    
## present_empA73    -1.565e-01  4.065e-01  -0.385 0.700208    
## present_empA74    -1.008e+00  4.564e-01  -2.208 0.027273 *  
## present_empA75    -6.643e-01  4.246e-01  -1.564 0.117703    
## installment_rate   3.431e-01  9.995e-02   3.432 0.000598 ***
## marital_sexA92    -7.657e-01  4.324e-01  -1.771 0.076554 .  
## marital_sexA93    -1.062e+00  4.207e-01  -2.525 0.011579 *  
## marital_sexA94    -6.811e-01  5.196e-01  -1.311 0.189931    
## present_resid      1.752e-01  9.579e-02   1.829 0.067367 .  
## propertyA122       5.602e-01  2.892e-01   1.937 0.052727 .  
## propertyA123       3.509e-01  2.655e-01   1.322 0.186254    
## propertyA124       8.130e-01  3.475e-01   2.339 0.019318 *  
## age               -2.275e-02  1.022e-02  -2.227 0.025955 *  
## other_installA142 -4.565e-01  4.613e-01  -0.990 0.322327    
## other_installA143 -6.718e-01  2.678e-01  -2.509 0.012124 *  
## n_credits          3.238e-01  2.095e-01   1.546 0.122171    
## telephoneA192     -3.981e-01  2.203e-01  -1.807 0.070696 .  
## foreignA202       -1.409e+00  6.512e-01  -2.163 0.030518 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 972.25  on 799  degrees of freedom
## Residual deviance: 699.97  on 759  degrees of freedom
## AIC: 781.97
## 
## Number of Fisher Scoring iterations: 5
mean.residual.d.AIC <- germandata.glm.AIC$deviance/germandata.glm.AIC$df.residual
mean.residual.d.AIC
## [1] 0.9222284

BIC

germandata.glm.BIC <- step(germandata.train.glm0, k = log(nrow(germandata.train)))
## Start:  AIC=1021.58
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - purpose           9   729.28  996.66
## - job               3   695.22 1002.72
## - saving_acct       4   706.21 1007.02
## - property          3   700.01 1007.50
## - marital_sex       3   700.54 1008.03
## - present_emp       4   707.99 1008.80
## - housing           2   696.44 1010.62
## - other_debtors     2   696.94 1011.11
## - credit_his        4   711.43 1012.23
## - other_install     2   700.62 1014.80
## - n_people          1   694.04 1014.90
## - present_resid     1   695.59 1016.45
## - telephone         1   696.23 1017.09
## - n_credits         1   696.42 1017.28
## - age               1   697.85 1018.71
## - foreign           1   699.36 1020.22
## - duration          1   700.27 1021.13
## <none>                  694.03 1021.58
## - amount            1   703.85 1024.71
## - installment_rate  1   706.04 1026.90
## - chk_acct          3   743.44 1050.93
## 
## Step:  AIC=996.66
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     present_emp + installment_rate + marital_sex + other_debtors + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - job               3   730.75  978.08
## - marital_sex       3   734.57  981.90
## - saving_acct       4   741.96  982.61
## - property          3   735.60  982.93
## - present_emp       4   743.19  983.83
## - housing           2   732.39  986.41
## - credit_his        4   746.41  987.05
## - other_debtors     2   733.50  987.51
## - other_install     2   734.75  988.77
## - n_people          1   729.29  989.99
## - present_resid     1   730.05  990.75
## - age               1   730.63  991.33
## - n_credits         1   732.02  992.72
## - telephone         1   732.13  992.83
## - foreign           1   732.97  993.67
## - duration          1   734.56  995.26
## - amount            1   735.70  996.40
## <none>                  729.28  996.66
## - installment_rate  1   742.22 1002.92
## - chk_acct          3   783.25 1030.58
## 
## Step:  AIC=978.08
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     present_emp + installment_rate + marital_sex + other_debtors + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - marital_sex       3   735.93  963.21
## - property          3   736.60  963.88
## - saving_acct       4   743.39  963.98
## - present_emp       4   744.66  965.25
## - housing           2   733.61  967.57
## - credit_his        4   748.11  968.71
## - other_debtors     2   734.81  968.78
## - other_install     2   736.31  970.27
## - n_people          1   730.76  971.41
## - present_resid     1   731.84  972.49
## - age               1   732.01  972.66
## - n_credits         1   733.45  974.09
## - foreign           1   734.46  975.10
## - telephone         1   735.67  976.31
## - amount            1   736.43  977.07
## - duration          1   736.58  977.22
## <none>                  730.75  978.08
## - installment_rate  1   742.98  983.62
## - chk_acct          3   785.12 1012.40
## 
## Step:  AIC=963.21
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     present_emp + installment_rate + other_debtors + present_resid + 
##     property + age + other_install + housing + n_credits + n_people + 
##     telephone + foreign
## 
##                    Df Deviance    AIC
## - property          3   741.59 948.81
## - saving_acct       4   748.57 949.11
## - present_emp       4   752.44 952.98
## - housing           2   739.61 953.52
## - other_debtors     2   740.10 954.01
## - credit_his        4   753.77 954.31
## - other_install     2   741.23 955.13
## - n_people          1   735.99 956.59
## - present_resid     1   736.74 957.34
## - age               1   736.87 957.47
## - n_credits         1   738.58 959.17
## - foreign           1   739.79 960.38
## - amount            1   741.21 961.81
## - telephone         1   741.29 961.88
## - duration          1   742.17 962.76
## <none>                  735.93 963.21
## - installment_rate  1   746.12 966.71
## - chk_acct          3   791.75 998.98
## 
## Step:  AIC=948.81
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     present_emp + installment_rate + other_debtors + present_resid + 
##     age + other_install + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - saving_acct       4   753.53 934.01
## - present_emp       4   757.92 938.40
## - housing           2   745.32 939.17
## - other_debtors     2   745.78 939.63
## - credit_his        4   759.19 939.67
## - other_install     2   747.87 941.72
## - n_people          1   741.66 942.20
## - present_resid     1   742.32 942.86
## - age               1   742.55 943.08
## - n_credits         1   743.79 944.33
## - foreign           1   745.38 945.91
## - telephone         1   745.83 946.37
## <none>                  741.59 948.81
## - amount            1   748.37 948.91
## - duration          1   748.55 949.09
## - installment_rate  1   752.63 953.17
## - chk_acct          3   798.74 985.91
## 
## Step:  AIC=934.01
## response ~ chk_acct + duration + credit_his + amount + present_emp + 
##     installment_rate + other_debtors + present_resid + age + 
##     other_install + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - other_debtors     2   756.39 923.50
## - housing           2   757.04 924.16
## - credit_his        4   771.65 925.40
## - present_emp       4   772.38 926.12
## - other_install     2   759.68 926.79
## - n_people          1   753.72 927.52
## - present_resid     1   753.97 927.77
## - age               1   754.65 928.45
## - n_credits         1   756.17 929.97
## - foreign           1   758.05 931.85
## - telephone         1   758.45 932.25
## - amount            1   759.87 933.66
## <none>                  753.53 934.01
## - duration          1   760.43 934.23
## - installment_rate  1   764.30 938.10
## - chk_acct          3   823.83 984.26
## 
## Step:  AIC=923.5
## response ~ chk_acct + duration + credit_his + amount + present_emp + 
##     installment_rate + present_resid + age + other_install + 
##     housing + n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - housing           2   759.90 913.65
## - credit_his        4   774.14 914.52
## - other_install     2   762.12 915.87
## - present_emp       4   775.88 916.26
## - n_people          1   756.65 917.08
## - present_resid     1   756.92 917.35
## - age               1   757.78 918.21
## - n_credits         1   759.24 919.67
## - telephone         1   761.06 921.49
## - foreign           1   761.69 922.13
## - duration          1   763.04 923.47
## <none>                  756.39 923.50
## - amount            1   763.53 923.97
## - installment_rate  1   767.37 927.80
## - chk_acct          3   825.48 972.54
## 
## Step:  AIC=913.65
## response ~ chk_acct + duration + credit_his + amount + present_emp + 
##     installment_rate + present_resid + age + other_install + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - credit_his        4   778.62 905.63
## - other_install     2   765.66 906.04
## - n_people          1   760.17 907.23
## - present_emp       4   780.76 907.77
## - age               1   761.77 908.83
## - present_resid     1   762.02 909.08
## - n_credits         1   762.50 909.56
## - telephone         1   764.21 911.27
## - foreign           1   765.15 912.21
## <none>                  759.90 913.65
## - duration          1   766.87 913.93
## - amount            1   767.22 914.28
## - installment_rate  1   771.04 918.10
## - chk_acct          3   833.14 966.83
## 
## Step:  AIC=905.63
## response ~ chk_acct + duration + amount + present_emp + installment_rate + 
##     present_resid + age + other_install + n_credits + n_people + 
##     telephone + foreign
## 
##                    Df Deviance    AIC
## - n_credits         1   778.64 898.96
## - n_people          1   778.78 899.11
## - present_resid     1   780.70 901.03
## - other_install     2   787.95 901.59
## - age               1   781.40 901.72
## - present_emp       4   801.58 901.85
## - telephone         1   782.80 903.12
## - foreign           1   785.14 905.46
## <none>                  778.62 905.63
## - amount            1   785.58 905.90
## - duration          1   787.42 907.75
## - installment_rate  1   789.61 909.93
## - chk_acct          3   866.00 972.96
## 
## Step:  AIC=898.96
## response ~ chk_acct + duration + amount + present_emp + installment_rate + 
##     present_resid + age + other_install + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - n_people          1   778.80 892.43
## - present_resid     1   780.77 894.41
## - other_install     2   787.96 894.92
## - age               1   781.40 895.04
## - present_emp       4   801.62 895.20
## - telephone         1   782.80 896.44
## - foreign           1   785.15 898.78
## <none>                  778.64 898.96
## - amount            1   785.64 899.28
## - duration          1   787.43 901.07
## - installment_rate  1   789.64 903.27
## - chk_acct          3   866.47 966.74
## 
## Step:  AIC=892.43
## response ~ chk_acct + duration + amount + present_emp + installment_rate + 
##     present_resid + age + other_install + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_resid     1   780.90 887.85
## - other_install     2   788.00 888.27
## - age               1   781.73 888.69
## - present_emp       4   802.13 889.03
## - telephone         1   782.92 889.87
## - foreign           1   785.33 892.28
## <none>                  778.80 892.43
## - amount            1   785.83 892.79
## - duration          1   787.58 894.53
## - installment_rate  1   789.91 896.87
## - chk_acct          3   866.48 960.06
## 
## Step:  AIC=887.85
## response ~ chk_acct + duration + amount + present_emp + installment_rate + 
##     age + other_install + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_emp       4   802.43 882.65
## - age               1   783.20 883.47
## - other_install     2   790.04 883.63
## - telephone         1   784.60 884.87
## - foreign           1   787.51 887.78
## - amount            1   787.52 887.79
## <none>                  780.90 887.85
## - duration          1   790.25 890.52
## - installment_rate  1   791.92 892.19
## - chk_acct          3   870.54 957.44
## 
## Step:  AIC=882.65
## response ~ chk_acct + duration + amount + installment_rate + 
##     age + other_install + telephone + foreign
## 
##                    Df Deviance    AIC
## - other_install     2   810.90 877.74
## - telephone         1   806.58 880.11
## - foreign           1   808.55 882.08
## - age               1   808.68 882.22
## - amount            1   808.91 882.44
## <none>                  802.43 882.65
## - duration          1   809.71 883.24
## - installment_rate  1   812.15 885.69
## - chk_acct          3   900.46 960.63
## 
## Step:  AIC=877.74
## response ~ chk_acct + duration + amount + installment_rate + 
##     age + telephone + foreign
## 
##                    Df Deviance    AIC
## - telephone         1   815.03 875.19
## - age               1   816.16 876.32
## - foreign           1   816.47 876.63
## - amount            1   817.38 877.54
## <none>                  810.90 877.74
## - duration          1   818.52 878.68
## - installment_rate  1   820.41 880.57
## - chk_acct          3   909.11 955.90
## 
## Step:  AIC=875.19
## response ~ chk_acct + duration + amount + installment_rate + 
##     age + foreign
## 
##                    Df Deviance    AIC
## - amount            1   819.23 872.71
## - foreign           1   819.82 873.30
## - age               1   821.70 875.17
## <none>                  815.03 875.19
## - installment_rate  1   823.47 876.95
## - duration          1   823.92 877.40
## - chk_acct          3   915.39 955.50
## 
## Step:  AIC=872.71
## response ~ chk_acct + duration + installment_rate + age + foreign
## 
##                    Df Deviance    AIC
## - foreign           1   823.52 870.31
## - installment_rate  1   824.44 871.23
## - age               1   824.78 871.57
## <none>                  819.23 872.71
## - duration          1   851.13 897.92
## - chk_acct          3   918.43 951.85
## 
## Step:  AIC=870.31
## response ~ chk_acct + duration + installment_rate + age
## 
##                    Df Deviance    AIC
## - age               1   828.92 869.03
## - installment_rate  1   829.50 869.61
## <none>                  823.52 870.31
## - duration          1   858.12 898.23
## - chk_acct          3   920.42 947.16
## 
## Step:  AIC=869.03
## response ~ chk_acct + duration + installment_rate
## 
##                    Df Deviance    AIC
## - installment_rate  1   834.25 867.67
## <none>                  828.92 869.03
## - duration          1   864.27 897.69
## - chk_acct          3   926.95 947.00
## 
## Step:  AIC=867.67
## response ~ chk_acct + duration
## 
##            Df Deviance    AIC
## <none>          834.25 867.67
## - duration  1   870.67 897.41
## - chk_acct  3   930.49 943.86
AIC(germandata.glm.BIC)
## [1] 844.2468
BIC(germandata.glm.BIC)
## [1] 867.6699
summary(germandata.glm.BIC)
## 
## Call:
## glm(formula = response ~ chk_acct + duration, family = binomial, 
##     data = germandata.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6540  -0.8661  -0.4857   0.9688   2.3389  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.952041   0.198218  -4.803 1.56e-06 ***
## chk_acctA12 -0.415088   0.202532  -2.049  0.04041 *  
## chk_acctA13 -1.142595   0.369528  -3.092  0.00199 ** 
## chk_acctA14 -1.960184   0.225469  -8.694  < 2e-16 ***
## duration     0.040686   0.006906   5.891 3.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 972.25  on 799  degrees of freedom
## Residual deviance: 834.25  on 795  degrees of freedom
## AIC: 844.25
## 
## Number of Fisher Scoring iterations: 4
mean.residual.d.BIC <- germandata.glm.BIC$deviance/germandata.glm.BIC$df.residual
mean.residual.d.BIC
## [1] 1.049367

LASSO

    # scale numerical variables
nums <- sapply(germandata.train, is.numeric)
nums <- as.vector(which(nums==TRUE))
germandata.nums.std <- scale(german.data[,nums])
    # change categorical variables to dummy variables
dummy <- model.matrix( ~ chk_acct+credit_his+purpose+saving_acct+present_emp+
                           marital_sex+other_debtors+property+other_install+housing+job+telephone+foreign-1, 
                       data= german.data)
y.std <- german.data$response
germandata.std <- cbind(dummy,germandata.nums.std,y.std)
head(germandata.std)
##   chk_acctA11 chk_acctA12 chk_acctA13 chk_acctA14 credit_hisA31
## 1           1           0           0           0             0
## 2           0           1           0           0             0
## 3           0           0           0           1             0
## 4           1           0           0           0             0
## 5           1           0           0           0             0
## 6           0           0           0           1             0
##   credit_hisA32 credit_hisA33 credit_hisA34 purposeA41 purposeA410
## 1             0             0             1          0           0
## 2             1             0             0          0           0
## 3             0             0             1          0           0
## 4             1             0             0          0           0
## 5             0             1             0          0           0
## 6             1             0             0          0           0
##   purposeA42 purposeA43 purposeA44 purposeA45 purposeA46 purposeA48
## 1          0          1          0          0          0          0
## 2          0          1          0          0          0          0
## 3          0          0          0          0          1          0
## 4          1          0          0          0          0          0
## 5          0          0          0          0          0          0
## 6          0          0          0          0          1          0
##   purposeA49 saving_acctA62 saving_acctA63 saving_acctA64 saving_acctA65
## 1          0              0              0              0              1
## 2          0              0              0              0              0
## 3          0              0              0              0              0
## 4          0              0              0              0              0
## 5          0              0              0              0              0
## 6          0              0              0              0              1
##   present_empA72 present_empA73 present_empA74 present_empA75
## 1              0              0              0              1
## 2              0              1              0              0
## 3              0              0              1              0
## 4              0              0              1              0
## 5              0              1              0              0
## 6              0              1              0              0
##   marital_sexA92 marital_sexA93 marital_sexA94 other_debtorsA102
## 1              0              1              0                 0
## 2              1              0              0                 0
## 3              0              1              0                 0
## 4              0              1              0                 0
## 5              0              1              0                 0
## 6              0              1              0                 0
##   other_debtorsA103 propertyA122 propertyA123 propertyA124
## 1                 0            0            0            0
## 2                 0            0            0            0
## 3                 0            0            0            0
## 4                 1            1            0            0
## 5                 0            0            0            1
## 6                 0            0            0            1
##   other_installA142 other_installA143 housingA152 housingA153 jobA172
## 1                 0                 1           1           0       0
## 2                 0                 1           1           0       0
## 3                 0                 1           1           0       1
## 4                 0                 1           0           1       0
## 5                 0                 1           0           1       0
## 6                 0                 1           0           1       1
##   jobA173 jobA174 telephoneA192 foreignA202   duration     amount
## 1       1       0             1           0 -1.2358595 -0.7447588
## 2       1       0             0           0  2.2470700  0.9493418
## 3       0       0             0           0 -0.7382981 -0.4163541
## 4       1       0             0           0  1.7495086  1.6334296
## 5       1       0             0           0  0.2568246  0.5663801
## 6       0       0             1           0  1.2519473  2.0489838
##   installment_rate present_resid         age  n_credits   n_people y.std
## 1       0.91801781     1.0464631  2.76507291  1.0265652 -0.4280754     1
## 2      -0.86974813    -0.7655942 -1.19080809 -0.7045734 -0.4280754     2
## 3      -0.86974813     0.1404344  1.18272051 -0.7045734  2.3337012     1
## 4      -0.86974813     1.0464631  0.83108664 -0.7045734  2.3337012     1
## 5       0.02413484     1.0464631  1.53435438  1.0265652  2.3337012     2
## 6      -0.86974813     1.0464631 -0.04799802 -0.7045734  2.3337012     1
dim(germandata.std)
## [1] 1000   50
X.train <- as.matrix(germandata.std[trainrows,-50])
Y.train <- as.factor(germandata.std[trainrows, 50])
library(glmnet)
lasso.fit<- glmnet(x=X.train, y=Y.train, family = "binomial", alpha = 1)
summary(lasso.fit)
##            Length Class     Mode     
## a0           76   -none-    numeric  
## beta       3724   dgCMatrix S4       
## df           76   -none-    numeric  
## dim           2   -none-    numeric  
## lambda       76   -none-    numeric  
## dev.ratio    76   -none-    numeric  
## nulldev       1   -none-    numeric  
## npasses       1   -none-    numeric  
## jerr          1   -none-    numeric  
## offset        1   -none-    logical  
## classnames    2   -none-    character
## call          5   -none-    call     
## nobs          1   -none-    numeric
plot(lasso.fit, xvar = "lambda", label=TRUE)

cv.lasso<- cv.glmnet(x = X.train, y = Y.train, family = "binomial", alpha = 1, nfolds = 10)
plot(cv.lasso)

cv.lasso$lambda.min
## [1] 0.005578583
coef(lasso.fit, s=cv.lasso$lambda.min)
## 50 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)        1.00777559
## chk_acctA11        0.37166592
## chk_acctA12        .         
## chk_acctA13       -0.50844111
## chk_acctA14       -1.22534962
## credit_hisA31      0.32612154
## credit_hisA32      .         
## credit_hisA33     -0.21399751
## credit_hisA34     -0.75321515
## purposeA41        -1.44364971
## purposeA410       -1.22121852
## purposeA42        -0.44145508
## purposeA43        -0.61670344
## purposeA44        -0.33754980
## purposeA45         0.11918168
## purposeA46         0.05552924
## purposeA48        -0.81661282
## purposeA49        -0.24642018
## saving_acctA62    -0.08048020
## saving_acctA63    -0.15969592
## saving_acctA64    -0.73023981
## saving_acctA65    -0.65496338
## present_empA72     0.19479054
## present_empA73     .         
## present_empA74    -0.65913015
## present_empA75    -0.33492568
## marital_sexA92     .         
## marital_sexA93    -0.27173384
## marital_sexA94     .         
## other_debtorsA102  0.09789027
## other_debtorsA103 -0.47461284
## propertyA122       0.21376361
## propertyA123       0.09163231
## propertyA124       0.35070600
## other_installA142 -0.13957622
## other_installA143 -0.49138634
## housingA152       -0.27145292
## housingA153        .         
## jobA172            .         
## jobA173            .         
## jobA174            .         
## telephoneA192     -0.26164989
## foreignA202       -0.85692089
## duration           0.30013208
## amount             0.34005439
## installment_rate   0.27168363
## present_resid      0.06207061
## age               -0.13786597
## n_credits          0.10649131
## n_people           .
#++++++++++++++++++++ mean residual diviance of LASSO
germandata.glm.lasso <- glm(response~.-n_people-job, family = "binomial", data = germandata.train)
summary(germandata.glm.lasso)
## 
## Call:
## glm(formula = response ~ . - n_people - job, family = "binomial", 
##     data = germandata.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1003  -0.6622  -0.3514   0.6386   2.5483  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.171e+00  1.049e+00   1.117 0.264098    
## chk_acctA12       -3.286e-01  2.503e-01  -1.313 0.189291    
## chk_acctA13       -1.055e+00  4.186e-01  -2.521 0.011690 *  
## chk_acctA14       -1.676e+00  2.650e-01  -6.325 2.53e-10 ***
## duration           2.662e-02  1.040e-02   2.559 0.010487 *  
## credit_hisA31     -4.295e-02  6.152e-01  -0.070 0.944342    
## credit_hisA32     -5.250e-01  4.630e-01  -1.134 0.256835    
## credit_hisA33     -9.025e-01  5.089e-01  -1.774 0.076127 .  
## credit_hisA34     -1.492e+00  4.817e-01  -3.098 0.001951 ** 
## purposeA41        -1.984e+00  4.486e-01  -4.422 9.79e-06 ***
## purposeA410       -2.241e+00  9.007e-01  -2.488 0.012829 *  
## purposeA42        -9.439e-01  2.965e-01  -3.184 0.001454 ** 
## purposeA43        -9.500e-01  2.853e-01  -3.330 0.000868 ***
## purposeA44        -9.719e-01  8.615e-01  -1.128 0.259256    
## purposeA45        -6.826e-02  6.075e-01  -0.112 0.910533    
## purposeA46        -6.997e-02  4.778e-01  -0.146 0.883568    
## purposeA48        -1.814e+00  1.220e+00  -1.487 0.137022    
## purposeA49        -7.789e-01  3.801e-01  -2.049 0.040477 *  
## amount             1.503e-04  4.957e-05   3.031 0.002437 ** 
## saving_acctA62    -3.851e-01  3.309e-01  -1.164 0.244511    
## saving_acctA63    -3.245e-01  4.637e-01  -0.700 0.484043    
## saving_acctA64    -1.011e+00  5.273e-01  -1.917 0.055178 .  
## saving_acctA65    -8.986e-01  3.021e-01  -2.975 0.002930 ** 
## present_empA72     6.830e-02  4.303e-01   0.159 0.873891    
## present_empA73    -1.353e-01  4.075e-01  -0.332 0.739906    
## present_empA74    -1.010e+00  4.577e-01  -2.206 0.027356 *  
## present_empA75    -6.549e-01  4.254e-01  -1.540 0.123636    
## installment_rate   3.399e-01  1.010e-01   3.366 0.000762 ***
## marital_sexA92    -8.055e-01  4.347e-01  -1.853 0.063847 .  
## marital_sexA93    -1.036e+00  4.229e-01  -2.451 0.014261 *  
## marital_sexA94    -6.953e-01  5.209e-01  -1.335 0.181984    
## other_debtorsA102  2.820e-01  4.671e-01   0.604 0.546028    
## other_debtorsA103 -7.361e-01  5.073e-01  -1.451 0.146771    
## present_resid      1.402e-01  9.992e-02   1.403 0.160559    
## propertyA122       5.791e-01  2.912e-01   1.989 0.046748 *  
## propertyA123       3.244e-01  2.670e-01   1.215 0.224338    
## propertyA124       8.674e-01  4.650e-01   1.865 0.062116 .  
## age               -1.963e-02  1.040e-02  -1.887 0.059175 .  
## other_installA142 -4.456e-01  4.642e-01  -0.960 0.337149    
## other_installA143 -7.075e-01  2.695e-01  -2.626 0.008646 ** 
## housingA152       -3.700e-01  2.596e-01  -1.425 0.154119    
## housingA153       -4.570e-01  5.262e-01  -0.869 0.385118    
## n_credits          3.182e-01  2.104e-01   1.512 0.130547    
## telephoneA192     -4.165e-01  2.216e-01  -1.880 0.060174 .  
## foreignA202       -1.376e+00  6.522e-01  -2.110 0.034828 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 972.25  on 799  degrees of freedom
## Residual deviance: 695.25  on 755  degrees of freedom
## AIC: 785.25
## 
## Number of Fisher Scoring iterations: 5
mean.residual.d.LASSO <- germandata.glm.lasso$deviance/germandata.glm.lasso$df.residual
mean.residual.d.LASSO     # not rigorous
## [1] 0.9208569
pred.lasso.train<- predict(germandata.glm.lasso, newx = X.train, s=cv.lasso$lambda.min)

In-Sample Prediction of the Three Models

Misclassification Rate

pred.glm.train.AIC <- predict(germandata.glm.AIC, type = "response")
pred.glm.train.BIC <- predict(germandata.glm.BIC, type = "response")
pred.glm.train.LASSO<- predict(lasso.fit, newx = X.train, s=cv.lasso$lambda.min)

pcut <- 1/6
class.pred.glm.train.AIC <- (pred.glm.train.AIC>pcut)*1
class.pred.glm.train.BIC <- (pred.glm.train.BIC>pcut)*1
class.pred.glm.train.LASSO <- (pred.glm.train.LASSO>pcut)*1



MR.AIC <- mean(germandata.train$response!=class.pred.glm.train.AIC)
MR.AIC
## [1] 0.3325
MR.BIC <- mean(germandata.train$response!=class.pred.glm.train.BIC)
MR.BIC
## [1] 0.4175
MR.LASSO <- mean(germandata.train$response!=class.pred.glm.train.LASSO)
MR.LASSO
## [1] 0.2125

Area Under the Curve

roc_curve.aic <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.AIC, main = "AIC")

roc_curve.aic$roc.vol
##      Model      Area     p.value binorm.area
## 1 Model  1 0.8404868 1.24557e-52          NA
roc_curve.bic <-roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.BIC, main = "BIC")

roc_curve.bic$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7513209 1.215515e-29          NA
roc_curve.lasso <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.LASSO, main = "LASSO")

roc_curve.lasso$roc.vol
##      Model    Area    p.value binorm.area
## 1 Model  1 0.83614 2.3854e-51          NA

Out-of-Sample Prediction Using Selected Model

pred.glm.test.AIC <- predict(germandata.glm.AIC, newdata = germandata.test, type = "response")
roc_curve.test=roc.plot(x=(germandata.test$response==1),pred = pred.glm.test.AIC)

roc_curve.test$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7534469 4.412475e-09          NA
pcut <- 1/6
class.pred.glm.test.AIC <- (pred.glm.test.AIC>pcut)*1
table(germandata.test$response, class.pred.glm.test.AIC, dnn = c("True", "Predicted"))
##     Predicted
## True  0  1
##    0 75 62
##    1  9 54
MR.AIC.test <- mean(germandata.test$response!=class.pred.glm.test.AIC)
MR.AIC.test
## [1] 0.355
#  Out-of-Sample Cost (Asymmetric Misclassification Rate)
costfunc = function(obs, pred.p, pcut){
    weight1 = 5   # define the weight for "true=1 but pred=0" (FN)
    weight0 = 1    # define the weight for "true=0 but pred=1" (FP)
    c1 = (obs==1)&(pred.p<pcut)    # count for "true=1 but pred=0"   (FN)
    c0 = (obs==0)&(pred.p>=pcut)   # count for "true=0 but pred=1"   (FP)
    cost = mean(weight1*c1 + weight0*c0)  # misclassification with weight
    return(cost) # you have to return to a value when you write R functions
}

cost.test = costfunc(obs = germandata.test$response, pred.p = pred.glm.test.AIC, pcut = pcut)  
cost.test
## [1] 0.535

Grid Search for Optimal Cutoff Probability

costfunc = function(obs, pred.p, pcut){
    weight1 = 5   # define the weight for "true=1 but pred=0" (FN)
    weight0 = 1    # define the weight for "true=0 but pred=1" (FP)
    c1 = (obs==1)&(pred.p<pcut)    # count for "true=1 but pred=0"   (FN)
    c0 = (obs==0)&(pred.p>=pcut)   # count for "true=0 but pred=1"   (FP)
    cost = mean(weight1*c1 + weight0*c0)  # misclassification with weight
    return(cost) # you have to return to a value when you write R functions
} 

p.seq = seq(0.01, 1, 0.01) 
cost = rep(0, length(p.seq))  
for(i in 1:length(p.seq)){ 
    cost[i] = costfunc(obs = germandata.train$response, pred.p = pred.glm.train.AIC, pcut = p.seq[i])  
}

plot(p.seq, cost)

optimal.pcut.glm = p.seq[which(cost==min(cost))]
optimal.pcut.glm
## [1] 0.12
class.pred.glm.opt.cut <- (pred.glm.test.AIC>optimal.pcut.glm)*1
MR.opt.pcut<- mean(germandata.test$response!= class.pred.glm.opt.cut)
MR.opt.pcut
## [1] 0.39
FPR.opt.cut <- sum(germandata.test$response==0 & class.pred.glm.opt.cut==1)/sum(germandata.test$response==0)
FPR.opt.cut
## [1] 0.5255474
FNR.opt.cut <- sum(germandata.test$response==1 & class.pred.glm.opt.cut==0)/sum(germandata.test$response==1)
FNR.opt.cut
## [1] 0.0952381
cost.opt.cut.train <- costfunc(obs = germandata.train$response, pred.p = pred.glm.train.AIC, pcut = optimal.pcut.glm)  
cost.opt.cut.train
## [1] 0.46625
cost.opt.cut.test <- costfunc(obs = germandata.test$response, pred.p=pred.glm.test.AIC, pcut = optimal.pcut.glm)
cost.opt.cut.test
## [1] 0.51

Compare Cross Validation and Validation Set Approach

# Specify Cost Function

## First cost function - asymmetric misclassification rate
costamr <- function(r, pi) {
    weight1 = 5
    weight0 = 1
    c1 = (r == 1) & (pi < optimal.pcut.glm)  #logical vector - true if actual 1 but predict 0
    c0 = (r == 0) & (pi > optimal.pcut.glm)  #logical vecotr - true if actual 0 but predict 1
    return(mean(weight1 * c1 + weight0 * c0))
}

costauc <- function(r, pi){roc.plot(x = r == "1", pred = pi)$roc.vol$Area}

alldata.glm0 <- glm(response~., family = binomial, german.data)
alldata.glm0.AIC <- step(alldata.glm0)
## Start:  AIC=993.82
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - job               3   896.56  988.56
## - property          3   899.08  991.08
## - present_resid     1   895.82  991.82
## - n_people          1   896.94  992.94
## <none>                  895.82  993.82
## - n_credits         1   897.89  993.89
## - present_emp       4   904.03  994.03
## - housing           2   900.05  994.05
## - telephone         1   898.06  994.06
## - age               1   898.34  994.34
## - marital_sex       3   905.15  997.15
## - other_debtors     2   903.24  997.24
## - foreign           1   901.88  997.88
## - other_install     2   903.98  997.98
## - amount            1   904.28 1000.28
## - duration          1   904.87 1000.87
## - saving_acct       4   915.63 1005.63
## - installment_rate  1   910.27 1006.27
## - credit_his        4   917.62 1007.62
## - purpose           9   931.12 1011.12
## - chk_acct          3   962.05 1054.05
## 
## Step:  AIC=988.56
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + property + age + other_install + 
##     housing + n_credits + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - property          3   899.79  985.79
## - present_resid     1   896.57  986.57
## - n_people          1   897.67  987.67
## - present_emp       4   904.32  988.32
## - n_credits         1   898.47  988.47
## <none>                  896.56  988.56
## - housing           2   900.60  988.60
## - telephone         1   899.13  989.13
## - age               1   899.19  989.19
## - marital_sex       3   905.83  991.83
## - other_debtors     2   903.87  991.87
## - foreign           1   902.67  992.67
## - other_install     2   904.95  992.95
## - amount            1   905.31  995.31
## - duration          1   905.85  995.85
## - saving_acct       4   917.02 1001.02
## - installment_rate  1   911.45 1001.45
## - credit_his        4   918.12 1002.12
## - purpose           9   931.82 1005.82
## - chk_acct          3   962.35 1048.35
## 
## Step:  AIC=985.79
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + present_resid + age + other_install + housing + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - present_resid     1   899.81  983.81
## - n_people          1   900.79  984.79
## - housing           2   903.47  985.47
## - n_credits         1   901.49  985.49
## <none>                  899.79  985.79
## - telephone         1   901.81  985.81
## - present_emp       4   907.85  985.85
## - age               1   902.52  986.52
## - marital_sex       3   908.67  988.67
## - foreign           1   905.83  989.83
## - other_debtors     2   908.05  990.05
## - other_install     2   908.87  990.87
## - amount            1   909.80  993.80
## - duration          1   909.99  993.99
## - saving_acct       4   919.78  997.78
## - installment_rate  1   915.56  999.56
## - credit_his        4   921.66  999.66
## - purpose           9   936.35 1004.35
## - chk_acct          3   967.78 1047.78
## 
## Step:  AIC=983.81
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + age + other_install + housing + n_credits + 
##     n_people + telephone + foreign
## 
##                    Df Deviance     AIC
## - n_people          1   900.81  982.81
## - n_credits         1   901.53  983.53
## - telephone         1   901.81  983.81
## <none>                  899.81  983.81
## - present_emp       4   907.86  983.86
## - housing           2   903.95  983.95
## - age               1   902.53  984.53
## - marital_sex       3   908.69  986.69
## - foreign           1   905.88  987.88
## - other_debtors     2   908.08  988.08
## - other_install     2   908.87  988.87
## - amount            1   909.80  991.80
## - duration          1   910.05  992.05
## - saving_acct       4   919.78  995.78
## - installment_rate  1   915.59  997.59
## - credit_his        4   921.66  997.66
## - purpose           9   936.35 1002.35
## - chk_acct          3   968.09 1046.09
## 
## Step:  AIC=982.81
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + age + other_install + housing + n_credits + 
##     telephone + foreign
## 
##                    Df Deviance     AIC
## - n_credits         1   902.80  982.80
## <none>                  900.81  982.81
## - present_emp       4   908.82  982.82
## - telephone         1   902.90  982.90
## - housing           2   905.01  983.01
## - age               1   903.38  983.38
## - marital_sex       3   908.75  984.75
## - foreign           1   906.79  986.79
## - other_debtors     2   908.83  986.83
## - other_install     2   909.90  987.90
## - amount            1   910.50  990.50
## - duration          1   910.95  990.95
## - saving_acct       4   920.53  994.53
## - installment_rate  1   915.95  995.95
## - credit_his        4   923.28  997.28
## - purpose           9   937.61 1001.61
## - chk_acct          3   969.35 1045.35
## 
## Step:  AIC=982.8
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + marital_sex + 
##     other_debtors + age + other_install + housing + telephone + 
##     foreign
## 
##                    Df Deviance     AIC
## - present_emp       4   910.50  982.50
## - telephone         1   904.72  982.72
## <none>                  902.80  982.80
## - housing           2   907.17  983.17
## - age               1   905.21  983.21
## - marital_sex       3   910.47  984.47
## - other_debtors     2   910.88  986.88
## - foreign           1   909.14  987.14
## - other_install     2   912.57  988.57
## - duration          1   912.50  990.50
## - amount            1   912.54  990.54
## - saving_acct       4   922.72  994.72
## - credit_his        4   923.43  995.43
## - installment_rate  1   917.65  995.65
## - purpose           9   940.20 1002.20
## - chk_acct          3   971.04 1045.04
## 
## Step:  AIC=982.5
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + installment_rate + marital_sex + other_debtors + 
##     age + other_install + housing + telephone + foreign
## 
##                    Df Deviance     AIC
## <none>                  910.50  982.50
## - telephone         1   912.82  982.82
## - age               1   912.96  982.96
## - housing           2   915.01  983.01
## - foreign           1   916.63  986.63
## - marital_sex       3   920.98  986.98
## - other_debtors     2   919.41  987.41
## - other_install     2   920.76  988.76
## - duration          1   918.79  988.79
## - amount            1   920.07  990.07
## - saving_acct       4   931.53  995.53
## - installment_rate  1   925.92  995.92
## - credit_his        4   932.30  996.30
## - purpose           9   947.78 1001.78
## - chk_acct          3   979.75 1045.75
cv.result.amr = cv.glm(german.data, glmfit=alldata.glm0.AIC, cost=costamr, K=5)
cv.result.amr$delta[2]
## [1] 0.5104
cv.result.auc = cv.glm(german.data, glmfit=alldata.glm0.AIC, cost=costauc, K=5)

cv.result.auc$delta[2]
## [1] 0.7850393

Classification Tree

Grow a Large Tree

library(rpart)
germandata.largetree <- rpart(formula = response~., data = germandata.train, 
                              parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp=0.001)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.3
prp(germandata.largetree, extra = 1)

plotcp(germandata.largetree)

printcp(germandata.largetree)
## 
## Classification tree:
## rpart(formula = response ~ ., data = germandata.train, parms = list(loss = matrix(c(0, 
##     5, 1, 0), nrow = 2)), cp = 0.001)
## 
## Variables actually used in tree construction:
##  [1] age           amount        chk_acct      credit_his    duration     
##  [6] housing       job           marital_sex   other_debtors other_install
## [11] present_emp   property      purpose       saving_acct  
## 
## Root node error: 563/800 = 0.70375
## 
## n= 800 
## 
##           CP nsplit rel error xerror    xstd
## 1  0.1438721      0   1.00000 5.0000 0.11470
## 2  0.0799290      1   0.85613 2.6128 0.11870
## 3  0.0284192      2   0.77620 2.6057 0.11911
## 4  0.0275311      5   0.69094 2.6021 0.11892
## 5  0.0248668      7   0.63588 2.6181 0.11913
## 6  0.0195382      9   0.58615 2.5133 0.11798
## 7  0.0177620     10   0.56661 2.5435 0.11847
## 8  0.0142096     11   0.54885 2.5098 0.11805
## 9  0.0115453     12   0.53464 2.4263 0.11693
## 10 0.0097691     14   0.51155 2.2877 0.11476
## 11 0.0088810     16   0.49201 2.3037 0.11503
## 12 0.0071048     17   0.48313 2.2753 0.11442
## 13 0.0041445     18   0.47602 2.1989 0.11321
## 14 0.0035524     21   0.46359 2.2078 0.11334
## 15 0.0011841     27   0.43872 2.1847 0.11288
## 16 0.0010000     30   0.43517 2.1776 0.11244

Prune the Tree

tree.default0.01 <- rpart(response~., data = germandata.train, method = "class",
                                          parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp=0.01)
prp(tree.default0.01)

pruned0.008 <- rpart(response~., data = germandata.train, method = "class",
                      parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)),cp=0.008)
prp(pruned0.008, extra = 1)

Misclassification Rate

# in-sample misclassification rate
germandata.train.largetree.pred <- predict(germandata.largetree, type = "class")
germandata.train.default0.01 <- predict(tree.default0.01, type = "class")
germandata.train.pruned0.008 <- predict(pruned0.008, type = "class")
MR.train.largetree <- mean(germandata.train$response!=germandata.train.largetree.pred)
MR.train.largetree
## [1] 0.26125
MR.train.0.01 <- mean(germandata.train$response!=germandata.train.default0.01)
MR.train.0.01
## [1] 0.325
MR.train.0.008 <- mean(germandata.train$response!=germandata.train.pruned0.008)
MR.train.0.008
## [1] 0.305
# out-of-sample prediction using classification tree
germandata.test.largetree.pred <- predict(germandata.largetree, germandata.test, type = "class")
germandata.test.default0.01 <- predict(tree.default0.01, germandata.test, type = "class")
germandata.test.pruned0.008 <- predict(pruned0.008, germandata.test, type = "class")

MR.test.largetree <- mean(germandata.test$response!=germandata.test.largetree.pred)
MR.test.largetree
## [1] 0.385
MR.test.0.01 <- mean(germandata.test$response!=germandata.test.default0.01)
MR.test.0.01
## [1] 0.39
MR.test.0.008 <- mean(germandata.test$response!=germandata.test.pruned0.008)
MR.test.0.008
## [1] 0.38
germandata.test.pred.tree2= predict(pruned0.008, germandata.test, type = "prob")
germandata.train.pred.tree2 = predict(pruned0.008,germandata.train, type="prob")

roc_curve.bic <- roc.plot(x=(germandata.test$response == "1"), pred = germandata.test.pred.tree2[,2])

roc_curve.bic$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.6671301 4.240929e-05          NA
roc_curve.bic.train <- roc.plot(x=(germandata.train$response == "1"), pred = germandata.train.pred.tree2[,2])

roc_curve.bic$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.6671301 4.240929e-05          NA
cost.cart <- function(r, pi){
    weight1 = 5
    weight0 = 1
    c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
    c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
    return(mean(weight1*c1+weight0*c0))
}
cost.cart(germandata.test$response,germandata.test.pruned0.008)
## [1] 0.7
cost.cart(germandata.test$response,germandata.test.default0.01)
## [1] 0.67