Data Preparation
fileUrl1 <- "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
german.data <- read.table(fileUrl1)
dim(german.data)
## [1] 1000 21
str(german.data)
## 'data.frame': 1000 obs. of 21 variables:
## $ V1 : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
## $ V2 : int 6 48 12 42 24 36 24 36 12 30 ...
## $ V3 : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ V4 : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ V5 : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ V6 : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ V7 : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ V8 : int 4 2 2 2 3 2 3 2 2 4 ...
## $ V9 : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
## $ V10: Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
## $ V11: int 4 2 3 4 4 4 4 2 4 2 ...
## $ V12: Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
## $ V13: int 67 22 49 45 53 35 53 35 61 28 ...
## $ V14: Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ V15: Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
## $ V16: int 2 1 1 1 2 1 1 1 1 2 ...
## $ V17: Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
## $ V18: int 1 1 2 2 2 2 1 1 1 1 ...
## $ V19: Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
## $ V20: Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
## $ V21: int 1 2 1 1 2 1 1 1 1 2 ...
names <- c("chk_acct", "duration", "credit_his", "purpose","amount",
"saving_acct","present_emp","installment_rate","marital_sex", "other_debtors","present_resid",
"property","age","other_install","housing","n_credits","job","n_people",
"telephone","foreign","response")
colnames(german.data)<-names
german.data$response <- german.data$response-1
german.data$response <- as.factor(german.data$response)
Split the Data into Training and Testing Set
set.seed(3976527)
trainrows <- sample(1:nrow(german.data), nrow(german.data) * 0.8)
germandata.train <- german.data[trainrows, ]
germandata.test <- german.data[-trainrows,]
Exploratory Data Analysis
Density plot of Continuous Numerical Variables

Ordinal Variables

Categorical Variables

Modeling with All Predictor Variables
Models with Different Link Functions
#+++ logit link
germandata.train.glm0 <- glm(response~., family = binomial, germandata.train)
summary(germandata.train.glm0)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1158 -0.6599 -0.3497 0.6223 2.4966
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.322e-01 1.192e+00 0.782 0.434109
## chk_acctA12 -3.190e-01 2.520e-01 -1.266 0.205592
## chk_acctA13 -1.031e+00 4.187e-01 -2.463 0.013778 *
## chk_acctA14 -1.677e+00 2.659e-01 -6.304 2.89e-10 ***
## duration 2.628e-02 1.053e-02 2.494 0.012615 *
## credit_hisA31 -3.105e-02 6.195e-01 -0.050 0.960029
## credit_hisA32 -5.322e-01 4.639e-01 -1.147 0.251259
## credit_hisA33 -9.140e-01 5.113e-01 -1.788 0.073836 .
## credit_hisA34 -1.505e+00 4.835e-01 -3.112 0.001858 **
## purposeA41 -2.003e+00 4.536e-01 -4.416 1.00e-05 ***
## purposeA410 -2.171e+00 9.007e-01 -2.410 0.015957 *
## purposeA42 -9.508e-01 2.982e-01 -3.189 0.001429 **
## purposeA43 -9.520e-01 2.862e-01 -3.326 0.000882 ***
## purposeA44 -9.301e-01 8.570e-01 -1.085 0.277784
## purposeA45 -8.418e-02 6.118e-01 -0.138 0.890560
## purposeA46 -8.153e-02 4.802e-01 -0.170 0.865173
## purposeA48 -1.894e+00 1.225e+00 -1.546 0.122144
## purposeA49 -7.893e-01 3.824e-01 -2.064 0.039032 *
## amount 1.582e-04 5.119e-05 3.091 0.001997 **
## saving_acctA62 -4.171e-01 3.339e-01 -1.249 0.211700
## saving_acctA63 -3.272e-01 4.664e-01 -0.702 0.482903
## saving_acctA64 -1.023e+00 5.326e-01 -1.921 0.054788 .
## saving_acctA65 -8.894e-01 3.035e-01 -2.930 0.003390 **
## present_empA72 -1.364e-01 4.806e-01 -0.284 0.776601
## present_empA73 -3.421e-01 4.696e-01 -0.728 0.466317
## present_empA74 -1.219e+00 5.163e-01 -2.360 0.018252 *
## present_empA75 -8.353e-01 4.757e-01 -1.756 0.079098 .
## installment_rate 3.494e-01 1.025e-01 3.408 0.000655 ***
## marital_sexA92 -8.170e-01 4.379e-01 -1.866 0.062092 .
## marital_sexA93 -1.057e+00 4.286e-01 -2.467 0.013625 *
## marital_sexA94 -6.999e-01 5.232e-01 -1.338 0.180964
## other_debtorsA102 2.796e-01 4.652e-01 0.601 0.547865
## other_debtorsA103 -7.607e-01 5.076e-01 -1.499 0.134000
## present_resid 1.257e-01 1.009e-01 1.246 0.212872
## propertyA122 5.890e-01 2.938e-01 2.005 0.044969 *
## propertyA123 3.523e-01 2.705e-01 1.302 0.192795
## propertyA124 9.415e-01 4.693e-01 2.006 0.044835 *
## age -2.028e-02 1.050e-02 -1.933 0.053282 .
## other_installA142 -4.484e-01 4.648e-01 -0.965 0.334743
## other_installA143 -6.965e-01 2.708e-01 -2.572 0.010111 *
## housingA152 -3.903e-01 2.612e-01 -1.494 0.135209
## housingA153 -4.876e-01 5.277e-01 -0.924 0.355516
## n_credits 3.281e-01 2.125e-01 1.544 0.122699
## jobA172 5.486e-01 7.314e-01 0.750 0.453215
## jobA173 3.932e-01 7.035e-01 0.559 0.576211
## jobA174 1.527e-01 7.116e-01 0.215 0.830116
## n_people 2.789e-02 2.839e-01 0.098 0.921751
## telephoneA192 -3.468e-01 2.351e-01 -1.475 0.140119
## foreignA202 -1.381e+00 6.512e-01 -2.121 0.033934 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 694.03 on 751 degrees of freedom
## AIC: 792.03
##
## Number of Fisher Scoring iterations: 5
extractAIC(germandata.train.glm0)
## [1] 49.0000 792.0313
#+++ probit link
germandata.train.glm.probit <- glm(response~., family = binomial(link = probit), germandata.train)
summary(germandata.train.glm.probit)
##
## Call:
## glm(formula = response ~ ., family = binomial(link = probit),
## data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0829 -0.6863 -0.3343 0.6481 2.5270
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.261e-01 6.985e-01 0.610 0.54180
## chk_acctA12 -1.818e-01 1.476e-01 -1.232 0.21808
## chk_acctA13 -5.856e-01 2.379e-01 -2.462 0.01383 *
## chk_acctA14 -9.911e-01 1.511e-01 -6.561 5.33e-11 ***
## duration 1.468e-02 6.137e-03 2.391 0.01678 *
## credit_hisA31 2.927e-02 3.622e-01 0.081 0.93558
## credit_hisA32 -2.850e-01 2.715e-01 -1.050 0.29383
## credit_hisA33 -5.215e-01 2.990e-01 -1.744 0.08116 .
## credit_hisA34 -8.642e-01 2.810e-01 -3.075 0.00210 **
## purposeA41 -1.175e+00 2.550e-01 -4.608 4.06e-06 ***
## purposeA410 -1.298e+00 5.205e-01 -2.494 0.01264 *
## purposeA42 -5.223e-01 1.728e-01 -3.022 0.00251 **
## purposeA43 -5.143e-01 1.635e-01 -3.145 0.00166 **
## purposeA44 -5.584e-01 4.984e-01 -1.120 0.26253
## purposeA45 8.759e-03 3.640e-01 0.024 0.98080
## purposeA46 -3.337e-02 2.770e-01 -0.120 0.90411
## purposeA48 -1.134e+00 7.036e-01 -1.611 0.10711
## purposeA49 -4.384e-01 2.203e-01 -1.990 0.04656 *
## amount 9.321e-05 2.970e-05 3.139 0.00170 **
## saving_acctA62 -2.436e-01 1.936e-01 -1.258 0.20838
## saving_acctA63 -1.906e-01 2.590e-01 -0.736 0.46172
## saving_acctA64 -5.643e-01 2.946e-01 -1.915 0.05544 .
## saving_acctA65 -4.707e-01 1.704e-01 -2.763 0.00573 **
## present_empA72 -7.351e-02 2.843e-01 -0.259 0.79598
## present_empA73 -1.821e-01 2.773e-01 -0.657 0.51145
## present_empA74 -6.969e-01 3.010e-01 -2.315 0.02062 *
## present_empA75 -4.813e-01 2.803e-01 -1.717 0.08600 .
## installment_rate 1.984e-01 5.874e-02 3.378 0.00073 ***
## marital_sexA92 -4.535e-01 2.540e-01 -1.785 0.07420 .
## marital_sexA93 -5.860e-01 2.486e-01 -2.357 0.01842 *
## marital_sexA94 -3.837e-01 3.038e-01 -1.263 0.20652
## other_debtorsA102 1.472e-01 2.729e-01 0.540 0.58949
## other_debtorsA103 -4.368e-01 2.876e-01 -1.519 0.12879
## present_resid 6.772e-02 5.808e-02 1.166 0.24364
## propertyA122 3.614e-01 1.679e-01 2.153 0.03136 *
## propertyA123 2.162e-01 1.560e-01 1.386 0.16582
## propertyA124 5.330e-01 2.676e-01 1.992 0.04639 *
## age -1.190e-02 6.003e-03 -1.982 0.04744 *
## other_installA142 -2.851e-01 2.743e-01 -1.039 0.29861
## other_installA143 -4.078e-01 1.574e-01 -2.590 0.00960 **
## housingA152 -2.224e-01 1.514e-01 -1.469 0.14181
## housingA153 -2.112e-01 3.010e-01 -0.702 0.48284
## n_credits 2.105e-01 1.224e-01 1.720 0.08551 .
## jobA172 3.417e-01 4.307e-01 0.793 0.42750
## jobA173 2.454e-01 4.156e-01 0.590 0.55487
## jobA174 1.051e-01 4.210e-01 0.250 0.80277
## n_people 3.228e-02 1.636e-01 0.197 0.84360
## telephoneA192 -1.899e-01 1.336e-01 -1.421 0.15527
## foreignA202 -7.719e-01 3.601e-01 -2.144 0.03207 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 693.72 on 751 degrees of freedom
## AIC: 791.72
##
## Number of Fisher Scoring iterations: 6
extractAIC(germandata.train.glm.probit)
## [1] 49.0000 791.7218
#+++ complementary log-log link
germandata.train.glm.cloglog <- glm(response~., family = binomial(link = cloglog), germandata.train)
summary(germandata.train.glm.cloglog)
##
## Call:
## glm(formula = response ~ ., family = binomial(link = cloglog),
## data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0175 -0.6647 -0.3796 0.5171 2.4561
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.045e-02 8.667e-01 -0.070 0.944397
## chk_acctA12 -2.213e-01 1.794e-01 -1.234 0.217386
## chk_acctA13 -8.916e-01 3.493e-01 -2.552 0.010704 *
## chk_acctA14 -1.306e+00 2.101e-01 -6.214 5.16e-10 ***
## duration 2.127e-02 7.454e-03 2.853 0.004330 **
## credit_hisA31 1.218e-01 3.995e-01 0.305 0.760528
## credit_hisA32 -2.875e-01 3.093e-01 -0.930 0.352613
## credit_hisA33 -5.890e-01 3.508e-01 -1.679 0.093106 .
## credit_hisA34 -1.056e+00 3.327e-01 -3.175 0.001499 **
## purposeA41 -1.510e+00 3.526e-01 -4.284 1.84e-05 ***
## purposeA410 -1.830e+00 6.714e-01 -2.726 0.006413 **
## purposeA42 -7.384e-01 2.169e-01 -3.404 0.000664 ***
## purposeA43 -7.530e-01 2.108e-01 -3.573 0.000353 ***
## purposeA44 -7.992e-01 6.454e-01 -1.238 0.215562
## purposeA45 -1.861e-01 4.444e-01 -0.419 0.675451
## purposeA46 -1.203e-01 3.362e-01 -0.358 0.720445
## purposeA48 -1.486e+00 1.030e+00 -1.442 0.149372
## purposeA49 -6.578e-01 2.813e-01 -2.338 0.019381 *
## amount 1.133e-04 3.623e-05 3.126 0.001769 **
## saving_acctA62 -2.390e-01 2.410e-01 -0.992 0.321412
## saving_acctA63 -5.010e-01 3.918e-01 -1.279 0.200954
## saving_acctA64 -8.509e-01 4.360e-01 -1.952 0.050995 .
## saving_acctA65 -7.955e-01 2.450e-01 -3.248 0.001164 **
## present_empA72 -1.451e-01 3.406e-01 -0.426 0.670147
## present_empA73 -2.094e-01 3.348e-01 -0.625 0.531686
## present_empA74 -9.059e-01 3.753e-01 -2.414 0.015777 *
## present_empA75 -5.833e-01 3.437e-01 -1.697 0.089714 .
## installment_rate 3.041e-01 7.742e-02 3.927 8.59e-05 ***
## marital_sexA92 -5.740e-01 3.065e-01 -1.873 0.061119 .
## marital_sexA93 -7.716e-01 3.029e-01 -2.547 0.010865 *
## marital_sexA94 -4.633e-01 3.785e-01 -1.224 0.220931
## other_debtorsA102 8.476e-02 3.426e-01 0.247 0.804586
## other_debtorsA103 -5.941e-01 3.898e-01 -1.524 0.127514
## present_resid 1.296e-01 7.471e-02 1.735 0.082680 .
## propertyA122 4.513e-01 2.246e-01 2.010 0.044482 *
## propertyA123 2.600e-01 2.078e-01 1.251 0.210881
## propertyA124 7.602e-01 3.336e-01 2.279 0.022668 *
## age -1.618e-02 7.826e-03 -2.067 0.038731 *
## other_installA142 -3.388e-01 3.383e-01 -1.001 0.316593
## other_installA143 -5.293e-01 1.979e-01 -2.675 0.007476 **
## housingA152 -2.668e-01 1.909e-01 -1.398 0.162105
## housingA153 -4.415e-01 3.671e-01 -1.203 0.229027
## n_credits 2.726e-01 1.564e-01 1.743 0.081256 .
## jobA172 3.035e-01 5.300e-01 0.573 0.566913
## jobA173 1.956e-01 5.087e-01 0.385 0.700553
## jobA174 -1.453e-02 5.178e-01 -0.028 0.977613
## n_people 6.956e-02 2.169e-01 0.321 0.748479
## telephoneA192 -2.498e-01 1.737e-01 -1.438 0.150373
## foreignA202 -9.862e-01 5.282e-01 -1.867 0.061864 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 693.04 on 751 degrees of freedom
## AIC: 791.04
##
## Number of Fisher Scoring iterations: 12
extractAIC(germandata.train.glm.cloglog)
## [1] 49.0000 791.0441
Comparison
Deviance
rbind(c("logit", "probit", "cloglog"), c(germandata.train.glm0$deviance, germandata.train.glm.probit$deviance, germandata.train.glm.cloglog$deviance))
## [,1] [,2] [,3]
## [1,] "logit" "probit" "cloglog"
## [2,] "694.031349331065" "693.721758804378" "693.044063965851"
Misclassification Rate
pred.glm.gtrain.glm0 <- predict(germandata.train.glm0, type = "response")
pred.glm.gtrain.glm.probit <- predict(germandata.train.glm.probit, type = "response")
pred.glm.gtrain.glm.cloglog <- predict(germandata.train.glm.cloglog, type = "response")
#+++++++++++++++++++++ Misclassification Rate, using pcut=1/6
#+++++logit link
pcut <- 1/6
class.pred.train.glm0 <- (pred.glm.gtrain.glm0 > pcut)*1
MR.glm0 <- mean(germandata.train$response!=class.pred.train.glm0)
MR.glm0
## [1] 0.335
table(germandata.train$response, class.pred.train.glm0, dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 325 238
## 1 30 207
#+++++probit link
class.pred.train.probit <- (pred.glm.gtrain.glm.probit > pcut)*1
MR.glm.probit <- mean(germandata.train$response!=class.pred.train.probit)
MR.glm.probit
## [1] 0.34875
table(germandata.train$response, class.pred.train.probit, dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 312 251
## 1 28 209
#+++++cloglog link
class.pred.train.cloglog <- (pred.glm.gtrain.glm.cloglog > pcut)*1
MR.glm.cloglog <- mean(germandata.train$response!=class.pred.train.cloglog)
MR.glm.cloglog
## [1] 0.33625
table(germandata.train$response, class.pred.train.probit, dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 312 251
## 1 28 209
Area under the ROC Curve (AUC)
# +++++ area under the curve
library(verification)
roc.logit <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm0)

roc.logit$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8429151 2.355037e-53 NA
roc.probit <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm.probit)

roc.probit$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8422031 3.8426e-53 NA
roc.cloglog <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm.cloglog)

roc.cloglog$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8391453 3.110441e-52 NA
Variable Selection
Variable selection: AIC, BIC, LASSO
AIC
germandata.glm.AIC <- step(germandata.train.glm0)
## Start: AIC=792.03
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 695.22 787.22
## - n_people 1 694.04 790.04
## - housing 2 696.44 790.44
## - other_debtors 2 696.94 790.94
## - present_resid 1 695.59 791.59
## - property 3 700.01 792.01
## <none> 694.03 792.03
## - telephone 1 696.23 792.23
## - n_credits 1 696.42 792.42
## - marital_sex 3 700.54 792.54
## - age 1 697.85 793.85
## - other_install 2 700.62 794.62
## - foreign 1 699.36 795.36
## - saving_acct 4 706.21 796.21
## - duration 1 700.27 796.27
## - present_emp 4 707.99 797.99
## - amount 1 703.85 799.85
## - credit_his 4 711.43 801.43
## - installment_rate 1 706.04 802.04
## - purpose 9 729.28 809.28
## - chk_acct 3 743.44 835.44
##
## Step: AIC=787.22
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - n_people 1 695.25 785.25
## - housing 2 697.42 785.42
## - other_debtors 2 698.00 786.00
## - property 3 700.78 786.78
## - present_resid 1 697.20 787.20
## <none> 695.22 787.22
## - n_credits 1 697.44 787.44
## - marital_sex 3 701.65 787.65
## - telephone 1 698.77 788.77
## - age 1 698.88 788.88
## - other_install 2 702.08 790.08
## - foreign 1 700.51 790.51
## - saving_acct 4 707.66 791.66
## - duration 1 701.81 791.81
## - present_emp 4 708.88 792.88
## - amount 1 704.69 794.69
## - credit_his 4 712.43 796.43
## - installment_rate 1 706.94 796.94
## - purpose 9 730.75 804.75
## - chk_acct 3 744.80 830.80
##
## Step: AIC=785.25
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - housing 2 697.43 783.43
## - other_debtors 2 698.00 784.00
## - property 3 700.80 784.80
## - present_resid 1 697.23 785.23
## <none> 695.25 785.25
## - n_credits 1 697.54 785.54
## - marital_sex 3 701.73 785.73
## - telephone 1 698.84 786.84
## - age 1 698.89 786.89
## - other_install 2 702.13 788.13
## - foreign 1 700.53 788.53
## - saving_acct 4 707.66 789.66
## - duration 1 701.82 789.82
## - present_emp 4 708.90 790.90
## - amount 1 704.70 792.70
## - credit_his 4 712.52 794.52
## - installment_rate 1 706.95 794.95
## - purpose 9 730.76 802.76
## - chk_acct 3 745.00 829.00
##
## Step: AIC=783.43
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## n_credits + telephone + foreign
##
## Df Deviance AIC
## - other_debtors 2 699.97 781.97
## <none> 697.43 783.43
## - property 3 703.65 783.65
## - n_credits 1 699.67 783.67
## - marital_sex 3 704.65 784.65
## - present_resid 1 700.74 784.74
## - telephone 1 700.92 784.92
## - other_install 2 704.11 786.11
## - age 1 702.16 786.16
## - foreign 1 702.42 786.42
## - saving_acct 4 709.53 787.53
## - duration 1 703.61 787.61
## - present_emp 4 711.37 789.37
## - amount 1 707.34 791.34
## - credit_his 4 715.17 793.17
## - installment_rate 1 709.24 793.24
## - purpose 9 733.61 801.61
## - chk_acct 3 749.26 829.26
##
## Step: AIC=781.97
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## present_resid + property + age + other_install + n_credits +
## telephone + foreign
##
## Df Deviance AIC
## <none> 699.97 781.97
## - n_credits 1 702.37 782.37
## - property 3 706.41 782.41
## - marital_sex 3 707.25 783.25
## - telephone 1 703.29 783.29
## - present_resid 1 703.35 783.35
## - other_install 2 706.21 784.21
## - age 1 705.08 785.08
## - saving_acct 4 711.37 785.37
## - foreign 1 705.61 785.61
## - duration 1 705.89 785.89
## - present_emp 4 714.06 788.06
## - amount 1 710.64 790.64
## - credit_his 4 717.21 791.21
## - installment_rate 1 712.15 792.15
## - purpose 9 737.36 801.36
## - chk_acct 3 750.98 826.98
AIC(germandata.glm.AIC)
## [1] 781.9713
BIC(germandata.glm.AIC)
## [1] 974.0404
summary(germandata.glm.AIC)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## amount + saving_acct + present_emp + installment_rate + marital_sex +
## present_resid + property + age + other_install + n_credits +
## telephone + foreign, family = binomial, data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1489 -0.6862 -0.3577 0.6559 2.5428
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.359e-01 1.026e+00 0.815 0.415334
## chk_acctA12 -3.623e-01 2.473e-01 -1.465 0.142939
## chk_acctA13 -1.061e+00 4.174e-01 -2.542 0.011026 *
## chk_acctA14 -1.699e+00 2.639e-01 -6.436 1.23e-10 ***
## duration 2.498e-02 1.027e-02 2.433 0.014984 *
## credit_hisA31 -8.397e-02 6.096e-01 -0.138 0.890433
## credit_hisA32 -5.320e-01 4.607e-01 -1.155 0.248163
## credit_hisA33 -8.832e-01 5.073e-01 -1.741 0.081702 .
## credit_hisA34 -1.497e+00 4.792e-01 -3.124 0.001786 **
## purposeA41 -2.033e+00 4.478e-01 -4.540 5.62e-06 ***
## purposeA410 -2.194e+00 8.841e-01 -2.481 0.013095 *
## purposeA42 -9.264e-01 2.946e-01 -3.145 0.001663 **
## purposeA43 -9.950e-01 2.812e-01 -3.538 0.000403 ***
## purposeA44 -9.784e-01 8.607e-01 -1.137 0.255672
## purposeA45 -9.408e-02 5.960e-01 -0.158 0.874559
## purposeA46 -1.514e-02 4.719e-01 -0.032 0.974405
## purposeA48 -1.768e+00 1.206e+00 -1.467 0.142505
## purposeA49 -7.603e-01 3.769e-01 -2.017 0.043656 *
## amount 1.571e-04 4.903e-05 3.204 0.001356 **
## saving_acctA62 -3.185e-01 3.260e-01 -0.977 0.328537
## saving_acctA63 -2.794e-01 4.635e-01 -0.603 0.546622
## saving_acctA64 -9.372e-01 5.192e-01 -1.805 0.071030 .
## saving_acctA65 -8.643e-01 3.005e-01 -2.876 0.004022 **
## present_empA72 1.009e-01 4.286e-01 0.235 0.813843
## present_empA73 -1.565e-01 4.065e-01 -0.385 0.700208
## present_empA74 -1.008e+00 4.564e-01 -2.208 0.027273 *
## present_empA75 -6.643e-01 4.246e-01 -1.564 0.117703
## installment_rate 3.431e-01 9.995e-02 3.432 0.000598 ***
## marital_sexA92 -7.657e-01 4.324e-01 -1.771 0.076554 .
## marital_sexA93 -1.062e+00 4.207e-01 -2.525 0.011579 *
## marital_sexA94 -6.811e-01 5.196e-01 -1.311 0.189931
## present_resid 1.752e-01 9.579e-02 1.829 0.067367 .
## propertyA122 5.602e-01 2.892e-01 1.937 0.052727 .
## propertyA123 3.509e-01 2.655e-01 1.322 0.186254
## propertyA124 8.130e-01 3.475e-01 2.339 0.019318 *
## age -2.275e-02 1.022e-02 -2.227 0.025955 *
## other_installA142 -4.565e-01 4.613e-01 -0.990 0.322327
## other_installA143 -6.718e-01 2.678e-01 -2.509 0.012124 *
## n_credits 3.238e-01 2.095e-01 1.546 0.122171
## telephoneA192 -3.981e-01 2.203e-01 -1.807 0.070696 .
## foreignA202 -1.409e+00 6.512e-01 -2.163 0.030518 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 699.97 on 759 degrees of freedom
## AIC: 781.97
##
## Number of Fisher Scoring iterations: 5
mean.residual.d.AIC <- germandata.glm.AIC$deviance/germandata.glm.AIC$df.residual
mean.residual.d.AIC
## [1] 0.9222284
BIC
germandata.glm.BIC <- step(germandata.train.glm0, k = log(nrow(germandata.train)))
## Start: AIC=1021.58
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - purpose 9 729.28 996.66
## - job 3 695.22 1002.72
## - saving_acct 4 706.21 1007.02
## - property 3 700.01 1007.50
## - marital_sex 3 700.54 1008.03
## - present_emp 4 707.99 1008.80
## - housing 2 696.44 1010.62
## - other_debtors 2 696.94 1011.11
## - credit_his 4 711.43 1012.23
## - other_install 2 700.62 1014.80
## - n_people 1 694.04 1014.90
## - present_resid 1 695.59 1016.45
## - telephone 1 696.23 1017.09
## - n_credits 1 696.42 1017.28
## - age 1 697.85 1018.71
## - foreign 1 699.36 1020.22
## - duration 1 700.27 1021.13
## <none> 694.03 1021.58
## - amount 1 703.85 1024.71
## - installment_rate 1 706.04 1026.90
## - chk_acct 3 743.44 1050.93
##
## Step: AIC=996.66
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## present_emp + installment_rate + marital_sex + other_debtors +
## present_resid + property + age + other_install + housing +
## n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 730.75 978.08
## - marital_sex 3 734.57 981.90
## - saving_acct 4 741.96 982.61
## - property 3 735.60 982.93
## - present_emp 4 743.19 983.83
## - housing 2 732.39 986.41
## - credit_his 4 746.41 987.05
## - other_debtors 2 733.50 987.51
## - other_install 2 734.75 988.77
## - n_people 1 729.29 989.99
## - present_resid 1 730.05 990.75
## - age 1 730.63 991.33
## - n_credits 1 732.02 992.72
## - telephone 1 732.13 992.83
## - foreign 1 732.97 993.67
## - duration 1 734.56 995.26
## - amount 1 735.70 996.40
## <none> 729.28 996.66
## - installment_rate 1 742.22 1002.92
## - chk_acct 3 783.25 1030.58
##
## Step: AIC=978.08
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## present_emp + installment_rate + marital_sex + other_debtors +
## present_resid + property + age + other_install + housing +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - marital_sex 3 735.93 963.21
## - property 3 736.60 963.88
## - saving_acct 4 743.39 963.98
## - present_emp 4 744.66 965.25
## - housing 2 733.61 967.57
## - credit_his 4 748.11 968.71
## - other_debtors 2 734.81 968.78
## - other_install 2 736.31 970.27
## - n_people 1 730.76 971.41
## - present_resid 1 731.84 972.49
## - age 1 732.01 972.66
## - n_credits 1 733.45 974.09
## - foreign 1 734.46 975.10
## - telephone 1 735.67 976.31
## - amount 1 736.43 977.07
## - duration 1 736.58 977.22
## <none> 730.75 978.08
## - installment_rate 1 742.98 983.62
## - chk_acct 3 785.12 1012.40
##
## Step: AIC=963.21
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## present_emp + installment_rate + other_debtors + present_resid +
## property + age + other_install + housing + n_credits + n_people +
## telephone + foreign
##
## Df Deviance AIC
## - property 3 741.59 948.81
## - saving_acct 4 748.57 949.11
## - present_emp 4 752.44 952.98
## - housing 2 739.61 953.52
## - other_debtors 2 740.10 954.01
## - credit_his 4 753.77 954.31
## - other_install 2 741.23 955.13
## - n_people 1 735.99 956.59
## - present_resid 1 736.74 957.34
## - age 1 736.87 957.47
## - n_credits 1 738.58 959.17
## - foreign 1 739.79 960.38
## - amount 1 741.21 961.81
## - telephone 1 741.29 961.88
## - duration 1 742.17 962.76
## <none> 735.93 963.21
## - installment_rate 1 746.12 966.71
## - chk_acct 3 791.75 998.98
##
## Step: AIC=948.81
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## present_emp + installment_rate + other_debtors + present_resid +
## age + other_install + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - saving_acct 4 753.53 934.01
## - present_emp 4 757.92 938.40
## - housing 2 745.32 939.17
## - other_debtors 2 745.78 939.63
## - credit_his 4 759.19 939.67
## - other_install 2 747.87 941.72
## - n_people 1 741.66 942.20
## - present_resid 1 742.32 942.86
## - age 1 742.55 943.08
## - n_credits 1 743.79 944.33
## - foreign 1 745.38 945.91
## - telephone 1 745.83 946.37
## <none> 741.59 948.81
## - amount 1 748.37 948.91
## - duration 1 748.55 949.09
## - installment_rate 1 752.63 953.17
## - chk_acct 3 798.74 985.91
##
## Step: AIC=934.01
## response ~ chk_acct + duration + credit_his + amount + present_emp +
## installment_rate + other_debtors + present_resid + age +
## other_install + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - other_debtors 2 756.39 923.50
## - housing 2 757.04 924.16
## - credit_his 4 771.65 925.40
## - present_emp 4 772.38 926.12
## - other_install 2 759.68 926.79
## - n_people 1 753.72 927.52
## - present_resid 1 753.97 927.77
## - age 1 754.65 928.45
## - n_credits 1 756.17 929.97
## - foreign 1 758.05 931.85
## - telephone 1 758.45 932.25
## - amount 1 759.87 933.66
## <none> 753.53 934.01
## - duration 1 760.43 934.23
## - installment_rate 1 764.30 938.10
## - chk_acct 3 823.83 984.26
##
## Step: AIC=923.5
## response ~ chk_acct + duration + credit_his + amount + present_emp +
## installment_rate + present_resid + age + other_install +
## housing + n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - housing 2 759.90 913.65
## - credit_his 4 774.14 914.52
## - other_install 2 762.12 915.87
## - present_emp 4 775.88 916.26
## - n_people 1 756.65 917.08
## - present_resid 1 756.92 917.35
## - age 1 757.78 918.21
## - n_credits 1 759.24 919.67
## - telephone 1 761.06 921.49
## - foreign 1 761.69 922.13
## - duration 1 763.04 923.47
## <none> 756.39 923.50
## - amount 1 763.53 923.97
## - installment_rate 1 767.37 927.80
## - chk_acct 3 825.48 972.54
##
## Step: AIC=913.65
## response ~ chk_acct + duration + credit_his + amount + present_emp +
## installment_rate + present_resid + age + other_install +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - credit_his 4 778.62 905.63
## - other_install 2 765.66 906.04
## - n_people 1 760.17 907.23
## - present_emp 4 780.76 907.77
## - age 1 761.77 908.83
## - present_resid 1 762.02 909.08
## - n_credits 1 762.50 909.56
## - telephone 1 764.21 911.27
## - foreign 1 765.15 912.21
## <none> 759.90 913.65
## - duration 1 766.87 913.93
## - amount 1 767.22 914.28
## - installment_rate 1 771.04 918.10
## - chk_acct 3 833.14 966.83
##
## Step: AIC=905.63
## response ~ chk_acct + duration + amount + present_emp + installment_rate +
## present_resid + age + other_install + n_credits + n_people +
## telephone + foreign
##
## Df Deviance AIC
## - n_credits 1 778.64 898.96
## - n_people 1 778.78 899.11
## - present_resid 1 780.70 901.03
## - other_install 2 787.95 901.59
## - age 1 781.40 901.72
## - present_emp 4 801.58 901.85
## - telephone 1 782.80 903.12
## - foreign 1 785.14 905.46
## <none> 778.62 905.63
## - amount 1 785.58 905.90
## - duration 1 787.42 907.75
## - installment_rate 1 789.61 909.93
## - chk_acct 3 866.00 972.96
##
## Step: AIC=898.96
## response ~ chk_acct + duration + amount + present_emp + installment_rate +
## present_resid + age + other_install + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - n_people 1 778.80 892.43
## - present_resid 1 780.77 894.41
## - other_install 2 787.96 894.92
## - age 1 781.40 895.04
## - present_emp 4 801.62 895.20
## - telephone 1 782.80 896.44
## - foreign 1 785.15 898.78
## <none> 778.64 898.96
## - amount 1 785.64 899.28
## - duration 1 787.43 901.07
## - installment_rate 1 789.64 903.27
## - chk_acct 3 866.47 966.74
##
## Step: AIC=892.43
## response ~ chk_acct + duration + amount + present_emp + installment_rate +
## present_resid + age + other_install + telephone + foreign
##
## Df Deviance AIC
## - present_resid 1 780.90 887.85
## - other_install 2 788.00 888.27
## - age 1 781.73 888.69
## - present_emp 4 802.13 889.03
## - telephone 1 782.92 889.87
## - foreign 1 785.33 892.28
## <none> 778.80 892.43
## - amount 1 785.83 892.79
## - duration 1 787.58 894.53
## - installment_rate 1 789.91 896.87
## - chk_acct 3 866.48 960.06
##
## Step: AIC=887.85
## response ~ chk_acct + duration + amount + present_emp + installment_rate +
## age + other_install + telephone + foreign
##
## Df Deviance AIC
## - present_emp 4 802.43 882.65
## - age 1 783.20 883.47
## - other_install 2 790.04 883.63
## - telephone 1 784.60 884.87
## - foreign 1 787.51 887.78
## - amount 1 787.52 887.79
## <none> 780.90 887.85
## - duration 1 790.25 890.52
## - installment_rate 1 791.92 892.19
## - chk_acct 3 870.54 957.44
##
## Step: AIC=882.65
## response ~ chk_acct + duration + amount + installment_rate +
## age + other_install + telephone + foreign
##
## Df Deviance AIC
## - other_install 2 810.90 877.74
## - telephone 1 806.58 880.11
## - foreign 1 808.55 882.08
## - age 1 808.68 882.22
## - amount 1 808.91 882.44
## <none> 802.43 882.65
## - duration 1 809.71 883.24
## - installment_rate 1 812.15 885.69
## - chk_acct 3 900.46 960.63
##
## Step: AIC=877.74
## response ~ chk_acct + duration + amount + installment_rate +
## age + telephone + foreign
##
## Df Deviance AIC
## - telephone 1 815.03 875.19
## - age 1 816.16 876.32
## - foreign 1 816.47 876.63
## - amount 1 817.38 877.54
## <none> 810.90 877.74
## - duration 1 818.52 878.68
## - installment_rate 1 820.41 880.57
## - chk_acct 3 909.11 955.90
##
## Step: AIC=875.19
## response ~ chk_acct + duration + amount + installment_rate +
## age + foreign
##
## Df Deviance AIC
## - amount 1 819.23 872.71
## - foreign 1 819.82 873.30
## - age 1 821.70 875.17
## <none> 815.03 875.19
## - installment_rate 1 823.47 876.95
## - duration 1 823.92 877.40
## - chk_acct 3 915.39 955.50
##
## Step: AIC=872.71
## response ~ chk_acct + duration + installment_rate + age + foreign
##
## Df Deviance AIC
## - foreign 1 823.52 870.31
## - installment_rate 1 824.44 871.23
## - age 1 824.78 871.57
## <none> 819.23 872.71
## - duration 1 851.13 897.92
## - chk_acct 3 918.43 951.85
##
## Step: AIC=870.31
## response ~ chk_acct + duration + installment_rate + age
##
## Df Deviance AIC
## - age 1 828.92 869.03
## - installment_rate 1 829.50 869.61
## <none> 823.52 870.31
## - duration 1 858.12 898.23
## - chk_acct 3 920.42 947.16
##
## Step: AIC=869.03
## response ~ chk_acct + duration + installment_rate
##
## Df Deviance AIC
## - installment_rate 1 834.25 867.67
## <none> 828.92 869.03
## - duration 1 864.27 897.69
## - chk_acct 3 926.95 947.00
##
## Step: AIC=867.67
## response ~ chk_acct + duration
##
## Df Deviance AIC
## <none> 834.25 867.67
## - duration 1 870.67 897.41
## - chk_acct 3 930.49 943.86
AIC(germandata.glm.BIC)
## [1] 844.2468
BIC(germandata.glm.BIC)
## [1] 867.6699
summary(germandata.glm.BIC)
##
## Call:
## glm(formula = response ~ chk_acct + duration, family = binomial,
## data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6540 -0.8661 -0.4857 0.9688 2.3389
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.952041 0.198218 -4.803 1.56e-06 ***
## chk_acctA12 -0.415088 0.202532 -2.049 0.04041 *
## chk_acctA13 -1.142595 0.369528 -3.092 0.00199 **
## chk_acctA14 -1.960184 0.225469 -8.694 < 2e-16 ***
## duration 0.040686 0.006906 5.891 3.83e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 834.25 on 795 degrees of freedom
## AIC: 844.25
##
## Number of Fisher Scoring iterations: 4
mean.residual.d.BIC <- germandata.glm.BIC$deviance/germandata.glm.BIC$df.residual
mean.residual.d.BIC
## [1] 1.049367
LASSO
# scale numerical variables
nums <- sapply(germandata.train, is.numeric)
nums <- as.vector(which(nums==TRUE))
germandata.nums.std <- scale(german.data[,nums])
# change categorical variables to dummy variables
dummy <- model.matrix( ~ chk_acct+credit_his+purpose+saving_acct+present_emp+
marital_sex+other_debtors+property+other_install+housing+job+telephone+foreign-1,
data= german.data)
y.std <- german.data$response
germandata.std <- cbind(dummy,germandata.nums.std,y.std)
head(germandata.std)
## chk_acctA11 chk_acctA12 chk_acctA13 chk_acctA14 credit_hisA31
## 1 1 0 0 0 0
## 2 0 1 0 0 0
## 3 0 0 0 1 0
## 4 1 0 0 0 0
## 5 1 0 0 0 0
## 6 0 0 0 1 0
## credit_hisA32 credit_hisA33 credit_hisA34 purposeA41 purposeA410
## 1 0 0 1 0 0
## 2 1 0 0 0 0
## 3 0 0 1 0 0
## 4 1 0 0 0 0
## 5 0 1 0 0 0
## 6 1 0 0 0 0
## purposeA42 purposeA43 purposeA44 purposeA45 purposeA46 purposeA48
## 1 0 1 0 0 0 0
## 2 0 1 0 0 0 0
## 3 0 0 0 0 1 0
## 4 1 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 1 0
## purposeA49 saving_acctA62 saving_acctA63 saving_acctA64 saving_acctA65
## 1 0 0 0 0 1
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 1
## present_empA72 present_empA73 present_empA74 present_empA75
## 1 0 0 0 1
## 2 0 1 0 0
## 3 0 0 1 0
## 4 0 0 1 0
## 5 0 1 0 0
## 6 0 1 0 0
## marital_sexA92 marital_sexA93 marital_sexA94 other_debtorsA102
## 1 0 1 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 1 0 0
## 5 0 1 0 0
## 6 0 1 0 0
## other_debtorsA103 propertyA122 propertyA123 propertyA124
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 1 1 0 0
## 5 0 0 0 1
## 6 0 0 0 1
## other_installA142 other_installA143 housingA152 housingA153 jobA172
## 1 0 1 1 0 0
## 2 0 1 1 0 0
## 3 0 1 1 0 1
## 4 0 1 0 1 0
## 5 0 1 0 1 0
## 6 0 1 0 1 1
## jobA173 jobA174 telephoneA192 foreignA202 duration amount
## 1 1 0 1 0 -1.2358595 -0.7447588
## 2 1 0 0 0 2.2470700 0.9493418
## 3 0 0 0 0 -0.7382981 -0.4163541
## 4 1 0 0 0 1.7495086 1.6334296
## 5 1 0 0 0 0.2568246 0.5663801
## 6 0 0 1 0 1.2519473 2.0489838
## installment_rate present_resid age n_credits n_people y.std
## 1 0.91801781 1.0464631 2.76507291 1.0265652 -0.4280754 1
## 2 -0.86974813 -0.7655942 -1.19080809 -0.7045734 -0.4280754 2
## 3 -0.86974813 0.1404344 1.18272051 -0.7045734 2.3337012 1
## 4 -0.86974813 1.0464631 0.83108664 -0.7045734 2.3337012 1
## 5 0.02413484 1.0464631 1.53435438 1.0265652 2.3337012 2
## 6 -0.86974813 1.0464631 -0.04799802 -0.7045734 2.3337012 1
dim(germandata.std)
## [1] 1000 50
X.train <- as.matrix(germandata.std[trainrows,-50])
Y.train <- as.factor(germandata.std[trainrows, 50])
library(glmnet)
lasso.fit<- glmnet(x=X.train, y=Y.train, family = "binomial", alpha = 1)
summary(lasso.fit)
## Length Class Mode
## a0 76 -none- numeric
## beta 3724 dgCMatrix S4
## df 76 -none- numeric
## dim 2 -none- numeric
## lambda 76 -none- numeric
## dev.ratio 76 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
plot(lasso.fit, xvar = "lambda", label=TRUE)

cv.lasso<- cv.glmnet(x = X.train, y = Y.train, family = "binomial", alpha = 1, nfolds = 10)
plot(cv.lasso)

cv.lasso$lambda.min
## [1] 0.005578583
coef(lasso.fit, s=cv.lasso$lambda.min)
## 50 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.00777559
## chk_acctA11 0.37166592
## chk_acctA12 .
## chk_acctA13 -0.50844111
## chk_acctA14 -1.22534962
## credit_hisA31 0.32612154
## credit_hisA32 .
## credit_hisA33 -0.21399751
## credit_hisA34 -0.75321515
## purposeA41 -1.44364971
## purposeA410 -1.22121852
## purposeA42 -0.44145508
## purposeA43 -0.61670344
## purposeA44 -0.33754980
## purposeA45 0.11918168
## purposeA46 0.05552924
## purposeA48 -0.81661282
## purposeA49 -0.24642018
## saving_acctA62 -0.08048020
## saving_acctA63 -0.15969592
## saving_acctA64 -0.73023981
## saving_acctA65 -0.65496338
## present_empA72 0.19479054
## present_empA73 .
## present_empA74 -0.65913015
## present_empA75 -0.33492568
## marital_sexA92 .
## marital_sexA93 -0.27173384
## marital_sexA94 .
## other_debtorsA102 0.09789027
## other_debtorsA103 -0.47461284
## propertyA122 0.21376361
## propertyA123 0.09163231
## propertyA124 0.35070600
## other_installA142 -0.13957622
## other_installA143 -0.49138634
## housingA152 -0.27145292
## housingA153 .
## jobA172 .
## jobA173 .
## jobA174 .
## telephoneA192 -0.26164989
## foreignA202 -0.85692089
## duration 0.30013208
## amount 0.34005439
## installment_rate 0.27168363
## present_resid 0.06207061
## age -0.13786597
## n_credits 0.10649131
## n_people .
#++++++++++++++++++++ mean residual diviance of LASSO
germandata.glm.lasso <- glm(response~.-n_people-job, family = "binomial", data = germandata.train)
summary(germandata.glm.lasso)
##
## Call:
## glm(formula = response ~ . - n_people - job, family = "binomial",
## data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1003 -0.6622 -0.3514 0.6386 2.5483
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.171e+00 1.049e+00 1.117 0.264098
## chk_acctA12 -3.286e-01 2.503e-01 -1.313 0.189291
## chk_acctA13 -1.055e+00 4.186e-01 -2.521 0.011690 *
## chk_acctA14 -1.676e+00 2.650e-01 -6.325 2.53e-10 ***
## duration 2.662e-02 1.040e-02 2.559 0.010487 *
## credit_hisA31 -4.295e-02 6.152e-01 -0.070 0.944342
## credit_hisA32 -5.250e-01 4.630e-01 -1.134 0.256835
## credit_hisA33 -9.025e-01 5.089e-01 -1.774 0.076127 .
## credit_hisA34 -1.492e+00 4.817e-01 -3.098 0.001951 **
## purposeA41 -1.984e+00 4.486e-01 -4.422 9.79e-06 ***
## purposeA410 -2.241e+00 9.007e-01 -2.488 0.012829 *
## purposeA42 -9.439e-01 2.965e-01 -3.184 0.001454 **
## purposeA43 -9.500e-01 2.853e-01 -3.330 0.000868 ***
## purposeA44 -9.719e-01 8.615e-01 -1.128 0.259256
## purposeA45 -6.826e-02 6.075e-01 -0.112 0.910533
## purposeA46 -6.997e-02 4.778e-01 -0.146 0.883568
## purposeA48 -1.814e+00 1.220e+00 -1.487 0.137022
## purposeA49 -7.789e-01 3.801e-01 -2.049 0.040477 *
## amount 1.503e-04 4.957e-05 3.031 0.002437 **
## saving_acctA62 -3.851e-01 3.309e-01 -1.164 0.244511
## saving_acctA63 -3.245e-01 4.637e-01 -0.700 0.484043
## saving_acctA64 -1.011e+00 5.273e-01 -1.917 0.055178 .
## saving_acctA65 -8.986e-01 3.021e-01 -2.975 0.002930 **
## present_empA72 6.830e-02 4.303e-01 0.159 0.873891
## present_empA73 -1.353e-01 4.075e-01 -0.332 0.739906
## present_empA74 -1.010e+00 4.577e-01 -2.206 0.027356 *
## present_empA75 -6.549e-01 4.254e-01 -1.540 0.123636
## installment_rate 3.399e-01 1.010e-01 3.366 0.000762 ***
## marital_sexA92 -8.055e-01 4.347e-01 -1.853 0.063847 .
## marital_sexA93 -1.036e+00 4.229e-01 -2.451 0.014261 *
## marital_sexA94 -6.953e-01 5.209e-01 -1.335 0.181984
## other_debtorsA102 2.820e-01 4.671e-01 0.604 0.546028
## other_debtorsA103 -7.361e-01 5.073e-01 -1.451 0.146771
## present_resid 1.402e-01 9.992e-02 1.403 0.160559
## propertyA122 5.791e-01 2.912e-01 1.989 0.046748 *
## propertyA123 3.244e-01 2.670e-01 1.215 0.224338
## propertyA124 8.674e-01 4.650e-01 1.865 0.062116 .
## age -1.963e-02 1.040e-02 -1.887 0.059175 .
## other_installA142 -4.456e-01 4.642e-01 -0.960 0.337149
## other_installA143 -7.075e-01 2.695e-01 -2.626 0.008646 **
## housingA152 -3.700e-01 2.596e-01 -1.425 0.154119
## housingA153 -4.570e-01 5.262e-01 -0.869 0.385118
## n_credits 3.182e-01 2.104e-01 1.512 0.130547
## telephoneA192 -4.165e-01 2.216e-01 -1.880 0.060174 .
## foreignA202 -1.376e+00 6.522e-01 -2.110 0.034828 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 972.25 on 799 degrees of freedom
## Residual deviance: 695.25 on 755 degrees of freedom
## AIC: 785.25
##
## Number of Fisher Scoring iterations: 5
mean.residual.d.LASSO <- germandata.glm.lasso$deviance/germandata.glm.lasso$df.residual
mean.residual.d.LASSO # not rigorous
## [1] 0.9208569
pred.lasso.train<- predict(germandata.glm.lasso, newx = X.train, s=cv.lasso$lambda.min)
In-Sample Prediction of the Three Models
Misclassification Rate
pred.glm.train.AIC <- predict(germandata.glm.AIC, type = "response")
pred.glm.train.BIC <- predict(germandata.glm.BIC, type = "response")
pred.glm.train.LASSO<- predict(lasso.fit, newx = X.train, s=cv.lasso$lambda.min)
pcut <- 1/6
class.pred.glm.train.AIC <- (pred.glm.train.AIC>pcut)*1
class.pred.glm.train.BIC <- (pred.glm.train.BIC>pcut)*1
class.pred.glm.train.LASSO <- (pred.glm.train.LASSO>pcut)*1
MR.AIC <- mean(germandata.train$response!=class.pred.glm.train.AIC)
MR.AIC
## [1] 0.3325
MR.BIC <- mean(germandata.train$response!=class.pred.glm.train.BIC)
MR.BIC
## [1] 0.4175
MR.LASSO <- mean(germandata.train$response!=class.pred.glm.train.LASSO)
MR.LASSO
## [1] 0.2125
Area Under the Curve
roc_curve.aic <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.AIC, main = "AIC")

roc_curve.aic$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8404868 1.24557e-52 NA
roc_curve.bic <-roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.BIC, main = "BIC")

roc_curve.bic$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.7513209 1.215515e-29 NA
roc_curve.lasso <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.train.LASSO, main = "LASSO")

roc_curve.lasso$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.83614 2.3854e-51 NA
Out-of-Sample Prediction Using Selected Model
pred.glm.test.AIC <- predict(germandata.glm.AIC, newdata = germandata.test, type = "response")
roc_curve.test=roc.plot(x=(germandata.test$response==1),pred = pred.glm.test.AIC)

roc_curve.test$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.7534469 4.412475e-09 NA
pcut <- 1/6
class.pred.glm.test.AIC <- (pred.glm.test.AIC>pcut)*1
table(germandata.test$response, class.pred.glm.test.AIC, dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 75 62
## 1 9 54
MR.AIC.test <- mean(germandata.test$response!=class.pred.glm.test.AIC)
MR.AIC.test
## [1] 0.355
# Out-of-Sample Cost (Asymmetric Misclassification Rate)
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
cost.test = costfunc(obs = germandata.test$response, pred.p = pred.glm.test.AIC, pcut = pcut)
cost.test
## [1] 0.535
Grid Search for Optimal Cutoff Probability
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
p.seq = seq(0.01, 1, 0.01)
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = germandata.train$response, pred.p = pred.glm.train.AIC, pcut = p.seq[i])
}
plot(p.seq, cost)

optimal.pcut.glm = p.seq[which(cost==min(cost))]
optimal.pcut.glm
## [1] 0.12
class.pred.glm.opt.cut <- (pred.glm.test.AIC>optimal.pcut.glm)*1
MR.opt.pcut<- mean(germandata.test$response!= class.pred.glm.opt.cut)
MR.opt.pcut
## [1] 0.39
FPR.opt.cut <- sum(germandata.test$response==0 & class.pred.glm.opt.cut==1)/sum(germandata.test$response==0)
FPR.opt.cut
## [1] 0.5255474
FNR.opt.cut <- sum(germandata.test$response==1 & class.pred.glm.opt.cut==0)/sum(germandata.test$response==1)
FNR.opt.cut
## [1] 0.0952381
cost.opt.cut.train <- costfunc(obs = germandata.train$response, pred.p = pred.glm.train.AIC, pcut = optimal.pcut.glm)
cost.opt.cut.train
## [1] 0.46625
cost.opt.cut.test <- costfunc(obs = germandata.test$response, pred.p=pred.glm.test.AIC, pcut = optimal.pcut.glm)
cost.opt.cut.test
## [1] 0.51
Compare Cross Validation and Validation Set Approach
# Specify Cost Function
## First cost function - asymmetric misclassification rate
costamr <- function(r, pi) {
weight1 = 5
weight0 = 1
c1 = (r == 1) & (pi < optimal.pcut.glm) #logical vector - true if actual 1 but predict 0
c0 = (r == 0) & (pi > optimal.pcut.glm) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
costauc <- function(r, pi){roc.plot(x = r == "1", pred = pi)$roc.vol$Area}
alldata.glm0 <- glm(response~., family = binomial, german.data)
alldata.glm0.AIC <- step(alldata.glm0)
## Start: AIC=993.82
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 896.56 988.56
## - property 3 899.08 991.08
## - present_resid 1 895.82 991.82
## - n_people 1 896.94 992.94
## <none> 895.82 993.82
## - n_credits 1 897.89 993.89
## - present_emp 4 904.03 994.03
## - housing 2 900.05 994.05
## - telephone 1 898.06 994.06
## - age 1 898.34 994.34
## - marital_sex 3 905.15 997.15
## - other_debtors 2 903.24 997.24
## - foreign 1 901.88 997.88
## - other_install 2 903.98 997.98
## - amount 1 904.28 1000.28
## - duration 1 904.87 1000.87
## - saving_acct 4 915.63 1005.63
## - installment_rate 1 910.27 1006.27
## - credit_his 4 917.62 1007.62
## - purpose 9 931.12 1011.12
## - chk_acct 3 962.05 1054.05
##
## Step: AIC=988.56
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + property + age + other_install +
## housing + n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - property 3 899.79 985.79
## - present_resid 1 896.57 986.57
## - n_people 1 897.67 987.67
## - present_emp 4 904.32 988.32
## - n_credits 1 898.47 988.47
## <none> 896.56 988.56
## - housing 2 900.60 988.60
## - telephone 1 899.13 989.13
## - age 1 899.19 989.19
## - marital_sex 3 905.83 991.83
## - other_debtors 2 903.87 991.87
## - foreign 1 902.67 992.67
## - other_install 2 904.95 992.95
## - amount 1 905.31 995.31
## - duration 1 905.85 995.85
## - saving_acct 4 917.02 1001.02
## - installment_rate 1 911.45 1001.45
## - credit_his 4 918.12 1002.12
## - purpose 9 931.82 1005.82
## - chk_acct 3 962.35 1048.35
##
## Step: AIC=985.79
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + present_resid + age + other_install + housing +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - present_resid 1 899.81 983.81
## - n_people 1 900.79 984.79
## - housing 2 903.47 985.47
## - n_credits 1 901.49 985.49
## <none> 899.79 985.79
## - telephone 1 901.81 985.81
## - present_emp 4 907.85 985.85
## - age 1 902.52 986.52
## - marital_sex 3 908.67 988.67
## - foreign 1 905.83 989.83
## - other_debtors 2 908.05 990.05
## - other_install 2 908.87 990.87
## - amount 1 909.80 993.80
## - duration 1 909.99 993.99
## - saving_acct 4 919.78 997.78
## - installment_rate 1 915.56 999.56
## - credit_his 4 921.66 999.66
## - purpose 9 936.35 1004.35
## - chk_acct 3 967.78 1047.78
##
## Step: AIC=983.81
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + age + other_install + housing + n_credits +
## n_people + telephone + foreign
##
## Df Deviance AIC
## - n_people 1 900.81 982.81
## - n_credits 1 901.53 983.53
## - telephone 1 901.81 983.81
## <none> 899.81 983.81
## - present_emp 4 907.86 983.86
## - housing 2 903.95 983.95
## - age 1 902.53 984.53
## - marital_sex 3 908.69 986.69
## - foreign 1 905.88 987.88
## - other_debtors 2 908.08 988.08
## - other_install 2 908.87 988.87
## - amount 1 909.80 991.80
## - duration 1 910.05 992.05
## - saving_acct 4 919.78 995.78
## - installment_rate 1 915.59 997.59
## - credit_his 4 921.66 997.66
## - purpose 9 936.35 1002.35
## - chk_acct 3 968.09 1046.09
##
## Step: AIC=982.81
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + age + other_install + housing + n_credits +
## telephone + foreign
##
## Df Deviance AIC
## - n_credits 1 902.80 982.80
## <none> 900.81 982.81
## - present_emp 4 908.82 982.82
## - telephone 1 902.90 982.90
## - housing 2 905.01 983.01
## - age 1 903.38 983.38
## - marital_sex 3 908.75 984.75
## - foreign 1 906.79 986.79
## - other_debtors 2 908.83 986.83
## - other_install 2 909.90 987.90
## - amount 1 910.50 990.50
## - duration 1 910.95 990.95
## - saving_acct 4 920.53 994.53
## - installment_rate 1 915.95 995.95
## - credit_his 4 923.28 997.28
## - purpose 9 937.61 1001.61
## - chk_acct 3 969.35 1045.35
##
## Step: AIC=982.8
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + marital_sex +
## other_debtors + age + other_install + housing + telephone +
## foreign
##
## Df Deviance AIC
## - present_emp 4 910.50 982.50
## - telephone 1 904.72 982.72
## <none> 902.80 982.80
## - housing 2 907.17 983.17
## - age 1 905.21 983.21
## - marital_sex 3 910.47 984.47
## - other_debtors 2 910.88 986.88
## - foreign 1 909.14 987.14
## - other_install 2 912.57 988.57
## - duration 1 912.50 990.50
## - amount 1 912.54 990.54
## - saving_acct 4 922.72 994.72
## - credit_his 4 923.43 995.43
## - installment_rate 1 917.65 995.65
## - purpose 9 940.20 1002.20
## - chk_acct 3 971.04 1045.04
##
## Step: AIC=982.5
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + installment_rate + marital_sex + other_debtors +
## age + other_install + housing + telephone + foreign
##
## Df Deviance AIC
## <none> 910.50 982.50
## - telephone 1 912.82 982.82
## - age 1 912.96 982.96
## - housing 2 915.01 983.01
## - foreign 1 916.63 986.63
## - marital_sex 3 920.98 986.98
## - other_debtors 2 919.41 987.41
## - other_install 2 920.76 988.76
## - duration 1 918.79 988.79
## - amount 1 920.07 990.07
## - saving_acct 4 931.53 995.53
## - installment_rate 1 925.92 995.92
## - credit_his 4 932.30 996.30
## - purpose 9 947.78 1001.78
## - chk_acct 3 979.75 1045.75
cv.result.amr = cv.glm(german.data, glmfit=alldata.glm0.AIC, cost=costamr, K=5)
cv.result.amr$delta[2]
## [1] 0.5104
cv.result.auc = cv.glm(german.data, glmfit=alldata.glm0.AIC, cost=costauc, K=5)











cv.result.auc$delta[2]
## [1] 0.7850393
Classification Tree
Grow a Large Tree
library(rpart)
germandata.largetree <- rpart(formula = response~., data = germandata.train,
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp=0.001)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.3
prp(germandata.largetree, extra = 1)

plotcp(germandata.largetree)

printcp(germandata.largetree)
##
## Classification tree:
## rpart(formula = response ~ ., data = germandata.train, parms = list(loss = matrix(c(0,
## 5, 1, 0), nrow = 2)), cp = 0.001)
##
## Variables actually used in tree construction:
## [1] age amount chk_acct credit_his duration
## [6] housing job marital_sex other_debtors other_install
## [11] present_emp property purpose saving_acct
##
## Root node error: 563/800 = 0.70375
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.1438721 0 1.00000 5.0000 0.11470
## 2 0.0799290 1 0.85613 2.6128 0.11870
## 3 0.0284192 2 0.77620 2.6057 0.11911
## 4 0.0275311 5 0.69094 2.6021 0.11892
## 5 0.0248668 7 0.63588 2.6181 0.11913
## 6 0.0195382 9 0.58615 2.5133 0.11798
## 7 0.0177620 10 0.56661 2.5435 0.11847
## 8 0.0142096 11 0.54885 2.5098 0.11805
## 9 0.0115453 12 0.53464 2.4263 0.11693
## 10 0.0097691 14 0.51155 2.2877 0.11476
## 11 0.0088810 16 0.49201 2.3037 0.11503
## 12 0.0071048 17 0.48313 2.2753 0.11442
## 13 0.0041445 18 0.47602 2.1989 0.11321
## 14 0.0035524 21 0.46359 2.2078 0.11334
## 15 0.0011841 27 0.43872 2.1847 0.11288
## 16 0.0010000 30 0.43517 2.1776 0.11244
Prune the Tree
tree.default0.01 <- rpart(response~., data = germandata.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp=0.01)
prp(tree.default0.01)

pruned0.008 <- rpart(response~., data = germandata.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)),cp=0.008)
prp(pruned0.008, extra = 1)

Misclassification Rate
# in-sample misclassification rate
germandata.train.largetree.pred <- predict(germandata.largetree, type = "class")
germandata.train.default0.01 <- predict(tree.default0.01, type = "class")
germandata.train.pruned0.008 <- predict(pruned0.008, type = "class")
MR.train.largetree <- mean(germandata.train$response!=germandata.train.largetree.pred)
MR.train.largetree
## [1] 0.26125
MR.train.0.01 <- mean(germandata.train$response!=germandata.train.default0.01)
MR.train.0.01
## [1] 0.325
MR.train.0.008 <- mean(germandata.train$response!=germandata.train.pruned0.008)
MR.train.0.008
## [1] 0.305
# out-of-sample prediction using classification tree
germandata.test.largetree.pred <- predict(germandata.largetree, germandata.test, type = "class")
germandata.test.default0.01 <- predict(tree.default0.01, germandata.test, type = "class")
germandata.test.pruned0.008 <- predict(pruned0.008, germandata.test, type = "class")
MR.test.largetree <- mean(germandata.test$response!=germandata.test.largetree.pred)
MR.test.largetree
## [1] 0.385
MR.test.0.01 <- mean(germandata.test$response!=germandata.test.default0.01)
MR.test.0.01
## [1] 0.39
MR.test.0.008 <- mean(germandata.test$response!=germandata.test.pruned0.008)
MR.test.0.008
## [1] 0.38
germandata.test.pred.tree2= predict(pruned0.008, germandata.test, type = "prob")
germandata.train.pred.tree2 = predict(pruned0.008,germandata.train, type="prob")
roc_curve.bic <- roc.plot(x=(germandata.test$response == "1"), pred = germandata.test.pred.tree2[,2])

roc_curve.bic$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.6671301 4.240929e-05 NA
roc_curve.bic.train <- roc.plot(x=(germandata.train$response == "1"), pred = germandata.train.pred.tree2[,2])

roc_curve.bic$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.6671301 4.240929e-05 NA
cost.cart <- function(r, pi){
weight1 = 5
weight0 = 1
c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
cost.cart(germandata.test$response,germandata.test.pruned0.008)
## [1] 0.7
cost.cart(germandata.test$response,germandata.test.default0.01)
## [1] 0.67