## 'data.frame': 1000 obs. of 21 variables:
## $ chk_acct : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_his : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ purpose : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ saving_acct : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ present_emp : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ installment_rate: int 4 2 2 2 3 2 3 2 2 4 ...
## $ sex : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
## $ other_debtor : Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
## $ present_resid : int 4 2 3 4 4 4 4 2 4 2 ...
## $ property : Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_install : Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ housing : Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
## $ n_credits : int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
## $ n_people : int 1 1 2 2 2 2 1 1 1 1 ...
## $ telephone : Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
## $ foreign : Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
## $ response : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 2 ...
## chk_acct duration credit_his purpose amount
## A11:274 Min. : 4.0 A30: 40 A43 :280 Min. : 250
## A12:269 1st Qu.:12.0 A31: 49 A40 :234 1st Qu.: 1366
## A13: 63 Median :18.0 A32:530 A42 :181 Median : 2320
## A14:394 Mean :20.9 A33: 88 A41 :103 Mean : 3271
## 3rd Qu.:24.0 A34:293 A49 : 97 3rd Qu.: 3972
## Max. :72.0 A46 : 50 Max. :18424
## (Other): 55
## saving_acct present_emp installment_rate sex other_debtor
## A61:603 A71: 62 Min. :1.000 A91: 50 A101:907
## A62:103 A72:172 1st Qu.:2.000 A92:310 A102: 41
## A63: 63 A73:339 Median :3.000 A93:548 A103: 52
## A64: 48 A74:174 Mean :2.973 A94: 92
## A65:183 A75:253 3rd Qu.:4.000
## Max. :4.000
##
## present_resid property age other_install housing
## Min. :1.000 A121:282 Min. :19.00 A141:139 A151:179
## 1st Qu.:2.000 A122:232 1st Qu.:27.00 A142: 47 A152:713
## Median :3.000 A123:332 Median :33.00 A143:814 A153:108
## Mean :2.845 A124:154 Mean :35.55
## 3rd Qu.:4.000 3rd Qu.:42.00
## Max. :4.000 Max. :75.00
##
## n_credits job n_people telephone foreign response
## Min. :1.000 A171: 22 Min. :1.000 A191:596 A201:963 0:700
## 1st Qu.:1.000 A172:200 1st Qu.:1.000 A192:404 A202: 37 1:300
## Median :1.000 A173:630 Median :1.000
## Mean :1.407 A174:148 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :2.000
##
rows <- sample(nrow(german_credit))
german_credit_randomized <- german_credit[rows, ]
split_german_credit <- round(nrow(german_credit_randomized)*0.75)
train_german_credit <- german_credit_randomized[1:split_german_credit, ]
test_german_credit <- german_credit_randomized[(split_german_credit + 1):nrow(german_credit_randomized), ]
For logit model, AIC came out to be 791.727. For probit model, AIC came out to be 790.26. And, for cloglog model, AIC came out to be 789.07. The significant parameters are similar for all the three link functions.
For logit model, BIC came out to be 1017.82 For probit model, AIC came out to be 1016.63 And, for cloglog model, AIC came out to be 1015.46 The significant parameters are similar for all the three link functions.
null_log <- glm(formula = response~1, family = "binomial", data = train_german_credit)
full_log <- glm(formula = response ~ . , family = "binomial",
data = train_german_credit)
summary(full_log)
##
## Call:
## glm(formula = response ~ ., family = "binomial", data = train_german_credit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0764 -0.7138 -0.3931 0.7203 2.4910
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.711e-01 1.233e+00 -0.625 0.53180
## chk_acctA12 -4.136e-01 2.548e-01 -1.623 0.10459
## chk_acctA13 -9.493e-01 4.092e-01 -2.320 0.02035 *
## chk_acctA14 -1.695e+00 2.626e-01 -6.454 1.09e-10 ***
## duration 2.576e-02 1.102e-02 2.338 0.01939 *
## credit_hisA31 4.822e-01 6.214e-01 0.776 0.43771
## credit_hisA32 -4.181e-01 4.735e-01 -0.883 0.37721
## credit_hisA33 -1.099e+00 5.431e-01 -2.024 0.04298 *
## credit_hisA34 -1.440e+00 4.831e-01 -2.980 0.00288 **
## purposeA41 -2.021e+00 4.548e-01 -4.444 8.82e-06 ***
## purposeA410 -1.912e+00 1.011e+00 -1.891 0.05859 .
## purposeA42 -7.281e-01 2.949e-01 -2.469 0.01353 *
## purposeA43 -6.425e-01 2.774e-01 -2.316 0.02053 *
## purposeA44 -7.444e-02 8.880e-01 -0.084 0.93319
## purposeA45 -6.122e-02 6.009e-01 -0.102 0.91886
## purposeA46 3.119e-01 4.478e-01 0.696 0.48615
## purposeA48 -2.125e+00 1.224e+00 -1.736 0.08264 .
## purposeA49 -6.586e-01 3.869e-01 -1.702 0.08875 .
## amount 1.141e-04 5.274e-05 2.163 0.03050 *
## saving_acctA62 1.497e-02 3.298e-01 0.045 0.96380
## saving_acctA63 -2.506e-01 4.375e-01 -0.573 0.56671
## saving_acctA64 -7.828e-01 5.740e-01 -1.364 0.17266
## saving_acctA65 -5.563e-01 2.973e-01 -1.871 0.06134 .
## present_empA72 -3.733e-01 5.062e-01 -0.737 0.46089
## present_empA73 -3.436e-01 4.819e-01 -0.713 0.47578
## present_empA74 -1.129e+00 5.255e-01 -2.147 0.03176 *
## present_empA75 -6.814e-01 5.007e-01 -1.361 0.17356
## installment_rate 2.908e-01 9.883e-02 2.942 0.00326 **
## sexA92 -1.267e-01 4.368e-01 -0.290 0.77182
## sexA93 -6.055e-01 4.342e-01 -1.394 0.16319
## sexA94 2.986e-02 5.096e-01 0.059 0.95328
## other_debtorA102 8.926e-01 4.749e-01 1.880 0.06017 .
## other_debtorA103 -1.148e+00 5.004e-01 -2.294 0.02180 *
## present_resid -2.243e-03 9.911e-02 -0.023 0.98194
## propertyA122 4.166e-01 2.878e-01 1.447 0.14784
## propertyA123 3.315e-01 2.706e-01 1.225 0.22049
## propertyA124 5.784e-01 4.672e-01 1.238 0.21567
## age -3.334e-03 1.020e-02 -0.327 0.74387
## other_installA142 8.858e-02 4.813e-01 0.184 0.85398
## other_installA143 -5.101e-01 2.770e-01 -1.842 0.06553 .
## housingA152 -4.642e-01 2.744e-01 -1.692 0.09070 .
## housingA153 -4.187e-01 5.361e-01 -0.781 0.43485
## n_credits 4.202e-01 2.145e-01 1.959 0.05010 .
## jobA172 7.151e-01 7.902e-01 0.905 0.36546
## jobA173 8.205e-01 7.662e-01 1.071 0.28427
## jobA174 8.639e-01 7.846e-01 1.101 0.27089
## n_people 4.322e-01 2.843e-01 1.520 0.12840
## telephoneA192 -3.449e-01 2.258e-01 -1.528 0.12658
## foreignA202 -1.214e+00 6.325e-01 -1.920 0.05486 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 932.61 on 749 degrees of freedom
## Residual deviance: 693.44 on 701 degrees of freedom
## AIC: 791.44
##
## Number of Fisher Scoring iterations: 5
AIC(full_log)
## [1] 791.4415
BIC(full_log)
## [1] 1017.825
full_probit <- glm(formula = response ~ . , family = binomial(link = "probit"),
data = train_german_credit)
summary(full_probit)
##
## Call:
## glm(formula = response ~ ., family = binomial(link = "probit"),
## data = train_german_credit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9741 -0.7236 -0.3761 0.7323 2.5157
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.133e-01 7.258e-01 -0.707 0.4794
## chk_acctA12 -2.451e-01 1.504e-01 -1.629 0.1032
## chk_acctA13 -5.290e-01 2.384e-01 -2.219 0.0265 *
## chk_acctA14 -1.009e+00 1.503e-01 -6.715 1.88e-11 ***
## duration 1.491e-02 6.424e-03 2.321 0.0203 *
## credit_hisA31 3.161e-01 3.653e-01 0.865 0.3869
## credit_hisA32 -2.326e-01 2.791e-01 -0.833 0.4046
## credit_hisA33 -6.244e-01 3.184e-01 -1.961 0.0499 *
## credit_hisA34 -8.332e-01 2.836e-01 -2.938 0.0033 **
## purposeA41 -1.203e+00 2.575e-01 -4.672 2.98e-06 ***
## purposeA410 -1.220e+00 5.921e-01 -2.061 0.0393 *
## purposeA42 -4.257e-01 1.724e-01 -2.469 0.0135 *
## purposeA43 -3.717e-01 1.607e-01 -2.312 0.0208 *
## purposeA44 -5.621e-02 5.088e-01 -0.110 0.9120
## purposeA45 -1.260e-02 3.552e-01 -0.035 0.9717
## purposeA46 1.816e-01 2.633e-01 0.690 0.4904
## purposeA48 -1.253e+00 6.776e-01 -1.849 0.0645 .
## purposeA49 -3.683e-01 2.242e-01 -1.643 0.1004
## amount 6.655e-05 3.081e-05 2.160 0.0308 *
## saving_acctA62 -4.705e-04 1.922e-01 -0.002 0.9980
## saving_acctA63 -1.659e-01 2.456e-01 -0.675 0.4994
## saving_acctA64 -4.448e-01 3.243e-01 -1.371 0.1703
## saving_acctA65 -3.093e-01 1.688e-01 -1.832 0.0669 .
## present_empA72 -2.231e-01 2.983e-01 -0.748 0.4546
## present_empA73 -1.943e-01 2.841e-01 -0.684 0.4941
## present_empA74 -6.584e-01 3.075e-01 -2.141 0.0323 *
## present_empA75 -4.062e-01 2.941e-01 -1.381 0.1672
## installment_rate 1.687e-01 5.740e-02 2.938 0.0033 **
## sexA92 -6.066e-02 2.582e-01 -0.235 0.8143
## sexA93 -3.283e-01 2.555e-01 -1.285 0.1988
## sexA94 2.819e-02 3.009e-01 0.094 0.9254
## other_debtorA102 5.144e-01 2.800e-01 1.837 0.0662 .
## other_debtorA103 -6.360e-01 2.824e-01 -2.252 0.0243 *
## present_resid -9.645e-06 5.786e-02 0.000 0.9999
## propertyA122 2.728e-01 1.661e-01 1.642 0.1005
## propertyA123 2.071e-01 1.569e-01 1.320 0.1869
## propertyA124 3.321e-01 2.709e-01 1.226 0.2202
## age -2.530e-03 5.954e-03 -0.425 0.6709
## other_installA142 3.509e-02 2.850e-01 0.123 0.9020
## other_installA143 -3.021e-01 1.614e-01 -1.872 0.0613 .
## housingA152 -2.721e-01 1.603e-01 -1.697 0.0897 .
## housingA153 -1.946e-01 3.113e-01 -0.625 0.5319
## n_credits 2.553e-01 1.236e-01 2.065 0.0389 *
## jobA172 4.541e-01 4.628e-01 0.981 0.3265
## jobA173 5.219e-01 4.489e-01 1.163 0.2450
## jobA174 5.597e-01 4.596e-01 1.218 0.2234
## n_people 2.473e-01 1.658e-01 1.492 0.1357
## telephoneA192 -2.022e-01 1.301e-01 -1.554 0.1201
## foreignA202 -7.138e-01 3.580e-01 -1.994 0.0462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 932.61 on 749 degrees of freedom
## Residual deviance: 692.26 on 701 degrees of freedom
## AIC: 790.26
##
## Number of Fisher Scoring iterations: 6
AIC(full_probit)
## [1] 790.2556
BIC(full_probit)
## [1] 1016.639
full_cloglog <- glm(formula = response ~ . , family = binomial(link = "cloglog"),
data = train_german_credit)
summary(full_cloglog)
##
## Call:
## glm(formula = response ~ ., family = binomial(link = "cloglog"),
## data = train_german_credit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9386 -0.7100 -0.4134 0.6267 2.3699
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.348e+00 9.318e-01 -1.446 0.148123
## chk_acctA12 -2.955e-01 1.826e-01 -1.618 0.105561
## chk_acctA13 -7.712e-01 3.339e-01 -2.310 0.020891 *
## chk_acctA14 -1.320e+00 2.053e-01 -6.433 1.25e-10 ***
## duration 2.029e-02 7.812e-03 2.597 0.009398 **
## credit_hisA31 4.344e-01 3.998e-01 1.086 0.277259
## credit_hisA32 -2.592e-01 3.148e-01 -0.823 0.410375
## credit_hisA33 -8.085e-01 3.805e-01 -2.125 0.033591 *
## credit_hisA34 -1.076e+00 3.324e-01 -3.238 0.001202 **
## purposeA41 -1.578e+00 3.627e-01 -4.351 1.36e-05 ***
## purposeA410 -1.838e+00 8.044e-01 -2.285 0.022334 *
## purposeA42 -5.545e-01 2.148e-01 -2.581 0.009852 **
## purposeA43 -4.943e-01 2.042e-01 -2.421 0.015479 *
## purposeA44 -1.042e-01 5.711e-01 -0.183 0.855188
## purposeA45 -8.930e-02 4.455e-01 -0.200 0.841134
## purposeA46 1.932e-01 3.142e-01 0.615 0.538564
## purposeA48 -1.738e+00 1.056e+00 -1.646 0.099769 .
## purposeA49 -5.917e-01 2.840e-01 -2.084 0.037177 *
## amount 8.381e-05 3.810e-05 2.200 0.027828 *
## saving_acctA62 9.048e-02 2.379e-01 0.380 0.703661
## saving_acctA63 -4.024e-01 3.716e-01 -1.083 0.278885
## saving_acctA64 -6.753e-01 4.559e-01 -1.481 0.138541
## saving_acctA65 -4.681e-01 2.353e-01 -1.989 0.046702 *
## present_empA72 -3.351e-01 3.496e-01 -0.959 0.337768
## present_empA73 -2.418e-01 3.308e-01 -0.731 0.464923
## present_empA74 -8.875e-01 3.708e-01 -2.394 0.016687 *
## present_empA75 -5.124e-01 3.522e-01 -1.455 0.145728
## installment_rate 2.556e-01 7.432e-02 3.439 0.000584 ***
## sexA92 -1.262e-01 3.149e-01 -0.401 0.688646
## sexA93 -5.022e-01 3.153e-01 -1.592 0.111273
## sexA94 -1.807e-03 3.721e-01 -0.005 0.996125
## other_debtorA102 6.828e-01 3.337e-01 2.046 0.040762 *
## other_debtorA103 -9.048e-01 4.001e-01 -2.261 0.023737 *
## present_resid 1.398e-02 7.309e-02 0.191 0.848278
## propertyA122 3.051e-01 2.223e-01 1.372 0.169957
## propertyA123 2.154e-01 2.077e-01 1.037 0.299654
## propertyA124 5.123e-01 3.343e-01 1.533 0.125354
## age -2.860e-03 7.607e-03 -0.376 0.706952
## other_installA142 -8.459e-03 3.449e-01 -0.025 0.980436
## other_installA143 -4.102e-01 2.038e-01 -2.013 0.044133 *
## housingA152 -3.537e-01 1.970e-01 -1.796 0.072532 .
## housingA153 -3.861e-01 3.816e-01 -1.012 0.311659
## n_credits 3.326e-01 1.573e-01 2.115 0.034469 *
## jobA172 6.978e-01 5.997e-01 1.164 0.244596
## jobA173 7.767e-01 5.794e-01 1.341 0.180059
## jobA174 7.373e-01 5.973e-01 1.234 0.217047
## n_people 3.815e-01 2.126e-01 1.795 0.072688 .
## telephoneA192 -2.386e-01 1.689e-01 -1.413 0.157789
## foreignA202 -8.874e-01 5.271e-01 -1.684 0.092253 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 932.61 on 749 degrees of freedom
## Residual deviance: 691.08 on 701 degrees of freedom
## AIC: 789.08
##
## Number of Fisher Scoring iterations: 16
AIC(full_cloglog)
## [1] 789.0764
BIC(full_cloglog)
## [1] 1015.46
stepwise_AICmodel <- step(full_log,direction = "both")
stepwise_BICmodel <- step(full_log,direction = "both", k = log(nrow(train_german_credit)))
Variables selected after applying Stepwise AIC are chk_acct, duration, credit_his, purpose, amount, present_emp, installment_rate, sex, other_debtor, other_install, n_credits, n_people and foreign
AIC is 778.2989039
Variables selected after applying Stepwise AIC are chk_acct and duration
BIC is 852.5304001
dummy <- model.matrix(~., data = german_credit) #converting data to numeric matrix
dummy <- dummy[,-1]
nrow(german_credit)
## [1] 1000
rows_dummy <- sample(nrow(dummy))
dummy_randomized <- dummy[rows_dummy, ]
split_dummy <- round(nrow(dummy_randomized)*0.75)
train_dummy <- dummy_randomized[1:split_dummy, ]
test_dummy <- dummy_randomized[(split_dummy + 1):nrow(dummy_randomized), ]
credit_lasso <- cv.glmnet(x = as.matrix(train_dummy[,-49]), y = train_dummy[,49],
family = "binomial", type.measure = "class", alpha = 1)
plot(credit_lasso)
coef(credit_lasso, credit_lasso$lambda.min)
## 49 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.5613264252
## chk_acctA12 -0.1420174360
## chk_acctA13 -0.6622696525
## chk_acctA14 -1.4872929361
## duration 0.0206773121
## credit_hisA31 0.5067554283
## credit_hisA32 -0.1705706879
## credit_hisA33 -0.3736114895
## credit_hisA34 -1.2091923212
## purposeA41 -1.0111826105
## purposeA410 -0.8213943124
## purposeA42 -0.4842731717
## purposeA43 -0.6815762862
## purposeA44 0.0020436481
## purposeA45 .
## purposeA46 0.0819945789
## purposeA48 -1.0366689351
## purposeA49 -0.2324811942
## amount 0.0001184591
## saving_acctA62 -0.5223704470
## saving_acctA63 -0.6220660805
## saving_acctA64 -1.0229609816
## saving_acctA65 -0.7526951004
## present_empA72 0.2154133169
## present_empA73 .
## present_empA74 -0.3740548135
## present_empA75 .
## installment_rate 0.2376622322
## sexA92 .
## sexA93 -0.4153892160
## sexA94 0.0078418519
## other_debtorA102 0.3058138572
## other_debtorA103 -0.8905784569
## present_resid .
## propertyA122 0.1741980172
## propertyA123 0.0828950145
## propertyA124 0.4542421897
## age -0.0103108439
## other_installA142 .
## other_installA143 -0.0786072726
## housingA152 -0.2152687022
## housingA153 -0.3380414897
## n_credits 0.2165851270
## jobA172 .
## jobA173 0.0941302125
## jobA174 .
## n_people 0.1511730774
## telephoneA192 -0.1738092064
## foreignA202 -0.7650523339
Variables selected for in sample prediction are based on Stepwise AIC model.
The logistic model has been fitted based on the variable selected from the above the step, The ROC curve is plotted and the area under the curve was found to be 0.8182. The false negative rate come out as 0.221.
best_insample <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)
insample_pred <- predict(best_insample,
type = "response")
insample_ROC <- roc(train_german_credit$response,insample_pred)
plot(insample_ROC, col = "blue")
auc(insample_ROC)
## Area under the curve: 0.8182
insamplecutoff_prob <- coords(insample_ROC,"best",ret = "threshold")
insample_class <- ifelse(insample_pred > insamplecutoff_prob, 1, 0)
table(insample_class,train_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 359 52
## 1 156 183
The ROC curve is plotted and the area under the curve was found to be 0.8124 The false negative rate is calculated to be 0.138
best_log <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)
train_prob <- predict(best_log, type = "response")
german_credit_prob <- predict(best_log, newdata = test_german_credit[,-21], type = "response")
ROC <- roc(test_german_credit$response,german_credit_prob)
plot(ROC, col = "blue")
auc(ROC)
## Area under the curve: 0.8124
cutoff_prob <- coords(ROC,"best",ret = "threshold")
german_credit_pred <- ifelse(german_credit_prob > cutoff_prob, 1, 0)
table(german_credit_pred,test_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 123 9
## 1 62 56
The asymmetric cost function has been used to calculate the optimal cut off rate. After using the optimal cut off value, the false negative rate came down to 0.138 from 0.061
costfunc <- function(obs, pred.p, pcut) {
weight1 <- 5 # define the weight for "true=1 but pred=0" (FN)
weight0 <- 1 # define the weight for "true=0 but pred=1" (FP)
c1 <- (obs == 1) & (pred.p < pcut) # count for "true=1 but pred=0" (FN)
c0 <- (obs == 0) & (pred.p >= pcut) # count for "true=0 but pred=1" (FP)
cost <- mean(weight1 * c1 + weight0 * c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
p.seq <- seq(0.01, 1, 0.01)
cost <- rep(0, length(p.seq))
for (i in 1:length(p.seq)) {
cost[i] = costfunc(obs = train_german_credit$response,
pred.p = train_prob, pcut = p.seq[i])
}
plot(p.seq, cost)
optimal.pcut.glm0 <- p.seq[which(cost == min(cost))]
german_credit_pred <- ifelse(german_credit_prob > optimal.pcut.glm0, 1, 0)
table(german_credit_pred,test_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 95 4
## 1 90 61
Based on the cross validation function, the misclassification rate is found out to be 0.535.
pcut <- optimal.pcut.glm0
costfunc2 <- function(obs, pred.p){
weight1 <- 5 # define the weight for "true=1 but pred=0" (FN)
weight0 <- 1 # define the weight for "true=0 but pred=1" (FP)
c1 <- (obs == 1) & (pred.p < pcut) # count for "true=1 but pred=0" (FN)
c0 <- (obs == 0) & (pred.p >= pcut) # count for "true=0 but pred=1" (FP)
cost <- mean(weight1 * c1 + weight0 * c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
credit_glm1 <- glm(response~. , family = binomial, data = german_credit)
cv.result <- cv.glm(data = german_credit, glmfit = credit_glm1, cost = costfunc2, K = 4)
cv.result$delta[2]
## [1] 0.535
The false negative rate is 0.076 and is higher than that of the logistic regression model (0.061).
tree_model <- rpart(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, data = train_german_credit, method = "class", parms = list(loss = matrix(c(0,5,1,0), nrow = 2)))
prp(tree_model, extra = 1)
tree_predict <- predict(tree_model, test_german_credit[,-21], type = "class")
table(test_german_credit$response, tree_predict, dnn = c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 82 103
## 1 5 60
The false negative rate is 0.061 and is same as that of the logistic regression (assymetric) model. The overall misclassification rate is 0.424 compared to logistic regression’s misclassification rate as 0.535
credit_rpart <- rpart(formula = response ~ . , data = train_german_credit, method = "class",
parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.0001)
plotcp(credit_rpart)
prp(prune(credit_rpart, cp = 0.012)) # Pruning the tree
credit_rpartbest <- rpart(formula = response ~ . , data = train_german_credit, method = "class",
parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.012)
prp(credit_rpartbest, extra = 1)
credit_testpredict <- predict(credit_rpartbest, test_german_credit[,-21], type = "class")
table(test_german_credit$response, credit_testpredict, dnn = c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 83 102
## 1 4 61