1 Acknowledgement:

The materials below are partially based on discussion with Bolun Zhou and Nan Li.This work would not be possible otherwise.

2 Summary

Models In-sample misclassification Out-of-sample misclassification
GLM 0.339 0.352
Classification tree 0.332 0.360
Generalized Additive Models (GAM) 0.201 0.276
Neutral Network 0.332 0.289

go to top

3 EDA

3.1 Define a cost function below

creditcost <- function(observed, predicted) {
          weight1 = 5
          weight0 = 1
          c1 = (observed == 1) & (predicted == 0)  #logical vector - true if actual 1 but predict 0
          c0 = (observed == 0) & (predicted == 1)  #logical vecotr - true if actual 0 but predict 1
          return(mean(weight1 * c1 + weight0 * c0))
}

cost1 <- function(r, pi) {
          mean(((r == 0) & (pi > pcut)) | ((r == 1) & (pi < pcut)))
}

3.2 Load the Dataset

# German Credit score
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")

colnames(german_credit) <- c("chk_acct","duration","credit_his",
                             "purpose","amount","saving_acct",
                             "present_emp","installment_rate",
                             "sex","other_debtor","present_resid",
                             "property","age","other_install",
                             "housing","n_credits","job","n_people",
                             "telephone","foreign","response")

#orginal response coding 1= good, 2 = bad
#we need 0 = good, 1 = bad
german_credit$response <- german_credit$response - 1
summary(german_credit)
##  chk_acct     duration    credit_his    purpose        amount     
##  A11:274   Min.   : 4.0   A30: 40    A43    :280   Min.   :  250  
##  A12:269   1st Qu.:12.0   A31: 49    A40    :234   1st Qu.: 1366  
##  A13: 63   Median :18.0   A32:530    A42    :181   Median : 2320  
##  A14:394   Mean   :20.9   A33: 88    A41    :103   Mean   : 3271  
##            3rd Qu.:24.0   A34:293    A49    : 97   3rd Qu.: 3972  
##            Max.   :72.0              A46    : 50   Max.   :18424  
##                                      (Other): 55                  
##  saving_acct present_emp installment_rate  sex      other_debtor
##  A61:603     A71: 62     Min.   :1.000    A91: 50   A101:907    
##  A62:103     A72:172     1st Qu.:2.000    A92:310   A102: 41    
##  A63: 63     A73:339     Median :3.000    A93:548   A103: 52    
##  A64: 48     A74:174     Mean   :2.973    A94: 92               
##  A65:183     A75:253     3rd Qu.:4.000                          
##                          Max.   :4.000                          
##                                                                 
##  present_resid   property        age        other_install housing   
##  Min.   :1.000   A121:282   Min.   :19.00   A141:139      A151:179  
##  1st Qu.:2.000   A122:232   1st Qu.:27.00   A142: 47      A152:713  
##  Median :3.000   A123:332   Median :33.00   A143:814      A153:108  
##  Mean   :2.845   A124:154   Mean   :35.55                           
##  3rd Qu.:4.000              3rd Qu.:42.00                           
##  Max.   :4.000              Max.   :75.00                           
##                                                                     
##    n_credits       job         n_people     telephone  foreign   
##  Min.   :1.000   A171: 22   Min.   :1.000   A191:596   A201:963  
##  1st Qu.:1.000   A172:200   1st Qu.:1.000   A192:404   A202: 37  
##  Median :1.000   A173:630   Median :1.000                        
##  Mean   :1.407   A174:148   Mean   :1.155                        
##  3rd Qu.:2.000              3rd Qu.:1.000                        
##  Max.   :4.000              Max.   :2.000                        
##                                                                  
##     response  
##  Min.   :0.0  
##  1st Qu.:0.0  
##  Median :0.0  
##  Mean   :0.3  
##  3rd Qu.:1.0  
##  Max.   :1.0  
## 

3.3 Scatterplots

corcredit <- cor(german_credit[,sapply(german_credit, is.numeric)])
corcredit
##                     duration      amount installment_rate present_resid
## duration          1.00000000  0.62498420       0.07474882   0.034067202
## amount            0.62498420  1.00000000      -0.27131570   0.028926323
## installment_rate  0.07474882 -0.27131570       1.00000000   0.049302371
## present_resid     0.03406720  0.02892632       0.04930237   1.000000000
## age              -0.03613637  0.03271642       0.05826568   0.266419184
## n_credits        -0.01128360  0.02079455       0.02166874   0.089625233
## n_people         -0.02383448  0.01714215      -0.07120694   0.042643426
## response          0.21492667  0.15473864       0.07240394   0.002967159
##                          age   n_credits     n_people     response
## duration         -0.03613637 -0.01128360 -0.023834475  0.214926665
## amount            0.03271642  0.02079455  0.017142154  0.154738641
## installment_rate  0.05826568  0.02166874 -0.071206943  0.072403937
## present_resid     0.26641918  0.08962523  0.042643426  0.002967159
## age               1.00000000  0.14925358  0.118200833 -0.091127409
## n_credits         0.14925358  1.00000000  0.109666700 -0.045732489
## n_people          0.11820083  0.10966670  1.000000000 -0.003014853
## response         -0.09112741 -0.04573249 -0.003014853  1.000000000
corrplot(corcredit, method = "circle", type = "lower")

multi.hist(german_credit[,sapply(german_credit, is.numeric)])

go to top

3.4 Training and Testing Data

set.seed(13003367)
index <- sample(nrow(german_credit),nrow(german_credit)*0.75)
german_credit.train = german_credit[index,]
german_credit.test = german_credit[-index,]

go to top

3.5 Correlation plot and Historgram for Training Data

corcredit_train<- cor(german_credit.train[,sapply(german_credit.train, is.numeric)])
corcredit_train
##                     duration       amount installment_rate present_resid
## duration          1.00000000  0.636463514       0.05308679    0.04103881
## amount            0.63646351  1.000000000      -0.27953064    0.01861770
## installment_rate  0.05308679 -0.279530643       1.00000000    0.02412715
## present_resid     0.04103881  0.018617700       0.02412715    1.00000000
## age              -0.05868659  0.002539451       0.04679671    0.27478090
## n_credits        -0.05899402 -0.013994091       0.02806994    0.09472021
## n_people         -0.05044958  0.017221588      -0.07169067    0.03159107
## response          0.24258418  0.144924276       0.05078628   -0.02513619
##                           age   n_credits     n_people     response
## duration         -0.058686590 -0.05899402 -0.050449583  0.242584181
## amount            0.002539451 -0.01399409  0.017221588  0.144924276
## installment_rate  0.046796710  0.02806994 -0.071690665  0.050786283
## present_resid     0.274780897  0.09472021  0.031591075 -0.025136188
## age               1.000000000  0.14848121  0.127464991 -0.113388208
## n_credits         0.148481208  1.00000000  0.094696388 -0.045928552
## n_people          0.127464991  0.09469639  1.000000000  0.004905938
## response         -0.113388208 -0.04592855  0.004905938  1.000000000
corrplot(corcredit_train, method = "circle", type = "lower")

multi.hist(german_credit.train[,sapply(german_credit.train, is.numeric)])

go to top

4 GLM

4.1 logistic regression model

credit.glm0<- glm(response~chk_acct+duration+credit_his+purpose+saving_acct+present_emp+installment_rate+sex+other_debtor+other_install+housing
, family=binomial, data=german_credit.train)
summary(credit.glm0)
## 
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     other_install + housing, family = binomial, data = german_credit.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2122  -0.6683  -0.3739   0.6477   2.7609  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.994246   0.882645   2.259 0.023859 *  
## chk_acctA12        -0.353052   0.254249  -1.389 0.164953    
## chk_acctA13        -1.296411   0.445150  -2.912 0.003588 ** 
## chk_acctA14        -1.731445   0.273922  -6.321 2.60e-10 ***
## duration            0.059414   0.008842   6.720 1.82e-11 ***
## credit_hisA31      -0.703610   0.669191  -1.051 0.293059    
## credit_hisA32      -1.200999   0.540284  -2.223 0.026222 *  
## credit_hisA33      -1.591907   0.598062  -2.662 0.007773 ** 
## credit_hisA34      -1.779766   0.563392  -3.159 0.001583 ** 
## purposeA41         -1.462401   0.422753  -3.459 0.000542 ***
## purposeA410        -2.128417   1.098049  -1.938 0.052579 .  
## purposeA42         -0.860418   0.301567  -2.853 0.004329 ** 
## purposeA43         -0.951082   0.289518  -3.285 0.001020 ** 
## purposeA44         -1.584501   1.008969  -1.570 0.116318    
## purposeA45         -0.437712   0.655701  -0.668 0.504422    
## purposeA46          0.219608   0.480036   0.457 0.647325    
## purposeA48        -15.470178 516.272534  -0.030 0.976095    
## purposeA49         -0.505941   0.374718  -1.350 0.176955    
## saving_acctA62     -0.513466   0.329444  -1.559 0.119094    
## saving_acctA63     -0.210824   0.434261  -0.485 0.627338    
## saving_acctA64     -2.200037   0.782410  -2.812 0.004925 ** 
## saving_acctA65     -0.902995   0.305571  -2.955 0.003126 ** 
## present_empA72      0.126075   0.430060   0.293 0.769403    
## present_empA73     -0.311434   0.404896  -0.769 0.441793    
## present_empA74     -0.869820   0.456916  -1.904 0.056952 .  
## present_empA75     -0.629723   0.427148  -1.474 0.140414    
## installment_rate    0.259887   0.092799   2.801 0.005102 ** 
## sexA92             -0.310684   0.442379  -0.702 0.482491    
## sexA93             -0.830914   0.434200  -1.914 0.055663 .  
## sexA94             -0.458736   0.514179  -0.892 0.372302    
## other_debtorA102    0.273243   0.457863   0.597 0.550656    
## other_debtorA103   -1.264655   0.497591  -2.542 0.011036 *  
## other_installA142  -0.084417   0.506373  -0.167 0.867599    
## other_installA143  -0.750687   0.278105  -2.699 0.006949 ** 
## housingA152        -0.554014   0.257881  -2.148 0.031688 *  
## housingA153        -0.630730   0.399290  -1.580 0.114192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 902.33  on 749  degrees of freedom
## Residual deviance: 654.75  on 714  degrees of freedom
## AIC: 726.75
## 
## Number of Fisher Scoring iterations: 14
credit.glm0$deviance
## [1] 654.7543
AIC(credit.glm0)
## [1] 726.7543
BIC(credit.glm0)
## [1] 893.0769
hist(predict(credit.glm0))

hist(predict(credit.glm0,type="response"))

4.2 ROC curve

pred.glm.train<- (predict(credit.glm0, type="response"))
pred <- prediction(pred.glm.train, german_credit.train$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8393322
table(predict(credit.glm0,type="response") > (1/6))
## 
## FALSE  TRUE 
##   323   427

go to top

4.3 misclassification rate

pcut1<- 1/6
class.glm.train<- (pred.glm.train>pcut1)*1
MR_train<- mean(german_credit.train$response!=class.glm.train)
MR_train
## [1] 0.3386667
FPR_train<- sum(german_credit.train$response==0 & class.glm.train==1)/sum(german_credit.train$response==0)
FPR_train
## [1] 0.435272
FNR_train<- sum(german_credit.train$response==1 & class.glm.train==0)/sum(german_credit.train$response==1)
FNR_train
## [1] 0.1013825

4.4 out of sample prediction

pred.glm.test<- predict(credit.glm0, newdata = german_credit.test, type="response")
pred <- prediction(pred.glm.test, german_credit.test$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7562225

4.5 misclassification rate for out of sample

pcut<- 1/6
class.glm.test<- (pred.glm.test>pcut)*1
MR_test<- mean(german_credit.test$response!=class.glm.test)
MR_test
## [1] 0.352
FPR_test<- sum(german_credit.test$response==0 & class.glm.test==1)/sum(german_credit.test$response==0)
FPR_test
## [1] 0.4191617
FNR_test<- sum(german_credit.test$response==1 & class.glm.test==0)/sum(german_credit.test$response==1)
FNR_test
## [1] 0.2168675

5 Classification tree

Here we make the assumption that false negative cost 5 times of false positive.

credit.rpart <- rpart(formula = response~ ., data =german_credit.train,  method = "class", parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
prp(credit.rpart, extra = 1)

5.1 Prediction using classification tree (In-sample prediction)

credit.train.pred.tree1<- predict(credit.rpart, german_credit.train, type="class")
table(german_credit.train$response, credit.train.pred.tree1, dnn=c("Truth","Predicted"))
##      Predicted
## Truth   0   1
##     0 301 232
##     1  17 200

Missclassfication rate:

MR=(232+17)/(301+232+17+200)
MR
## [1] 0.332
creditcost(german_credit.test$response, credit.train.pred.tree1)
## [1] 1.14

5.2 Prediction using classification tree (out of -sample prediction)

credit.test.pred.tree1<- predict(credit.rpart, german_credit.test, type="class")
table(german_credit.test$response, credit.test.pred.tree1, dnn=c("Truth","Predicted"))
##      Predicted
## Truth  0  1
##     0 93 74
##     1 16 67
MR=(74+16)/(93+74+16+67)
MR
## [1] 0.36
creditcost(german_credit.train$response, credit.train.pred.tree1)
## [1] 0.4226667

go to top

6 Generalized Additive Models (GAM)

#########################################################################
## Create a formula for a model with a large number of variables:
gam_formula <- as.formula(paste("response~s(duration)+amount+age+chk_acct+credit_his+purpose+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property+other_install+housing+n_credits+job+n_people+telephone+foreign"))
fitGAM <- gam(formula = gam_formula, family = binomial, data = german_credit.train)
summary(fitGAM)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## response ~ s(duration) + amount + age + chk_acct + credit_his + 
##     purpose + saving_acct + present_emp + installment_rate + 
##     sex + other_debtor + present_resid + property + other_install + 
##     housing + n_credits + job + n_people + telephone + foreign
## 
## Parametric coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.864e+00  1.392e+00   1.339 0.180546    
## amount             1.091e-04  5.467e-05   1.995 0.046043 *  
## age               -1.290e-02  1.104e-02  -1.169 0.242522    
## chk_acctA12       -3.481e-01  2.647e-01  -1.315 0.188588    
## chk_acctA13       -1.246e+00  4.633e-01  -2.690 0.007155 ** 
## chk_acctA14       -1.820e+00  2.862e-01  -6.360 2.02e-10 ***
## credit_hisA31     -3.694e-01  7.152e-01  -0.517 0.605491    
## credit_hisA32     -9.350e-01  5.901e-01  -1.584 0.113091    
## credit_hisA33     -1.546e+00  6.208e-01  -2.491 0.012745 *  
## credit_hisA34     -1.751e+00  5.906e-01  -2.964 0.003036 ** 
## purposeA41        -1.617e+00  4.477e-01  -3.612 0.000304 ***
## purposeA410       -2.248e+00  1.182e+00  -1.902 0.057201 .  
## purposeA42        -1.010e+00  3.214e-01  -3.142 0.001680 ** 
## purposeA43        -1.005e+00  3.002e-01  -3.349 0.000810 ***
## purposeA44        -1.593e+00  1.007e+00  -1.581 0.113768    
## purposeA45        -5.908e-01  6.702e-01  -0.881 0.378071    
## purposeA46         1.758e-01  4.925e-01   0.357 0.721101    
## purposeA48        -3.659e+01  2.740e+07   0.000 0.999999    
## purposeA49        -5.651e-01  3.903e-01  -1.448 0.147604    
## saving_acctA62    -6.116e-01  3.442e-01  -1.777 0.075591 .  
## saving_acctA63    -1.226e-01  4.464e-01  -0.275 0.783510    
## saving_acctA64    -2.222e+00  8.029e-01  -2.768 0.005638 ** 
## saving_acctA65    -8.797e-01  3.155e-01  -2.788 0.005296 ** 
## present_empA72    -1.092e-01  5.142e-01  -0.212 0.831881    
## present_empA73    -6.084e-01  4.968e-01  -1.225 0.220733    
## present_empA74    -1.187e+00  5.399e-01  -2.198 0.027940 *  
## present_empA75    -8.287e-01  5.013e-01  -1.653 0.098284 .  
## installment_rate   3.418e-01  1.060e-01   3.225 0.001259 ** 
## sexA92            -2.731e-01  4.682e-01  -0.583 0.559764    
## sexA93            -8.944e-01  4.590e-01  -1.948 0.051361 .  
## sexA94            -3.870e-01  5.425e-01  -0.713 0.475662    
## other_debtorA102   3.342e-01  4.682e-01   0.714 0.475417    
## other_debtorA103  -1.162e+00  5.168e-01  -2.248 0.024591 *  
## present_resid     -4.579e-02  1.043e-01  -0.439 0.660569    
## propertyA122       2.259e-01  3.033e-01   0.745 0.456401    
## propertyA123       4.147e-01  2.809e-01   1.476 0.139877    
## propertyA124       3.597e-01  5.212e-01   0.690 0.490073    
## other_installA142 -1.362e-01  5.131e-01  -0.265 0.790665    
## other_installA143 -7.578e-01  2.868e-01  -2.642 0.008243 ** 
## housingA152       -5.684e-01  2.776e-01  -2.048 0.040607 *  
## housingA153       -6.815e-01  5.730e-01  -1.189 0.234321    
## n_credits          4.518e-01  2.414e-01   1.872 0.061214 .  
## jobA172            3.233e-01  7.976e-01   0.405 0.685259    
## jobA173            5.457e-01  7.665e-01   0.712 0.476465    
## jobA174            1.552e-01  7.836e-01   0.198 0.842963    
## n_people           3.519e-01  3.005e-01   1.171 0.241547    
## telephoneA192     -3.657e-01  2.519e-01  -1.451 0.146687    
## foreignA202       -1.207e+00  7.248e-01  -1.665 0.095856 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df Chi.sq  p-value    
## s(duration)   1      1  15.12 0.000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.289   Deviance explained = 29.8%
## UBRE = -0.024333  Scale est. = 1         n = 750
##insample
fittedGAM.insample.prob <- predict(fitGAM)
fittedGAM.insample <- as.numeric(fittedGAM.insample.prob > pcut)
TrainMisGAM <- mean(ifelse(german_credit.train$response != fittedGAM.insample, 1, 0))
TrainMisGAM
## [1] 0.2013333
##out-sample
fittedGAM.outsample.prob <- predict(fitGAM, german_credit.test)
fittedGAM.outsample <- as.numeric(fittedGAM.outsample.prob > pcut)
TestMisGAM <- mean(ifelse(german_credit.test$response != fittedGAM.outsample, 1, 0))
TestMisGAM
## [1] 0.276

go to top

7 Neutral Network

library(nnet)
## Warning: package 'nnet' was built under R version 3.4.4
## 
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
## 
##     multinom
credit.nnet <- nnet(response~., data=german_credit.train, size=1, maxit=500)
## # weights:  51
## initial  value 229.054492 
## final  value 217.000000 
## converged
prob.nnet= predict(credit.nnet,german_credit.test)
pred.nnet = as.numeric(prob.nnet >1/6)
table(german_credit.test$response,pred.nnet, dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0
##        0 167
##        1  83

In-sample

mean(ifelse(german_credit.test$response!= pred.nnet, 1, 0))
## [1] 0.332
creditcost(german_credit.test$response, pred.nnet)
## [1] 1.66

Out of-sample

mean(ifelse(german_credit.train$response!= pred.nnet, 1, 0))
## [1] 0.2893333
creditcost(german_credit.train$response, pred.nnet)
## [1] 1.446667

go to top