The materials below are partially based on discussion with Bolun Zhou and Nan Li.This work would not be possible otherwise.
| Models | In-sample misclassification | Out-of-sample misclassification |
|---|---|---|
| GLM | 0.339 | 0.352 |
| Classification tree | 0.332 | 0.360 |
| Generalized Additive Models (GAM) | 0.201 | 0.276 |
| Neutral Network | 0.332 | 0.289 |
creditcost <- function(observed, predicted) {
weight1 = 5
weight0 = 1
c1 = (observed == 1) & (predicted == 0) #logical vector - true if actual 1 but predict 0
c0 = (observed == 0) & (predicted == 1) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
cost1 <- function(r, pi) {
mean(((r == 0) & (pi > pcut)) | ((r == 1) & (pi < pcut)))
}
# German Credit score
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) <- c("chk_acct","duration","credit_his",
"purpose","amount","saving_acct",
"present_emp","installment_rate",
"sex","other_debtor","present_resid",
"property","age","other_install",
"housing","n_credits","job","n_people",
"telephone","foreign","response")
#orginal response coding 1= good, 2 = bad
#we need 0 = good, 1 = bad
german_credit$response <- german_credit$response - 1
summary(german_credit)
## chk_acct duration credit_his purpose amount
## A11:274 Min. : 4.0 A30: 40 A43 :280 Min. : 250
## A12:269 1st Qu.:12.0 A31: 49 A40 :234 1st Qu.: 1366
## A13: 63 Median :18.0 A32:530 A42 :181 Median : 2320
## A14:394 Mean :20.9 A33: 88 A41 :103 Mean : 3271
## 3rd Qu.:24.0 A34:293 A49 : 97 3rd Qu.: 3972
## Max. :72.0 A46 : 50 Max. :18424
## (Other): 55
## saving_acct present_emp installment_rate sex other_debtor
## A61:603 A71: 62 Min. :1.000 A91: 50 A101:907
## A62:103 A72:172 1st Qu.:2.000 A92:310 A102: 41
## A63: 63 A73:339 Median :3.000 A93:548 A103: 52
## A64: 48 A74:174 Mean :2.973 A94: 92
## A65:183 A75:253 3rd Qu.:4.000
## Max. :4.000
##
## present_resid property age other_install housing
## Min. :1.000 A121:282 Min. :19.00 A141:139 A151:179
## 1st Qu.:2.000 A122:232 1st Qu.:27.00 A142: 47 A152:713
## Median :3.000 A123:332 Median :33.00 A143:814 A153:108
## Mean :2.845 A124:154 Mean :35.55
## 3rd Qu.:4.000 3rd Qu.:42.00
## Max. :4.000 Max. :75.00
##
## n_credits job n_people telephone foreign
## Min. :1.000 A171: 22 Min. :1.000 A191:596 A201:963
## 1st Qu.:1.000 A172:200 1st Qu.:1.000 A192:404 A202: 37
## Median :1.000 A173:630 Median :1.000
## Mean :1.407 A174:148 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :2.000
##
## response
## Min. :0.0
## 1st Qu.:0.0
## Median :0.0
## Mean :0.3
## 3rd Qu.:1.0
## Max. :1.0
##
corcredit <- cor(german_credit[,sapply(german_credit, is.numeric)])
corcredit
## duration amount installment_rate present_resid
## duration 1.00000000 0.62498420 0.07474882 0.034067202
## amount 0.62498420 1.00000000 -0.27131570 0.028926323
## installment_rate 0.07474882 -0.27131570 1.00000000 0.049302371
## present_resid 0.03406720 0.02892632 0.04930237 1.000000000
## age -0.03613637 0.03271642 0.05826568 0.266419184
## n_credits -0.01128360 0.02079455 0.02166874 0.089625233
## n_people -0.02383448 0.01714215 -0.07120694 0.042643426
## response 0.21492667 0.15473864 0.07240394 0.002967159
## age n_credits n_people response
## duration -0.03613637 -0.01128360 -0.023834475 0.214926665
## amount 0.03271642 0.02079455 0.017142154 0.154738641
## installment_rate 0.05826568 0.02166874 -0.071206943 0.072403937
## present_resid 0.26641918 0.08962523 0.042643426 0.002967159
## age 1.00000000 0.14925358 0.118200833 -0.091127409
## n_credits 0.14925358 1.00000000 0.109666700 -0.045732489
## n_people 0.11820083 0.10966670 1.000000000 -0.003014853
## response -0.09112741 -0.04573249 -0.003014853 1.000000000
corrplot(corcredit, method = "circle", type = "lower")
multi.hist(german_credit[,sapply(german_credit, is.numeric)])
set.seed(13003367)
index <- sample(nrow(german_credit),nrow(german_credit)*0.75)
german_credit.train = german_credit[index,]
german_credit.test = german_credit[-index,]
corcredit_train<- cor(german_credit.train[,sapply(german_credit.train, is.numeric)])
corcredit_train
## duration amount installment_rate present_resid
## duration 1.00000000 0.636463514 0.05308679 0.04103881
## amount 0.63646351 1.000000000 -0.27953064 0.01861770
## installment_rate 0.05308679 -0.279530643 1.00000000 0.02412715
## present_resid 0.04103881 0.018617700 0.02412715 1.00000000
## age -0.05868659 0.002539451 0.04679671 0.27478090
## n_credits -0.05899402 -0.013994091 0.02806994 0.09472021
## n_people -0.05044958 0.017221588 -0.07169067 0.03159107
## response 0.24258418 0.144924276 0.05078628 -0.02513619
## age n_credits n_people response
## duration -0.058686590 -0.05899402 -0.050449583 0.242584181
## amount 0.002539451 -0.01399409 0.017221588 0.144924276
## installment_rate 0.046796710 0.02806994 -0.071690665 0.050786283
## present_resid 0.274780897 0.09472021 0.031591075 -0.025136188
## age 1.000000000 0.14848121 0.127464991 -0.113388208
## n_credits 0.148481208 1.00000000 0.094696388 -0.045928552
## n_people 0.127464991 0.09469639 1.000000000 0.004905938
## response -0.113388208 -0.04592855 0.004905938 1.000000000
corrplot(corcredit_train, method = "circle", type = "lower")
multi.hist(german_credit.train[,sapply(german_credit.train, is.numeric)])
credit.glm0<- glm(response~chk_acct+duration+credit_his+purpose+saving_acct+present_emp+installment_rate+sex+other_debtor+other_install+housing
, family=binomial, data=german_credit.train)
summary(credit.glm0)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## other_install + housing, family = binomial, data = german_credit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2122 -0.6683 -0.3739 0.6477 2.7609
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.994246 0.882645 2.259 0.023859 *
## chk_acctA12 -0.353052 0.254249 -1.389 0.164953
## chk_acctA13 -1.296411 0.445150 -2.912 0.003588 **
## chk_acctA14 -1.731445 0.273922 -6.321 2.60e-10 ***
## duration 0.059414 0.008842 6.720 1.82e-11 ***
## credit_hisA31 -0.703610 0.669191 -1.051 0.293059
## credit_hisA32 -1.200999 0.540284 -2.223 0.026222 *
## credit_hisA33 -1.591907 0.598062 -2.662 0.007773 **
## credit_hisA34 -1.779766 0.563392 -3.159 0.001583 **
## purposeA41 -1.462401 0.422753 -3.459 0.000542 ***
## purposeA410 -2.128417 1.098049 -1.938 0.052579 .
## purposeA42 -0.860418 0.301567 -2.853 0.004329 **
## purposeA43 -0.951082 0.289518 -3.285 0.001020 **
## purposeA44 -1.584501 1.008969 -1.570 0.116318
## purposeA45 -0.437712 0.655701 -0.668 0.504422
## purposeA46 0.219608 0.480036 0.457 0.647325
## purposeA48 -15.470178 516.272534 -0.030 0.976095
## purposeA49 -0.505941 0.374718 -1.350 0.176955
## saving_acctA62 -0.513466 0.329444 -1.559 0.119094
## saving_acctA63 -0.210824 0.434261 -0.485 0.627338
## saving_acctA64 -2.200037 0.782410 -2.812 0.004925 **
## saving_acctA65 -0.902995 0.305571 -2.955 0.003126 **
## present_empA72 0.126075 0.430060 0.293 0.769403
## present_empA73 -0.311434 0.404896 -0.769 0.441793
## present_empA74 -0.869820 0.456916 -1.904 0.056952 .
## present_empA75 -0.629723 0.427148 -1.474 0.140414
## installment_rate 0.259887 0.092799 2.801 0.005102 **
## sexA92 -0.310684 0.442379 -0.702 0.482491
## sexA93 -0.830914 0.434200 -1.914 0.055663 .
## sexA94 -0.458736 0.514179 -0.892 0.372302
## other_debtorA102 0.273243 0.457863 0.597 0.550656
## other_debtorA103 -1.264655 0.497591 -2.542 0.011036 *
## other_installA142 -0.084417 0.506373 -0.167 0.867599
## other_installA143 -0.750687 0.278105 -2.699 0.006949 **
## housingA152 -0.554014 0.257881 -2.148 0.031688 *
## housingA153 -0.630730 0.399290 -1.580 0.114192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 902.33 on 749 degrees of freedom
## Residual deviance: 654.75 on 714 degrees of freedom
## AIC: 726.75
##
## Number of Fisher Scoring iterations: 14
credit.glm0$deviance
## [1] 654.7543
AIC(credit.glm0)
## [1] 726.7543
BIC(credit.glm0)
## [1] 893.0769
hist(predict(credit.glm0))
hist(predict(credit.glm0,type="response"))
pred.glm.train<- (predict(credit.glm0, type="response"))
pred <- prediction(pred.glm.train, german_credit.train$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8393322
table(predict(credit.glm0,type="response") > (1/6))
##
## FALSE TRUE
## 323 427
pcut1<- 1/6
class.glm.train<- (pred.glm.train>pcut1)*1
MR_train<- mean(german_credit.train$response!=class.glm.train)
MR_train
## [1] 0.3386667
FPR_train<- sum(german_credit.train$response==0 & class.glm.train==1)/sum(german_credit.train$response==0)
FPR_train
## [1] 0.435272
FNR_train<- sum(german_credit.train$response==1 & class.glm.train==0)/sum(german_credit.train$response==1)
FNR_train
## [1] 0.1013825
pred.glm.test<- predict(credit.glm0, newdata = german_credit.test, type="response")
pred <- prediction(pred.glm.test, german_credit.test$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7562225
pcut<- 1/6
class.glm.test<- (pred.glm.test>pcut)*1
MR_test<- mean(german_credit.test$response!=class.glm.test)
MR_test
## [1] 0.352
FPR_test<- sum(german_credit.test$response==0 & class.glm.test==1)/sum(german_credit.test$response==0)
FPR_test
## [1] 0.4191617
FNR_test<- sum(german_credit.test$response==1 & class.glm.test==0)/sum(german_credit.test$response==1)
FNR_test
## [1] 0.2168675
Here we make the assumption that false negative cost 5 times of false positive.
credit.rpart <- rpart(formula = response~ ., data =german_credit.train, method = "class", parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
prp(credit.rpart, extra = 1)
credit.train.pred.tree1<- predict(credit.rpart, german_credit.train, type="class")
table(german_credit.train$response, credit.train.pred.tree1, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 301 232
## 1 17 200
Missclassfication rate:
MR=(232+17)/(301+232+17+200)
MR
## [1] 0.332
creditcost(german_credit.test$response, credit.train.pred.tree1)
## [1] 1.14
credit.test.pred.tree1<- predict(credit.rpart, german_credit.test, type="class")
table(german_credit.test$response, credit.test.pred.tree1, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 93 74
## 1 16 67
MR=(74+16)/(93+74+16+67)
MR
## [1] 0.36
creditcost(german_credit.train$response, credit.train.pred.tree1)
## [1] 0.4226667
#########################################################################
## Create a formula for a model with a large number of variables:
gam_formula <- as.formula(paste("response~s(duration)+amount+age+chk_acct+credit_his+purpose+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property+other_install+housing+n_credits+job+n_people+telephone+foreign"))
fitGAM <- gam(formula = gam_formula, family = binomial, data = german_credit.train)
summary(fitGAM)
##
## Family: binomial
## Link function: logit
##
## Formula:
## response ~ s(duration) + amount + age + chk_acct + credit_his +
## purpose + saving_acct + present_emp + installment_rate +
## sex + other_debtor + present_resid + property + other_install +
## housing + n_credits + job + n_people + telephone + foreign
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.864e+00 1.392e+00 1.339 0.180546
## amount 1.091e-04 5.467e-05 1.995 0.046043 *
## age -1.290e-02 1.104e-02 -1.169 0.242522
## chk_acctA12 -3.481e-01 2.647e-01 -1.315 0.188588
## chk_acctA13 -1.246e+00 4.633e-01 -2.690 0.007155 **
## chk_acctA14 -1.820e+00 2.862e-01 -6.360 2.02e-10 ***
## credit_hisA31 -3.694e-01 7.152e-01 -0.517 0.605491
## credit_hisA32 -9.350e-01 5.901e-01 -1.584 0.113091
## credit_hisA33 -1.546e+00 6.208e-01 -2.491 0.012745 *
## credit_hisA34 -1.751e+00 5.906e-01 -2.964 0.003036 **
## purposeA41 -1.617e+00 4.477e-01 -3.612 0.000304 ***
## purposeA410 -2.248e+00 1.182e+00 -1.902 0.057201 .
## purposeA42 -1.010e+00 3.214e-01 -3.142 0.001680 **
## purposeA43 -1.005e+00 3.002e-01 -3.349 0.000810 ***
## purposeA44 -1.593e+00 1.007e+00 -1.581 0.113768
## purposeA45 -5.908e-01 6.702e-01 -0.881 0.378071
## purposeA46 1.758e-01 4.925e-01 0.357 0.721101
## purposeA48 -3.659e+01 2.740e+07 0.000 0.999999
## purposeA49 -5.651e-01 3.903e-01 -1.448 0.147604
## saving_acctA62 -6.116e-01 3.442e-01 -1.777 0.075591 .
## saving_acctA63 -1.226e-01 4.464e-01 -0.275 0.783510
## saving_acctA64 -2.222e+00 8.029e-01 -2.768 0.005638 **
## saving_acctA65 -8.797e-01 3.155e-01 -2.788 0.005296 **
## present_empA72 -1.092e-01 5.142e-01 -0.212 0.831881
## present_empA73 -6.084e-01 4.968e-01 -1.225 0.220733
## present_empA74 -1.187e+00 5.399e-01 -2.198 0.027940 *
## present_empA75 -8.287e-01 5.013e-01 -1.653 0.098284 .
## installment_rate 3.418e-01 1.060e-01 3.225 0.001259 **
## sexA92 -2.731e-01 4.682e-01 -0.583 0.559764
## sexA93 -8.944e-01 4.590e-01 -1.948 0.051361 .
## sexA94 -3.870e-01 5.425e-01 -0.713 0.475662
## other_debtorA102 3.342e-01 4.682e-01 0.714 0.475417
## other_debtorA103 -1.162e+00 5.168e-01 -2.248 0.024591 *
## present_resid -4.579e-02 1.043e-01 -0.439 0.660569
## propertyA122 2.259e-01 3.033e-01 0.745 0.456401
## propertyA123 4.147e-01 2.809e-01 1.476 0.139877
## propertyA124 3.597e-01 5.212e-01 0.690 0.490073
## other_installA142 -1.362e-01 5.131e-01 -0.265 0.790665
## other_installA143 -7.578e-01 2.868e-01 -2.642 0.008243 **
## housingA152 -5.684e-01 2.776e-01 -2.048 0.040607 *
## housingA153 -6.815e-01 5.730e-01 -1.189 0.234321
## n_credits 4.518e-01 2.414e-01 1.872 0.061214 .
## jobA172 3.233e-01 7.976e-01 0.405 0.685259
## jobA173 5.457e-01 7.665e-01 0.712 0.476465
## jobA174 1.552e-01 7.836e-01 0.198 0.842963
## n_people 3.519e-01 3.005e-01 1.171 0.241547
## telephoneA192 -3.657e-01 2.519e-01 -1.451 0.146687
## foreignA202 -1.207e+00 7.248e-01 -1.665 0.095856 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(duration) 1 1 15.12 0.000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.289 Deviance explained = 29.8%
## UBRE = -0.024333 Scale est. = 1 n = 750
##insample
fittedGAM.insample.prob <- predict(fitGAM)
fittedGAM.insample <- as.numeric(fittedGAM.insample.prob > pcut)
TrainMisGAM <- mean(ifelse(german_credit.train$response != fittedGAM.insample, 1, 0))
TrainMisGAM
## [1] 0.2013333
##out-sample
fittedGAM.outsample.prob <- predict(fitGAM, german_credit.test)
fittedGAM.outsample <- as.numeric(fittedGAM.outsample.prob > pcut)
TestMisGAM <- mean(ifelse(german_credit.test$response != fittedGAM.outsample, 1, 0))
TestMisGAM
## [1] 0.276
library(nnet)
## Warning: package 'nnet' was built under R version 3.4.4
##
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
##
## multinom
credit.nnet <- nnet(response~., data=german_credit.train, size=1, maxit=500)
## # weights: 51
## initial value 229.054492
## final value 217.000000
## converged
prob.nnet= predict(credit.nnet,german_credit.test)
pred.nnet = as.numeric(prob.nnet >1/6)
table(german_credit.test$response,pred.nnet, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0
## 0 167
## 1 83
In-sample
mean(ifelse(german_credit.test$response!= pred.nnet, 1, 0))
## [1] 0.332
creditcost(german_credit.test$response, pred.nnet)
## [1] 1.66
Out of-sample
mean(ifelse(german_credit.train$response!= pred.nnet, 1, 0))
## [1] 0.2893333
creditcost(german_credit.train$response, pred.nnet)
## [1] 1.446667