The German credit scoring data is a dataset provided by Prof. Hogmann in the file german.data. The data set has information of 1000 individuals, on the basis of which they have been classified as risky or not. The variable response in the dataset corresponds to the risk label, 1 has been classified as good and 2 has been classified as bad. In our analysis, we have changed these labels to 0 and 1, 0 corresponding to a good credit record and 1 corresponding to a bad one.
library(ROCR)
library(ggplot2)
library(tibble)
library(class)
library(dplyr)
library(readxl)
library(tibble)
library(boot)
library(glmnet)
library(MASS)
library(rpart)
library(rpart.plot)
# Input Data
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) <- c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
german_credit$response <- german_credit$response - 1
german_credit$response <- as.factor(german_credit$response)
# Splitting data into train and test
set.seed(13472138)
index <- sample(nrow(german_credit),nrow(german_credit)*0.70)
german.train = german_credit[index,]
german.test = german_credit[-index,]
# Summary of the data
summary(german_credit)
## chk_acct duration credit_his purpose amount saving_acct
## A11:274 Min. : 4.0 A30: 40 A43 :280 Min. : 250 A61:603
## A12:269 1st Qu.:12.0 A31: 49 A40 :234 1st Qu.: 1366 A62:103
## A13: 63 Median :18.0 A32:530 A42 :181 Median : 2320 A63: 63
## A14:394 Mean :20.9 A33: 88 A41 :103 Mean : 3271 A64: 48
## 3rd Qu.:24.0 A34:293 A49 : 97 3rd Qu.: 3972 A65:183
## Max. :72.0 A46 : 50 Max. :18424
## (Other): 55
## present_emp installment_rate sex other_debtor present_resid property
## A71: 62 Min. :1.000 A91: 50 A101:907 Min. :1.000 A121:282
## A72:172 1st Qu.:2.000 A92:310 A102: 41 1st Qu.:2.000 A122:232
## A73:339 Median :3.000 A93:548 A103: 52 Median :3.000 A123:332
## A74:174 Mean :2.973 A94: 92 Mean :2.845 A124:154
## A75:253 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000
##
## age other_install housing n_credits job
## Min. :19.00 A141:139 A151:179 Min. :1.000 A171: 22
## 1st Qu.:27.00 A142: 47 A152:713 1st Qu.:1.000 A172:200
## Median :33.00 A143:814 A153:108 Median :1.000 A173:630
## Mean :35.55 Mean :1.407 A174:148
## 3rd Qu.:42.00 3rd Qu.:2.000
## Max. :75.00 Max. :4.000
##
## n_people telephone foreign response
## Min. :1.000 A191:596 A201:963 0:700
## 1st Qu.:1.000 A192:404 A202: 37 1:300
## Median :1.000
## Mean :1.155
## 3rd Qu.:1.000
## Max. :2.000
##
# Visualizing distribution of data
ggplot(german_credit, aes(x=response, y=amount, fill=response)) +
geom_boxplot()
ggplot(german_credit, aes(x=response, y=duration, fill=response)) +
geom_boxplot()
ggplot(german_credit, aes(x=response, y=installment_rate, fill=response)) +
geom_boxplot()
ggplot(german_credit, aes(x=response, y=age, fill=response)) +
geom_boxplot()
ggplot(german_credit, aes(x=response, y=n_credits, fill=response)) +
geom_boxplot()
ggplot(german_credit, aes(x=response, y=n_people, fill=response)) +
geom_boxplot()
# Visualization of Categorical Data
ggplot(german_credit, aes(chk_acct, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(credit_his, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(purpose, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(other_debtor, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(sex, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(other_install, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
ggplot(german_credit, aes(foreign, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
Running Logistic Regression on all variables. Using stepwise AIC to select most important variables
german.glm0<- glm(response~., family=binomial, data=german.train)
summary(german.glm0)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = german.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6528 -0.6763 -0.3246 0.6780 2.6199
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.860e-01 1.290e+00 -0.454 0.64966
## chk_acctA12 -3.226e-01 2.676e-01 -1.206 0.22795
## chk_acctA13 -1.079e+00 4.550e-01 -2.372 0.01768 *
## chk_acctA14 -1.963e+00 2.967e-01 -6.617 3.67e-11 ***
## duration 3.533e-02 1.213e-02 2.913 0.00358 **
## credit_hisA31 9.005e-01 7.062e-01 1.275 0.20228
## credit_hisA32 -5.349e-02 5.848e-01 -0.091 0.92712
## credit_hisA33 -4.204e-01 6.421e-01 -0.655 0.51265
## credit_hisA34 -8.857e-01 5.965e-01 -1.485 0.13761
## purposeA41 -1.441e+00 4.456e-01 -3.233 0.00122 **
## purposeA410 4.492e-02 1.052e+00 0.043 0.96594
## purposeA42 -5.103e-01 3.127e-01 -1.632 0.10266
## purposeA43 -8.464e-01 3.042e-01 -2.783 0.00539 **
## purposeA44 -3.723e-01 8.555e-01 -0.435 0.66337
## purposeA45 -3.261e-01 6.654e-01 -0.490 0.62401
## purposeA46 3.105e-02 4.998e-01 0.062 0.95046
## purposeA48 -1.973e+00 1.317e+00 -1.498 0.13424
## purposeA49 -3.633e-01 4.105e-01 -0.885 0.37625
## amount 1.113e-04 5.657e-05 1.968 0.04912 *
## saving_acctA62 -3.958e-01 3.455e-01 -1.146 0.25197
## saving_acctA63 -1.329e-01 4.531e-01 -0.293 0.76933
## saving_acctA64 -1.686e+00 6.726e-01 -2.506 0.01221 *
## saving_acctA65 -1.040e+00 3.355e-01 -3.099 0.00194 **
## present_empA72 -4.061e-02 5.032e-01 -0.081 0.93568
## present_empA73 -3.969e-01 4.907e-01 -0.809 0.41860
## present_empA74 -1.260e+00 5.357e-01 -2.353 0.01863 *
## present_empA75 -4.144e-01 5.111e-01 -0.811 0.41748
## installment_rate 3.658e-01 1.118e-01 3.273 0.00106 **
## sexA92 -1.018e-01 4.918e-01 -0.207 0.83603
## sexA93 -8.231e-01 4.931e-01 -1.669 0.09507 .
## sexA94 -2.326e-01 5.577e-01 -0.417 0.67670
## other_debtorA102 6.796e-01 5.376e-01 1.264 0.20613
## other_debtorA103 -1.159e+00 5.059e-01 -2.292 0.02193 *
## present_resid -9.671e-02 1.057e-01 -0.915 0.36011
## propertyA122 2.609e-01 3.038e-01 0.859 0.39047
## propertyA123 1.866e-01 2.865e-01 0.651 0.51492
## propertyA124 4.593e-01 5.321e-01 0.863 0.38809
## age -5.643e-03 1.158e-02 -0.487 0.62601
## other_installA142 2.069e-02 5.332e-01 0.039 0.96905
## other_installA143 -5.441e-01 3.098e-01 -1.756 0.07903 .
## housingA152 -6.978e-01 2.881e-01 -2.422 0.01543 *
## housingA153 -3.018e-01 5.923e-01 -0.510 0.61030
## n_credits 4.074e-01 2.352e-01 1.732 0.08327 .
## jobA172 6.604e-01 7.594e-01 0.870 0.38444
## jobA173 7.706e-01 7.275e-01 1.059 0.28948
## jobA174 7.519e-01 7.533e-01 0.998 0.31824
## n_people 2.974e-01 3.287e-01 0.905 0.36557
## telephoneA192 -3.137e-01 2.496e-01 -1.257 0.20877
## foreignA202 -1.460e+00 7.754e-01 -1.883 0.05971 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.48 on 699 degrees of freedom
## Residual deviance: 606.27 on 651 degrees of freedom
## AIC: 704.27
##
## Number of Fisher Scoring iterations: 5
german.glm1 <- step(german.glm0)
## Start: AIC=704.27
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - property 3 607.40 699.40
## - job 3 607.50 699.50
## - age 1 606.51 702.51
## - n_people 1 607.08 703.08
## - present_resid 1 607.11 703.11
## - telephone 1 607.86 703.86
## - other_install 2 610.18 704.18
## <none> 606.27 704.27
## - purpose 9 624.62 704.62
## - n_credits 1 609.29 705.29
## - amount 1 610.16 706.16
## - housing 2 612.22 706.22
## - foreign 1 610.66 706.66
## - sex 3 615.08 707.08
## - other_debtor 2 614.17 708.17
## - present_emp 4 618.79 708.79
## - credit_his 4 619.80 709.80
## - duration 1 614.87 710.87
## - saving_acct 4 622.96 712.96
## - installment_rate 1 617.38 713.38
## - chk_acct 3 663.36 755.36
##
## Step: AIC=699.4
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + age + other_install + housing + n_credits +
## job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 608.73 694.73
## - age 1 607.69 697.69
## - present_resid 1 608.18 698.18
## - n_people 1 608.21 698.21
## - telephone 1 608.80 698.80
## <none> 607.40 699.40
## - other_install 2 611.53 699.53
## - purpose 9 626.00 700.00
## - n_credits 1 610.26 700.26
## - amount 1 611.61 701.61
## - foreign 1 611.80 701.80
## - sex 3 615.83 701.83
## - housing 2 614.97 702.97
## - other_debtor 2 615.67 703.67
## - present_emp 4 620.70 704.70
## - credit_his 4 620.84 704.84
## - duration 1 616.72 706.72
## - saving_acct 4 624.11 708.11
## - installment_rate 1 618.84 708.84
## - chk_acct 3 666.23 752.23
##
## Step: AIC=694.73
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + age + other_install + housing + n_credits +
## n_people + telephone + foreign
##
## Df Deviance AIC
## - age 1 609.10 693.10
## - n_people 1 609.39 693.39
## - present_resid 1 609.45 693.45
## - telephone 1 609.88 693.88
## - purpose 9 626.63 694.63
## <none> 608.73 694.73
## - other_install 2 613.04 695.04
## - n_credits 1 611.45 695.45
## - sex 3 616.97 696.97
## - amount 1 613.07 697.07
## - foreign 1 613.16 697.16
## - housing 2 616.64 698.64
## - other_debtor 2 617.13 699.13
## - present_emp 4 621.21 699.21
## - credit_his 4 621.88 699.88
## - duration 1 618.51 702.51
## - saving_acct 4 625.82 703.82
## - installment_rate 1 620.72 704.72
## - chk_acct 3 666.74 746.74
##
## Step: AIC=693.1
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + other_install + housing + n_credits + n_people +
## telephone + foreign
##
## Df Deviance AIC
## - n_people 1 609.65 691.65
## - present_resid 1 609.99 691.99
## - telephone 1 610.41 692.41
## - purpose 9 627.05 693.05
## <none> 609.10 693.10
## - other_install 2 613.34 693.34
## - n_credits 1 611.85 693.85
## - sex 3 617.44 695.44
## - amount 1 613.53 695.53
## - foreign 1 613.65 695.65
## - housing 2 617.38 697.38
## - present_emp 4 621.54 697.54
## - other_debtor 2 617.59 697.59
## - credit_his 4 622.35 698.35
## - duration 1 618.98 700.98
## - saving_acct 4 626.23 702.23
## - installment_rate 1 620.99 702.99
## - chk_acct 3 667.48 745.48
##
## Step: AIC=691.65
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + other_install + housing + n_credits + telephone +
## foreign
##
## Df Deviance AIC
## - present_resid 1 610.48 690.48
## - telephone 1 610.92 690.92
## <none> 609.65 691.65
## - other_install 2 613.78 691.78
## - purpose 9 627.89 691.89
## - n_credits 1 612.69 692.69
## - sex 3 617.44 693.44
## - foreign 1 614.07 694.07
## - amount 1 614.11 694.11
## - housing 2 617.90 695.90
## - other_debtor 2 617.96 695.96
## - present_emp 4 622.34 696.34
## - credit_his 4 623.96 697.96
## - duration 1 619.26 699.26
## - saving_acct 4 626.64 700.64
## - installment_rate 1 621.21 701.21
## - chk_acct 3 668.08 744.08
##
## Step: AIC=690.48
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## other_install + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - telephone 1 611.88 689.88
## <none> 610.48 690.48
## - other_install 2 614.74 690.74
## - purpose 9 629.19 691.19
## - n_credits 1 613.35 691.35
## - sex 3 618.18 692.18
## - foreign 1 614.77 692.77
## - amount 1 615.08 693.08
## - housing 2 617.91 693.91
## - other_debtor 2 618.89 694.89
## - present_emp 4 623.75 695.75
## - credit_his 4 625.25 697.25
## - duration 1 619.81 697.81
## - installment_rate 1 621.84 699.84
## - saving_acct 4 627.99 699.99
## - chk_acct 3 668.17 742.17
##
## Step: AIC=689.88
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## other_install + housing + n_credits + foreign
##
## Df Deviance AIC
## <none> 611.88 689.88
## - other_install 2 616.02 690.02
## - purpose 9 630.35 690.35
## - n_credits 1 614.56 690.56
## - amount 1 615.56 691.56
## - sex 3 619.68 691.68
## - foreign 1 615.78 691.78
## - housing 2 619.28 693.28
## - other_debtor 2 620.36 694.36
## - present_emp 4 625.57 695.57
## - credit_his 4 626.57 696.57
## - duration 1 622.02 698.02
## - installment_rate 1 622.68 698.68
## - saving_acct 4 629.08 699.08
## - chk_acct 3 671.93 743.93
Finding the optimal probability cutoff value
# predicting on train set
predict_logit_train <- predict(german.glm1, type="response")
# define a cost function with input "obs" being observed response
# and "pi" being predicted probability, and "pcut" being the threshold.
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
} # end of the function
# define a sequence from 0.01 to 1 by 0.01
p.seq = seq(0.01, 1, 0.01)
# write a loop for all p-cut to see which one provides the smallest cost
# first, need to define a 0 vector in order to save the value of cost from all pcut
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = german.train$response, pred.p = predict_logit_train, pcut = p.seq[i])
} # end of the loop
optimal.pcut = p.seq[which(cost==min(cost))][1]
optimal.pcut
## [1] 0.14
plot(p.seq, cost)
Train and Test Predictions
pred.glm.gtrain.glm0 <- predict(german.glm1, type = "response")
pred.glm.gtest.glm0 <- predict(german.glm1, newdata=german.test,type = "response")
pred.train <- as.numeric(pred.glm.gtrain.glm0 > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.glm0 > optimal.pcut)
confusion_matrix_train <- table(german.train$response, pred.train)
confusion_matrix_train
## pred.train
## 0 1
## 0 251 229
## 1 15 205
confusion_matrix_test <- table(german.test$response, pred.test)
confusion_matrix_test
## pred.test
## 0 1
## 0 105 115
## 1 11 69
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_train
## [1] 0.35
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
misclassification_rate_test
## [1] 0.42
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.35 | test misclassfication rate: 0.42
ROC Curve and AUC
#Train
#ROC
pred <- prediction(pred.glm.gtrain.glm0, german.train$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8515436
#Test
#ROC
pred <- prediction(pred.glm.gtest.glm0, german.test$response)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7554545
3 Fold Cross Validation
# Performing Cross Validation Using Asymmetric Cost Function
costfunc = function(obs, pred.p){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<optimal.pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=optimal.pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
} # end of the function
#pcut <- 1/6
german_credit.glm1 <- glm(response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + installment_rate + sex + other_debtor + other_install +
housing + telephone + foreign, family = binomial, german_credit)
cv.result <- cv.glm(data = german_credit,glmfit = german_credit.glm1,cost = costfunc ,K=3)
# the assymetrical classification rate
cv.result$delta[2]
## [1] 0.53198
# Cross Validation with AUC as cost function
auccost = function(obs,pred.p)
{ #cost needs observed first then
pred <- prediction(pred.p,obs) #prediction needs predicted first then
cost = unlist(slot(performance(pred,"auc"),"y.values")) # AUC
return(cost)
}
cv.result <- cv.glm(data = german_credit,glmfit = german_credit.glm1,cost = auccost ,K=3)
# the assymetrical classification rate
cv.result$delta[2]
## [1] 0.7932289
Getting a lengthy tree by using all the variables and cp very low
german_largetree <- rpart(formula = response~., data = german.train,
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
prp(german_largetree, extra = 1, nn.font=40,box.palette = "red")
Pruning the model
plotcp(german_largetree)
printcp(german_largetree)
##
## Classification tree:
## rpart(formula = response ~ ., data = german.train, parms = list(loss = matrix(c(0,
## 5, 1, 0), nrow = 2)))
##
## Variables actually used in tree construction:
## [1] age chk_acct credit_his duration other_install
## [6] property purpose saving_acct sex
##
## Root node error: 480/700 = 0.68571
##
## n= 700
##
## CP nsplit rel error xerror xstd
## 1 0.206250 0 1.00000 5.0000 0.12794
## 2 0.066667 1 0.79375 2.3417 0.12602
## 3 0.023958 2 0.72708 3.1313 0.13463
## 4 0.018750 6 0.63125 2.9229 0.13293
## 5 0.016667 7 0.61250 2.8083 0.13163
## 6 0.014583 8 0.59583 2.7354 0.13094
## 7 0.010000 11 0.55208 2.5562 0.12870
Pruning the tree using optimal complexity parameter and then plotting the optimal tree
german.prunedtree <- rpart(response~., data = german.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)),cp=0.018750)
prp(german.prunedtree, extra = 1, nn.font=500,box.palette = "red")
Predictions
pred0<- predict(german.prunedtree, type="class")
table(german.train$response, pred0, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 242 238
## 1 13 207
pred1<- predict(german.prunedtree, newdata=german.test, type="class")
table(german.test$response, pred1, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 101 119
## 1 12 68
Train and Test Asymmetric Misclassfication Rate or Asymmetric Misclassification Cost
pred.glm.gtrain.tree <- predict(german.prunedtree, type = "prob")[,2]
pred.glm.gtest.tree <- predict(german.prunedtree, newdata=german.test,type = "prob")[,2]
pred.train <- as.numeric(pred.glm.gtrain.tree > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.tree > optimal.pcut)
confusion_matrix_train <- table(german.train$response, pred.train)
confusion_matrix_train
## pred.train
## 0 1
## 0 242 238
## 1 13 207
confusion_matrix_test <- table(german.test$response, pred.test)
confusion_matrix_test
## pred.test
## 0 1
## 0 101 119
## 1 12 68
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_train
## [1] 0.36
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
misclassification_rate_test
## [1] 0.44
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.36 | test misclassfication rate: 0.44
Test ROC and AUC
#ROC
pred_tree = prediction(pred.glm.gtest.tree, german.test$response)
perf_tree = performance(pred_tree, "tpr", "fpr")
plot(perf_tree, colorize=TRUE)
#AUC
unlist(slot(performance(pred_tree, "auc"), "y.values"))
## [1] 0.6640909
MAJOR FINDING: Misclassication Rate and AUC of Logistic Regression is better than Decision Tree. Therefore we can conclude prediction power of Logistic Regression > Decision Tree