library(ggplot2)
library(data.table)
library(gridExtra)
library(glmnet)
library(caret)
library("verification")
german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
# orginal response coding 1= good, 2 = bad we need 0 = good, 1 = bad
german_credit$response = german_credit$response - 1
## For building the model, we need to divide the dataset in test and train
set.seed(12428258)
flag<-sample(nrow(german_credit),0.8*nrow(german_credit),replace = FALSE)
train<-german_credit[flag,]
test<-german_credit[-flag,]
dim(train)
## [1] 800 21
dim(test)
## [1] 200 21
#Exploratory Data Analysis
str(train)
## 'data.frame': 800 obs. of 21 variables:
## $ chk_acct : Factor w/ 4 levels "A11","A12","A13",..: 4 1 2 4 1 3 2 2 2 1 ...
## $ duration : int 7 36 18 6 18 12 12 10 9 24 ...
## $ credit_his : Factor w/ 5 levels "A30","A31","A32",..: 5 5 4 3 3 3 3 3 3 4 ...
## $ purpose : Factor w/ 10 levels "A40","A41","A410",..: 5 4 1 5 2 5 4 1 5 1 ...
## $ amount : int 730 2348 2899 1346 7511 1881 983 7308 1206 1333 ...
## $ saving_acct : Factor w/ 5 levels "A61","A62","A63",..: 5 1 5 2 5 1 4 1 1 1 ...
## $ present_emp : Factor w/ 5 levels "A71","A72","A73",..: 5 3 5 5 5 3 2 1 5 1 ...
## $ installment_rate: int 4 3 4 2 1 2 1 2 4 4 ...
## $ sex : Factor w/ 4 levels "A91","A92","A93",..: 3 4 3 3 3 2 2 3 2 3 ...
## $ other_debtor : Factor w/ 3 levels "A101","A102",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ present_resid : int 2 2 4 4 4 2 4 4 4 2 ...
## $ property : Factor w/ 4 levels "A121","A122",..: 2 2 3 4 2 3 1 4 1 1 ...
## $ age : int 46 46 43 42 51 44 19 70 25 43 ...
## $ other_install : Factor w/ 3 levels "A141","A142",..: 3 3 3 1 3 3 3 1 3 3 ...
## $ housing : Factor w/ 3 levels "A151","A152",..: 1 2 2 3 3 1 1 3 2 3 ...
## $ n_credits : int 2 2 1 1 1 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "A171","A172",..: 2 3 3 3 3 2 2 4 3 3 ...
## $ n_people : int 1 1 2 2 2 1 1 1 1 2 ...
## $ telephone : Factor w/ 2 levels "A191","A192": 2 2 1 2 2 2 1 2 1 1 ...
## $ foreign : Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
## $ response : num 0 0 0 0 1 0 0 0 0 1 ...
table(train$response)
##
## 0 1
## 560 240
summary(train)
## chk_acct duration credit_his purpose amount
## A11:217 Min. : 4.00 A30: 35 A43 :232 Min. : 250
## A12:215 1st Qu.:12.00 A31: 40 A40 :181 1st Qu.: 1382
## A13: 51 Median :18.00 A32:422 A42 :151 Median : 2320
## A14:317 Mean :20.86 A33: 73 A41 : 81 Mean : 3229
## 3rd Qu.:24.00 A34:230 A49 : 72 3rd Qu.: 3960
## Max. :72.00 A46 : 36 Max. :18424
## (Other): 47
## saving_acct present_emp installment_rate sex other_debtor
## A61:487 A71: 49 Min. :1.000 A91: 45 A101:723
## A62: 81 A72:151 1st Qu.:2.000 A92:247 A102: 35
## A63: 44 A73:270 Median :3.000 A93:432 A103: 42
## A64: 42 A74:126 Mean :2.966 A94: 76
## A65:146 A75:204 3rd Qu.:4.000
## Max. :4.000
##
## present_resid property age other_install housing
## Min. :1.000 A121:230 Min. :19.00 A141:119 A151:147
## 1st Qu.:2.000 A122:179 1st Qu.:27.00 A142: 40 A152:562
## Median :3.000 A123:266 Median :33.00 A143:641 A153: 91
## Mean :2.855 A124:125 Mean :35.49
## 3rd Qu.:4.000 3rd Qu.:42.00
## Max. :4.000 Max. :75.00
##
## n_credits job n_people telephone foreign
## Min. :1.000 A171: 17 Min. :1.000 A191:477 A201:771
## 1st Qu.:1.000 A172:156 1st Qu.:1.000 A192:323 A202: 29
## Median :1.000 A173:511 Median :1.000
## Mean :1.403 A174:116 Mean :1.151
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :2.000
##
## response
## Min. :0.0
## 1st Qu.:0.0
## Median :0.0
## Mean :0.3
## 3rd Qu.:1.0
## Max. :1.0
##
## Check for categorical variables
# indices of categorical variables-
cat<-c(1,3,4,6,7,9,10,12,14,15,17,19,20)
## Continueous variables
ggplot(data = train,aes(x =factor(response),y = duration))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = amount))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = installment_rate))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = present_resid))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = age))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = n_credits))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = n_people))+geom_boxplot()

### Duration and amount seem to have some impact of reponse
cor(train[,-cat])[,8]
## duration amount installment_rate present_resid
## 0.206373426 0.130613716 0.099433367 -0.005429762
## age n_credits n_people response
## -0.086874187 -0.031677233 -0.002283937 1.000000000
## Categorical Variables
ggplot(data = train, aes(x = train[,1], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[1],y='Response',title=colnames(train)[1])

ggplot(data = train, aes(x = train[,3], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[3],y='Response',title=colnames(train)[3])

ggplot(data = train, aes(x = train[,4], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[4],y='Response',title=colnames(train)[4])

ggplot(data = train, aes(x = train[,6], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[6],y='Response',title=colnames(train)[6])

ggplot(data = train, aes(x = train[,7], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[7],y='Response',title=colnames(train)[7])

ggplot(data = train, aes(x = train[,9], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[9],y='Response',title=colnames(train)[9])

ggplot(data = train, aes(x = train[,10], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[10],y='Response',title=colnames(train)[10])

ggplot(data = train, aes(x = train[,12], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[12],y='Response',title=colnames(train)[12])

ggplot(data = train, aes(x = train[,14], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[14],y='Response',title=colnames(train)[14])

ggplot(data = train, aes(x = train[,15], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[15],y='Response',title=colnames(train)[15])

ggplot(data = train, aes(x = train[,17], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[17],y='Response',title=colnames(train)[17])

ggplot(data = train, aes(x = train[,19], fill = factor(response))) +
geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[19],y='Response',title=colnames(train)[19])

## Modeling
full.glm<-glm(data = train,response~.,family = binomial)
summary(full.glm)
##
## Call:
## glm(formula = response ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4426 -0.7121 -0.3865 0.6821 2.5726
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.797e-01 1.235e+00 0.469 0.638741
## chk_acctA12 -3.921e-01 2.464e-01 -1.591 0.111591
## chk_acctA13 -9.260e-01 4.183e-01 -2.214 0.026832 *
## chk_acctA14 -1.582e+00 2.554e-01 -6.194 5.87e-10 ***
## duration 2.535e-02 1.026e-02 2.471 0.013477 *
## credit_hisA31 1.183e-01 6.056e-01 0.195 0.845138
## credit_hisA32 -5.498e-01 4.686e-01 -1.173 0.240680
## credit_hisA33 -7.275e-01 5.145e-01 -1.414 0.157361
## credit_hisA34 -1.532e+00 4.860e-01 -3.151 0.001627 **
## purposeA41 -1.513e+00 4.104e-01 -3.687 0.000227 ***
## purposeA410 -1.888e+00 9.042e-01 -2.088 0.036777 *
## purposeA42 -9.324e-01 2.969e-01 -3.141 0.001686 **
## purposeA43 -8.202e-01 2.741e-01 -2.992 0.002770 **
## purposeA44 -7.790e-01 8.319e-01 -0.936 0.349093
## purposeA45 -1.273e-01 5.739e-01 -0.222 0.824485
## purposeA46 2.013e-01 4.719e-01 0.427 0.669678
## purposeA48 -1.412e+00 1.190e+00 -1.186 0.235624
## purposeA49 -6.649e-01 3.792e-01 -1.754 0.079491 .
## amount 1.167e-04 5.271e-05 2.214 0.026806 *
## saving_acctA62 -3.959e-01 3.235e-01 -1.224 0.221041
## saving_acctA63 -1.282e-01 4.345e-01 -0.295 0.767955
## saving_acctA64 -1.121e+00 5.410e-01 -2.073 0.038179 *
## saving_acctA65 -1.089e+00 3.073e-01 -3.543 0.000395 ***
## present_empA72 -3.428e-01 4.798e-01 -0.714 0.474967
## present_empA73 -5.089e-01 4.635e-01 -1.098 0.272239
## present_empA74 -1.168e+00 5.098e-01 -2.291 0.021962 *
## present_empA75 -6.572e-01 4.642e-01 -1.416 0.156832
## installment_rate 3.530e-01 1.006e-01 3.507 0.000453 ***
## sexA92 -4.528e-01 4.173e-01 -1.085 0.277808
## sexA93 -7.762e-01 4.070e-01 -1.907 0.056519 .
## sexA94 -4.579e-01 4.879e-01 -0.938 0.347988
## other_debtorA102 6.212e-01 4.408e-01 1.409 0.158729
## other_debtorA103 -5.070e-01 4.519e-01 -1.122 0.261905
## present_resid 1.770e-03 9.816e-02 0.018 0.985618
## propertyA122 3.224e-01 2.851e-01 1.131 0.258154
## propertyA123 1.953e-01 2.613e-01 0.747 0.454902
## propertyA124 8.337e-01 4.716e-01 1.768 0.077081 .
## age -1.553e-02 1.006e-02 -1.543 0.122857
## other_installA142 -1.533e-01 4.505e-01 -0.340 0.733700
## other_installA143 -5.163e-01 2.632e-01 -1.962 0.049811 *
## housingA152 -5.886e-01 2.604e-01 -2.260 0.023792 *
## housingA153 -7.953e-01 5.310e-01 -1.498 0.134160
## n_credits 3.689e-01 2.122e-01 1.738 0.082160 .
## jobA172 6.941e-01 8.040e-01 0.863 0.387964
## jobA173 7.745e-01 7.743e-01 1.000 0.317163
## jobA174 7.045e-01 7.802e-01 0.903 0.366569
## n_people 1.107e-01 2.827e-01 0.392 0.695302
## telephoneA192 -3.339e-01 2.232e-01 -1.496 0.134747
## foreignA202 -1.919e+00 8.317e-01 -2.307 0.021027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 977.38 on 799 degrees of freedom
## Residual deviance: 725.89 on 751 degrees of freedom
## AIC: 823.89
##
## Number of Fisher Scoring iterations: 5
AIC(full.glm)
## [1] 823.8938
summary(full.glm$fitted.values)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001006 0.089539 0.224454 0.300000 0.474623 0.954198
### Defining optimup cutoff probability
df = seq(0.01, 0.99, 0.01)
result = cbind(df, NA)
cost1 <- function(actual, predicted) {
weight1 = 5
weight0 = 1
c1 = (actual == 1) & (predicted < cutoff) #logical vector - true if actual bad but predicted good
c0 = (actual == 0) & (predicted > cutoff) #logical vecotr - true if actual good but predicted bad
return(mean(weight1 * c1 + weight0 * c0))
}
for (i in 1:length(df)) {
cutoff <- result[i, 1]
result[i, 2] <- cost1(train$response, predict(full.glm, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

## We can see that the least cost is when coutoff probability is around 0.2
## We get the exact cutoff probability
result[which(result[,2]==min(result[,2])),1]
## df
## 0.14
## we get 0.14 as the cut off probability where the cost is minimum
## As we can see, many of the variables are not significant for the model.
glm.step.aic <- step(full.glm) ## Uses AIC by default
## Start: AIC=823.89
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 726.98 818.98
## - property 3 729.35 821.35
## - present_resid 1 725.89 821.89
## - n_people 1 726.05 822.05
## - sex 3 730.53 822.53
## - other_debtor 2 729.45 823.45
## <none> 725.89 823.89
## - other_install 2 730.02 824.02
## - telephone 1 728.15 824.15
## - present_emp 4 734.30 824.30
## - age 1 728.32 824.32
## - n_credits 1 728.94 824.94
## - housing 2 731.54 825.54
## - amount 1 730.81 826.81
## - duration 1 732.06 828.06
## - foreign 1 733.25 829.25
## - purpose 9 753.24 833.24
## - saving_acct 4 743.25 833.25
## - installment_rate 1 738.67 834.67
## - credit_his 4 745.86 835.86
## - chk_acct 3 771.47 863.47
##
## Step: AIC=818.98
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - property 3 730.51 816.51
## - present_resid 1 726.98 816.98
## - n_people 1 727.12 817.12
## - sex 3 731.78 817.78
## - other_debtor 2 730.47 818.47
## - present_emp 4 734.58 818.58
## <none> 726.98 818.98
## - other_install 2 731.05 819.05
## - telephone 1 729.30 819.30
## - age 1 729.62 819.62
## - n_credits 1 729.82 819.82
## - housing 2 732.45 820.45
## - amount 1 732.10 822.10
## - duration 1 733.47 823.47
## - foreign 1 734.39 824.39
## - purpose 9 753.83 827.83
## - saving_acct 4 745.28 829.28
## - installment_rate 1 740.55 830.55
## - credit_his 4 746.92 830.92
## - chk_acct 3 771.94 857.94
##
## Step: AIC=816.51
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + age + other_install + housing + n_credits +
## n_people + telephone + foreign
##
## Df Deviance AIC
## - present_resid 1 730.51 814.51
## - n_people 1 730.61 814.61
## - sex 3 735.18 815.18
## - present_emp 4 738.18 816.18
## - telephone 1 732.23 816.23
## <none> 730.51 816.51
## - other_debtor 2 734.71 816.71
## - n_credits 1 733.02 817.02
## - other_install 2 735.12 817.12
## - age 1 733.13 817.13
## - housing 2 736.19 818.19
## - amount 1 736.24 820.24
## - duration 1 737.93 821.93
## - foreign 1 738.28 822.28
## - saving_acct 4 748.02 826.02
## - purpose 9 758.03 826.03
## - credit_his 4 750.43 828.43
## - installment_rate 1 745.26 829.26
## - chk_acct 3 777.21 857.21
##
## Step: AIC=814.51
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## age + other_install + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - n_people 1 730.61 812.61
## - sex 3 735.18 813.18
## - telephone 1 732.24 814.24
## - present_emp 4 738.29 814.29
## <none> 730.51 814.51
## - other_debtor 2 734.71 814.71
## - n_credits 1 733.03 815.03
## - other_install 2 735.13 815.13
## - age 1 733.17 815.17
## - housing 2 736.59 816.59
## - amount 1 736.26 818.26
## - duration 1 737.97 819.97
## - foreign 1 738.33 820.33
## - saving_acct 4 748.04 824.04
## - purpose 9 758.20 824.20
## - credit_his 4 750.45 826.45
## - installment_rate 1 745.28 827.28
## - chk_acct 3 777.35 855.35
##
## Step: AIC=812.61
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## age + other_install + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - sex 3 735.21 811.21
## - present_emp 4 738.38 812.38
## - telephone 1 732.38 812.38
## <none> 730.61 812.61
## - other_debtor 2 734.77 812.77
## - age 1 733.23 813.23
## - n_credits 1 733.24 813.24
## - other_install 2 735.28 813.28
## - housing 2 736.75 814.75
## - amount 1 736.30 816.30
## - duration 1 738.06 818.06
## - foreign 1 738.37 818.37
## - saving_acct 4 748.08 822.08
## - purpose 9 758.32 822.32
## - credit_his 4 750.96 824.96
## - installment_rate 1 745.28 825.28
## - chk_acct 3 777.51 853.51
##
## Step: AIC=811.21
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + other_debtor +
## age + other_install + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - telephone 1 736.93 810.93
## <none> 735.21 811.21
## - other_debtor 2 739.22 811.22
## - other_install 2 739.26 811.26
## - age 1 737.66 811.66
## - n_credits 1 737.67 811.67
## - present_emp 4 745.03 813.03
## - housing 2 742.21 814.21
## - amount 1 740.27 814.27
## - duration 1 742.97 816.97
## - foreign 1 743.64 817.64
## - purpose 9 762.60 820.60
## - saving_acct 4 752.84 820.84
## - installment_rate 1 747.91 821.91
## - credit_his 4 755.60 823.60
## - chk_acct 3 782.79 852.79
##
## Step: AIC=810.93
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + other_debtor +
## age + other_install + housing + n_credits + foreign
##
## Df Deviance AIC
## <none> 736.93 810.93
## - other_install 2 740.95 810.95
## - other_debtor 2 741.00 811.00
## - n_credits 1 739.17 811.17
## - age 1 739.84 811.84
## - amount 1 740.91 812.91
## - present_emp 4 747.06 813.06
## - housing 2 743.68 813.68
## - foreign 1 744.86 816.86
## - duration 1 745.67 817.67
## - saving_acct 4 754.85 820.85
## - installment_rate 1 748.90 820.90
## - purpose 9 765.16 821.16
## - credit_his 4 757.49 823.49
## - chk_acct 3 785.29 853.29
summary(glm.step.aic)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## amount + saving_acct + present_emp + installment_rate + other_debtor +
## age + other_install + housing + n_credits + foreign, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4331 -0.7149 -0.4066 0.7187 2.5207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.497e-01 9.271e-01 1.024 0.305642
## chk_acctA12 -4.171e-01 2.402e-01 -1.737 0.082455 .
## chk_acctA13 -9.397e-01 4.058e-01 -2.316 0.020574 *
## chk_acctA14 -1.613e+00 2.520e-01 -6.401 1.54e-10 ***
## duration 2.888e-02 9.826e-03 2.939 0.003296 **
## credit_hisA31 1.128e-01 5.948e-01 0.190 0.849591
## credit_hisA32 -5.647e-01 4.598e-01 -1.228 0.219438
## credit_hisA33 -7.149e-01 5.040e-01 -1.419 0.156011
## credit_hisA34 -1.528e+00 4.772e-01 -3.202 0.001364 **
## purposeA41 -1.452e+00 3.981e-01 -3.647 0.000265 ***
## purposeA410 -1.921e+00 8.509e-01 -2.258 0.023945 *
## purposeA42 -8.158e-01 2.884e-01 -2.829 0.004669 **
## purposeA43 -8.291e-01 2.689e-01 -3.083 0.002049 **
## purposeA44 -7.109e-01 8.066e-01 -0.881 0.378095
## purposeA45 -8.127e-02 5.648e-01 -0.144 0.885576
## purposeA46 3.488e-01 4.640e-01 0.752 0.452147
## purposeA48 -1.305e+00 1.158e+00 -1.127 0.259634
## purposeA49 -6.593e-01 3.674e-01 -1.794 0.072749 .
## amount 9.599e-05 4.821e-05 1.991 0.046471 *
## saving_acctA62 -2.908e-01 3.120e-01 -0.932 0.351353
## saving_acctA63 -1.729e-01 4.358e-01 -0.397 0.691633
## saving_acctA64 -1.044e+00 5.286e-01 -1.975 0.048289 *
## saving_acctA65 -1.098e+00 2.989e-01 -3.672 0.000241 ***
## present_empA72 -8.005e-02 4.279e-01 -0.187 0.851600
## present_empA73 -3.230e-01 4.048e-01 -0.798 0.424979
## present_empA74 -1.027e+00 4.559e-01 -2.253 0.024287 *
## present_empA75 -4.380e-01 4.120e-01 -1.063 0.287705
## installment_rate 3.272e-01 9.633e-02 3.397 0.000681 ***
## other_debtorA102 6.552e-01 4.337e-01 1.511 0.130886
## other_debtorA103 -5.219e-01 4.350e-01 -1.200 0.230182
## age -1.635e-02 9.687e-03 -1.688 0.091398 .
## other_installA142 -2.098e-01 4.475e-01 -0.469 0.639218
## other_installA143 -5.057e-01 2.567e-01 -1.970 0.048859 *
## housingA152 -6.292e-01 2.460e-01 -2.558 0.010528 *
## housingA153 -3.698e-01 3.776e-01 -0.979 0.327447
## n_credits 3.049e-01 2.047e-01 1.489 0.136513
## foreignA202 -1.901e+00 8.054e-01 -2.360 0.018283 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 977.38 on 799 degrees of freedom
## Residual deviance: 736.93 on 763 degrees of freedom
## AIC: 810.93
##
## Number of Fisher Scoring iterations: 5
## Step glm suggestd following model-
# glm(formula = response ~ chk_acct + duration + credit_his + purpose +
# amount + saving_acct + present_emp + installment_rate + other_debtor +
# age + other_install + housing + n_credits + foreign, family = binomial,
# data = train)
## Use stepwise model selection with BIC as cost function
glm.step.bic <- step(full.glm, k = log(nrow(train)))
## Start: AIC=1053.44
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - purpose 9 753.24 1020.6
## - job 3 726.98 1034.5
## - present_emp 4 734.30 1035.1
## - property 3 729.35 1036.8
## - sex 3 730.53 1038.0
## - other_debtor 2 729.45 1043.6
## - saving_acct 4 743.25 1044.1
## - other_install 2 730.02 1044.2
## - housing 2 731.54 1045.7
## - credit_his 4 745.86 1046.7
## - present_resid 1 725.89 1046.8
## - n_people 1 726.05 1046.9
## - telephone 1 728.15 1049.0
## - age 1 728.32 1049.2
## - n_credits 1 728.94 1049.8
## - amount 1 730.81 1051.7
## - duration 1 732.06 1052.9
## <none> 725.89 1053.4
## - foreign 1 733.25 1054.1
## - installment_rate 1 738.67 1059.5
## - chk_acct 3 771.47 1079.0
##
## Step: AIC=1020.62
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## present_emp + installment_rate + sex + other_debtor + present_resid +
## property + age + other_install + housing + n_credits + job +
## n_people + telephone + foreign
##
## Df Deviance AIC
## - present_emp 4 760.17 1000.8
## - job 3 753.83 1001.2
## - sex 3 757.49 1004.8
## - property 3 757.54 1004.9
## - saving_acct 4 768.93 1009.6
## - other_install 2 755.96 1010.0
## - other_debtor 2 756.83 1010.9
## - housing 2 759.24 1013.2
## - n_people 1 753.41 1014.1
## - present_resid 1 753.46 1014.2
## - credit_his 4 773.53 1014.2
## - age 1 754.27 1015.0
## - telephone 1 755.73 1016.4
## - amount 1 755.78 1016.5
## - n_credits 1 757.35 1018.0
## - duration 1 759.05 1019.8
## <none> 753.24 1020.6
## - foreign 1 760.51 1021.2
## - installment_rate 1 766.24 1026.9
## - chk_acct 3 802.27 1049.6
##
## Step: AIC=1000.82
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## installment_rate + sex + other_debtor + present_resid + property +
## age + other_install + housing + n_credits + job + n_people +
## telephone + foreign
##
## Df Deviance AIC
## - job 3 760.25 980.84
## - property 3 764.42 985.02
## - sex 3 766.52 987.11
## - other_install 2 762.84 990.11
## - saving_acct 4 776.49 990.40
## - other_debtor 2 764.25 991.53
## - housing 2 765.53 992.81
## - n_people 1 760.31 994.27
## - credit_his 4 780.44 994.34
## - present_resid 1 760.87 994.83
## - age 1 761.52 995.48
## - amount 1 762.98 996.94
## - telephone 1 763.00 996.96
## - n_credits 1 763.71 997.67
## - duration 1 765.15 999.11
## <none> 760.17 1000.82
## - foreign 1 766.98 1000.95
## - installment_rate 1 773.66 1007.62
## - chk_acct 3 810.73 1031.32
##
## Step: AIC=980.84
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## installment_rate + sex + other_debtor + present_resid + property +
## age + other_install + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - property 3 764.44 964.98
## - sex 3 766.59 967.13
## - other_install 2 762.94 970.17
## - saving_acct 4 776.96 970.82
## - other_debtor 2 764.27 971.50
## - housing 2 765.58 972.81
## - n_people 1 760.40 974.31
## - credit_his 4 780.51 974.36
## - present_resid 1 760.90 974.81
## - age 1 761.68 975.59
## - amount 1 763.11 977.02
## - telephone 1 763.73 977.64
## - n_credits 1 763.74 977.64
## - duration 1 765.35 979.26
## <none> 760.25 980.84
## - foreign 1 767.05 980.96
## - installment_rate 1 773.89 987.80
## - chk_acct 3 811.14 1011.67
##
## Step: AIC=964.98
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## installment_rate + sex + other_debtor + present_resid + age +
## other_install + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - sex 3 770.86 951.34
## - saving_acct 4 780.53 954.33
## - other_install 2 767.70 954.87
## - other_debtor 2 769.50 956.67
## - credit_his 4 783.93 957.73
## - housing 2 770.74 957.90
## - n_people 1 764.54 958.40
## - present_resid 1 765.08 958.94
## - age 1 765.85 959.70
## - telephone 1 767.25 961.10
## - n_credits 1 767.52 961.38
## - amount 1 767.86 961.71
## - duration 1 770.22 964.07
## <none> 764.44 964.98
## - foreign 1 771.59 965.45
## - installment_rate 1 779.33 973.18
## - chk_acct 3 818.23 998.72
##
## Step: AIC=951.34
## response ~ chk_acct + duration + credit_his + amount + saving_acct +
## installment_rate + other_debtor + present_resid + age + other_install +
## housing + n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - saving_acct 4 786.91 940.66
## - other_install 2 773.66 940.77
## - other_debtor 2 775.76 942.88
## - n_people 1 770.91 944.71
## - credit_his 4 791.45 945.19
## - present_resid 1 771.69 945.49
## - housing 2 778.54 945.65
## - age 1 772.21 946.01
## - telephone 1 773.71 947.51
## - n_credits 1 773.72 947.52
## - amount 1 773.78 947.58
## - duration 1 776.70 950.50
## <none> 770.86 951.34
## - foreign 1 778.65 952.45
## - installment_rate 1 783.10 956.90
## - chk_acct 3 825.61 986.04
##
## Step: AIC=940.66
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## other_debtor + present_resid + age + other_install + housing +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - other_install 2 788.90 929.28
## - other_debtor 2 791.42 931.79
## - n_people 1 787.02 934.08
## - housing 2 793.94 934.32
## - present_resid 1 788.13 935.19
## - credit_his 4 808.32 935.33
## - age 1 788.73 935.79
## - amount 1 789.21 936.27
## - n_credits 1 789.69 936.75
## - telephone 1 790.18 937.24
## - duration 1 792.63 939.70
## <none> 786.91 940.66
## - foreign 1 795.75 942.81
## - installment_rate 1 797.70 944.76
## - chk_acct 3 856.80 990.50
##
## Step: AIC=929.28
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## other_debtor + present_resid + age + housing + n_credits +
## n_people + telephone + foreign
##
## Df Deviance AIC
## - other_debtor 2 793.04 920.05
## - housing 2 795.43 922.44
## - n_people 1 788.96 922.65
## - present_resid 1 790.27 923.96
## - age 1 790.59 924.29
## - amount 1 791.17 924.87
## - telephone 1 791.97 925.66
## - n_credits 1 792.04 925.74
## - credit_his 4 813.93 927.57
## - duration 1 794.67 928.37
## <none> 788.90 929.28
## - foreign 1 797.55 931.24
## - installment_rate 1 799.75 933.44
## - chk_acct 3 858.65 978.97
##
## Step: AIC=920.05
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## present_resid + age + housing + n_credits + n_people + telephone +
## foreign
##
## Df Deviance AIC
## - n_people 1 793.15 913.47
## - housing 2 799.93 913.57
## - present_resid 1 794.47 914.80
## - age 1 794.76 915.09
## - amount 1 796.19 916.51
## - telephone 1 796.24 916.56
## - n_credits 1 796.26 916.59
## - credit_his 4 817.69 917.96
## - duration 1 798.34 918.67
## <none> 793.04 920.05
## - foreign 1 802.24 922.57
## - installment_rate 1 804.35 924.68
## - chk_acct 3 862.90 969.86
##
## Step: AIC=913.47
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## present_resid + age + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - housing 2 800.01 906.96
## - present_resid 1 794.59 908.23
## - age 1 794.94 908.58
## - n_credits 1 796.29 909.93
## - amount 1 796.31 909.95
## - telephone 1 796.31 909.95
## - credit_his 4 817.71 911.29
## - duration 1 798.46 912.09
## <none> 793.15 913.47
## - foreign 1 802.42 916.06
## - installment_rate 1 804.61 918.25
## - chk_acct 3 863.04 963.31
##
## Step: AIC=906.96
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## present_resid + age + n_credits + telephone + foreign
##
## Df Deviance AIC
## - present_resid 1 800.19 900.46
## - age 1 802.78 903.05
## - telephone 1 802.84 903.11
## - n_credits 1 803.06 903.33
## - amount 1 803.96 904.23
## - duration 1 804.83 905.10
## - credit_his 4 826.62 906.83
## <none> 800.01 906.96
## - foreign 1 808.84 909.11
## - installment_rate 1 810.96 911.23
## - chk_acct 3 874.22 961.12
##
## Step: AIC=900.46
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## age + n_credits + telephone + foreign
##
## Df Deviance AIC
## - telephone 1 803.09 896.67
## - n_credits 1 803.16 896.74
## - age 1 803.48 897.07
## - amount 1 804.17 897.76
## - duration 1 804.92 898.51
## <none> 800.19 900.46
## - credit_his 4 826.98 900.51
## - foreign 1 808.92 902.51
## - installment_rate 1 811.04 904.62
## - chk_acct 3 874.23 954.44
##
## Step: AIC=896.67
## response ~ chk_acct + duration + credit_his + amount + installment_rate +
## age + n_credits + foreign
##
## Df Deviance AIC
## - amount 1 805.59 892.49
## - n_credits 1 805.70 892.60
## - age 1 807.27 894.17
## - duration 1 808.75 895.65
## <none> 803.09 896.67
## - credit_his 4 830.40 897.24
## - foreign 1 811.10 898.00
## - installment_rate 1 813.13 900.03
## - chk_acct 3 878.88 952.41
##
## Step: AIC=892.49
## response ~ chk_acct + duration + credit_his + installment_rate +
## age + n_credits + foreign
##
## Df Deviance AIC
## - n_credits 1 808.25 888.46
## - age 1 809.10 889.31
## <none> 805.59 892.49
## - foreign 1 813.20 893.42
## - installment_rate 1 813.26 893.48
## - credit_his 4 833.69 893.85
## - duration 1 824.43 904.64
## - chk_acct 3 880.67 947.51
##
## Step: AIC=888.46
## response ~ chk_acct + duration + credit_his + installment_rate +
## age + foreign
##
## Df Deviance AIC
## - age 1 811.58 885.11
## - credit_his 4 833.70 887.17
## <none> 808.25 888.46
## - installment_rate 1 815.84 889.37
## - foreign 1 816.21 889.74
## - duration 1 826.66 900.19
## - chk_acct 3 883.02 943.18
##
## Step: AIC=885.11
## response ~ chk_acct + duration + credit_his + installment_rate +
## foreign
##
## Df Deviance AIC
## <none> 811.58 885.11
## - installment_rate 1 818.77 885.61
## - foreign 1 819.03 885.87
## - credit_his 4 839.26 886.05
## - duration 1 830.18 897.03
## - chk_acct 3 886.47 939.95
summary(glm.step.bic)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + installment_rate +
## foreign, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7899 -0.8079 -0.4734 0.9077 2.3453
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.254200 0.498269 -0.510 0.60993
## chk_acctA12 -0.470836 0.210253 -2.239 0.02513 *
## chk_acctA13 -1.044010 0.384441 -2.716 0.00661 **
## chk_acctA14 -1.804941 0.228269 -7.907 2.64e-15 ***
## duration 0.030090 0.007026 4.283 1.85e-05 ***
## credit_hisA31 -0.209733 0.518289 -0.405 0.68572
## credit_hisA32 -1.022521 0.400471 -2.553 0.01067 *
## credit_hisA33 -0.911731 0.465469 -1.959 0.05014 .
## credit_hisA34 -1.727040 0.429060 -4.025 5.69e-05 ***
## installment_rate 0.216842 0.081834 2.650 0.00805 **
## foreignA202 -1.741409 0.772964 -2.253 0.02427 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 977.38 on 799 degrees of freedom
## Residual deviance: 811.58 on 789 degrees of freedom
## AIC: 833.58
##
## Number of Fisher Scoring iterations: 5
## The model selected by stepwise model selection when BIC is used as criterion-
# glm(formula = response ~ chk_acct + duration + credit_his+installment_rate +
# foreign, family = binomial,
# data = german_credit)
## Now let's build these two models
glm.aic<-glm(formula = response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + present_emp + installment_rate + other_debtor +
age + other_install + housing + n_credits + foreign, family = binomial,
data = train)
glm.bic<-glm(formula = response ~ chk_acct + duration + credit_his + installment_rate +
foreign, family = binomial, data = train)
## Comparing AIC and BIC of both models
AIC(glm.aic)
## [1] 810.9323
AIC(glm.bic)
## [1] 833.5811
BIC(glm.aic)
## [1] 984.263
BIC(glm.bic)
## [1] 885.1118
### Defining optimup cutoff probability
df = seq(0.01, 0.99, 0.01)
result = cbind(df, NA)
cost1 <- function(actual, predicted) {
weight1 = 5
weight0 = 1
c1 = (actual == 1) & (predicted < cutoff) #logical vector - true if actual bad but predicted good
c0 = (actual == 0) & (predicted > cutoff) #logical vecotr - true if actual good but predicted bad
return(mean(weight1 * c1 + weight0 * c0))
}
for (i in 1:length(df)) {
cutoff <- result[i, 1]
result[i, 2] <- cost1(train$response, predict(glm.aic, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

## We can see that the least cost is when coutoff probability is around 0.2
## We get the exact cutoff probability
result[which(result[,2]==min(result[,2])),1]
## df
## 0.16
# Both criterion suggest different models
table(predict(full.glm, type = "response") > 0.14)
##
## FALSE TRUE
## 285 515
table(predict(glm.aic, type = "response") > 0.14)
##
## FALSE TRUE
## 286 514
table(predict(glm.bic, type = "response") > 0.14)
##
## FALSE TRUE
## 238 562
## 0.4 seems to be a better threshold for the analysis, however, the cost is higher for this cutoff probability
table(predict(full.glm, type = "response") > 0.4)
##
## FALSE TRUE
## 547 253
table(predict(glm.aic, type = "response") > 0.4)
##
## FALSE TRUE
## 550 250
table(predict(glm.bic, type = "response") > 0.4)
##
## FALSE TRUE
## 554 246
## Comparing insample performance of all the models
## Comparing all models
full.insample <- predict(full.glm, train, type = "response")
aic.insample <- predict(glm.aic, train, type = "response")
bic.insample <- predict(glm.bic, train, type = "response")
roc.plot(x = train$response == "1", pred = cbind(full.insample,aic.insample,bic.insample),
legend = TRUE, leg.text = c("Full Model", "AIC","BIC"))$roc.vol

## Model Area p.value binorm.area
## 1 Model 1 0.8282440 2.107096e-49 NA
## 2 Model 2 0.8216964 1.559410e-47 NA
## 3 Model 3 0.7707366 2.916515e-34 NA
### We can see that AIC based model and the full model have higher AUC, we will go ahead with AIC based model
## As it is less complex compared to full model
## Insample misclassification table
prob.insample <- predict(glm.aic, type = "response")
predicted.insample <- prob.insample > 0.16
predicted.insample <- as.numeric(predicted.insample)
## Confusion matrix,
confusionMatrix(predicted.insample, train$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 288 20
## 1 272 220
##
## Accuracy : 0.635
## 95% CI : (0.6006, 0.6684)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3315
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5143
## Specificity : 0.9167
## Pos Pred Value : 0.9351
## Neg Pred Value : 0.4472
## Prevalence : 0.7000
## Detection Rate : 0.3600
## Detection Prevalence : 0.3850
## Balanced Accuracy : 0.7155
##
## 'Positive' Class : 0
##
# Error Rate-
mean(ifelse(train$response != predicted.insample, 1, 0))
## [1] 0.365
## Accuracy is 63% and the error rate is .36
## out of sample prediction
prob.outsample<-predict(glm.aic,test,type = "response")
predicted.outsample<-ifelse(prob.outsample<0.16,0,1)
## Confusion matrix,
confusionMatrix(predicted.outsample, test$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 78 7
## 1 62 53
##
## Accuracy : 0.655
## 95% CI : (0.5847, 0.7206)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.9272
##
## Kappa : 0.3491
## Mcnemar's Test P-Value : 7.987e-11
##
## Sensitivity : 0.5571
## Specificity : 0.8833
## Pos Pred Value : 0.9176
## Neg Pred Value : 0.4609
## Prevalence : 0.7000
## Detection Rate : 0.3900
## Detection Prevalence : 0.4250
## Balanced Accuracy : 0.7202
##
## 'Positive' Class : 0
##
# out sample Error Rate-
mean(ifelse(test$response != predicted.outsample, 1, 0))
## [1] 0.345
## Out of sample accuracy is 60% and error rate is 0.39
## ROC
roc.plot(test$response == "1", prob.outsample)
roc.plot(test$response == "1", prob.outsample)$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8310714 6.188138e-14 NA
## AUC IS 0.83
## Comparing all models for out sample
aic.outsample <- predict(glm.aic, test, type = "response")
roc.plot(x = test$response == "1", pred = cbind(aic.outsample),
legend = TRUE, leg.text = c("AIC"))$roc.vol

## Model Area p.value binorm.area
## 1 Model 1 0.8310714 6.188138e-14 NA
## As we can see, AUC is best for Full and AIC based model