library(ggplot2)
library(data.table)
library(gridExtra)
library(glmnet)
library(caret)
library("verification")
german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")

colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose", 
                            "amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor", 
                            "present_resid", "property", "age", "other_install", "housing", "n_credits", 
                            "job", "n_people", "telephone", "foreign", "response")

# orginal response coding 1= good, 2 = bad we need 0 = good, 1 = bad
german_credit$response = german_credit$response - 1


## For building the model, we need to divide the dataset in test and train
set.seed(12428258)
flag<-sample(nrow(german_credit),0.8*nrow(german_credit),replace = FALSE)
train<-german_credit[flag,]
test<-german_credit[-flag,]
dim(train)
## [1] 800  21
dim(test)
## [1] 200  21
#Exploratory Data Analysis
str(train)
## 'data.frame':    800 obs. of  21 variables:
##  $ chk_acct        : Factor w/ 4 levels "A11","A12","A13",..: 4 1 2 4 1 3 2 2 2 1 ...
##  $ duration        : int  7 36 18 6 18 12 12 10 9 24 ...
##  $ credit_his      : Factor w/ 5 levels "A30","A31","A32",..: 5 5 4 3 3 3 3 3 3 4 ...
##  $ purpose         : Factor w/ 10 levels "A40","A41","A410",..: 5 4 1 5 2 5 4 1 5 1 ...
##  $ amount          : int  730 2348 2899 1346 7511 1881 983 7308 1206 1333 ...
##  $ saving_acct     : Factor w/ 5 levels "A61","A62","A63",..: 5 1 5 2 5 1 4 1 1 1 ...
##  $ present_emp     : Factor w/ 5 levels "A71","A72","A73",..: 5 3 5 5 5 3 2 1 5 1 ...
##  $ installment_rate: int  4 3 4 2 1 2 1 2 4 4 ...
##  $ sex             : Factor w/ 4 levels "A91","A92","A93",..: 3 4 3 3 3 2 2 3 2 3 ...
##  $ other_debtor    : Factor w/ 3 levels "A101","A102",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ present_resid   : int  2 2 4 4 4 2 4 4 4 2 ...
##  $ property        : Factor w/ 4 levels "A121","A122",..: 2 2 3 4 2 3 1 4 1 1 ...
##  $ age             : int  46 46 43 42 51 44 19 70 25 43 ...
##  $ other_install   : Factor w/ 3 levels "A141","A142",..: 3 3 3 1 3 3 3 1 3 3 ...
##  $ housing         : Factor w/ 3 levels "A151","A152",..: 1 2 2 3 3 1 1 3 2 3 ...
##  $ n_credits       : int  2 2 1 1 1 1 1 1 1 2 ...
##  $ job             : Factor w/ 4 levels "A171","A172",..: 2 3 3 3 3 2 2 4 3 3 ...
##  $ n_people        : int  1 1 2 2 2 1 1 1 1 2 ...
##  $ telephone       : Factor w/ 2 levels "A191","A192": 2 2 1 2 2 2 1 2 1 1 ...
##  $ foreign         : Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
##  $ response        : num  0 0 0 0 1 0 0 0 0 1 ...
table(train$response)
## 
##   0   1 
## 560 240
summary(train)
##  chk_acct     duration     credit_his    purpose        amount     
##  A11:217   Min.   : 4.00   A30: 35    A43    :232   Min.   :  250  
##  A12:215   1st Qu.:12.00   A31: 40    A40    :181   1st Qu.: 1382  
##  A13: 51   Median :18.00   A32:422    A42    :151   Median : 2320  
##  A14:317   Mean   :20.86   A33: 73    A41    : 81   Mean   : 3229  
##            3rd Qu.:24.00   A34:230    A49    : 72   3rd Qu.: 3960  
##            Max.   :72.00              A46    : 36   Max.   :18424  
##                                       (Other): 47                  
##  saving_acct present_emp installment_rate  sex      other_debtor
##  A61:487     A71: 49     Min.   :1.000    A91: 45   A101:723    
##  A62: 81     A72:151     1st Qu.:2.000    A92:247   A102: 35    
##  A63: 44     A73:270     Median :3.000    A93:432   A103: 42    
##  A64: 42     A74:126     Mean   :2.966    A94: 76               
##  A65:146     A75:204     3rd Qu.:4.000                          
##                          Max.   :4.000                          
##                                                                 
##  present_resid   property        age        other_install housing   
##  Min.   :1.000   A121:230   Min.   :19.00   A141:119      A151:147  
##  1st Qu.:2.000   A122:179   1st Qu.:27.00   A142: 40      A152:562  
##  Median :3.000   A123:266   Median :33.00   A143:641      A153: 91  
##  Mean   :2.855   A124:125   Mean   :35.49                           
##  3rd Qu.:4.000              3rd Qu.:42.00                           
##  Max.   :4.000              Max.   :75.00                           
##                                                                     
##    n_credits       job         n_people     telephone  foreign   
##  Min.   :1.000   A171: 17   Min.   :1.000   A191:477   A201:771  
##  1st Qu.:1.000   A172:156   1st Qu.:1.000   A192:323   A202: 29  
##  Median :1.000   A173:511   Median :1.000                        
##  Mean   :1.403   A174:116   Mean   :1.151                        
##  3rd Qu.:2.000              3rd Qu.:1.000                        
##  Max.   :4.000              Max.   :2.000                        
##                                                                  
##     response  
##  Min.   :0.0  
##  1st Qu.:0.0  
##  Median :0.0  
##  Mean   :0.3  
##  3rd Qu.:1.0  
##  Max.   :1.0  
## 
## Check for categorical variables
# indices of categorical variables-
cat<-c(1,3,4,6,7,9,10,12,14,15,17,19,20)

## Continueous variables
ggplot(data = train,aes(x =factor(response),y = duration))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = amount))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = installment_rate))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = present_resid))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = age))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = n_credits))+geom_boxplot()

ggplot(data = train,aes(x =factor(response),y = n_people))+geom_boxplot()

### Duration and amount seem to have some impact of reponse
cor(train[,-cat])[,8]
##         duration           amount installment_rate    present_resid 
##      0.206373426      0.130613716      0.099433367     -0.005429762 
##              age        n_credits         n_people         response 
##     -0.086874187     -0.031677233     -0.002283937      1.000000000
## Categorical Variables

ggplot(data = train, aes(x = train[,1], fill = factor(response))) +
    geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[1],y='Response',title=colnames(train)[1])

ggplot(data = train, aes(x = train[,3], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[3],y='Response',title=colnames(train)[3])

ggplot(data = train, aes(x = train[,4], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[4],y='Response',title=colnames(train)[4])

ggplot(data = train, aes(x = train[,6], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[6],y='Response',title=colnames(train)[6])

ggplot(data = train, aes(x = train[,7], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[7],y='Response',title=colnames(train)[7])

ggplot(data = train, aes(x = train[,9], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[9],y='Response',title=colnames(train)[9])

ggplot(data = train, aes(x = train[,10], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[10],y='Response',title=colnames(train)[10])

ggplot(data = train, aes(x = train[,12], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[12],y='Response',title=colnames(train)[12])

ggplot(data = train, aes(x = train[,14], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[14],y='Response',title=colnames(train)[14])

ggplot(data = train, aes(x = train[,15], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[15],y='Response',title=colnames(train)[15])

ggplot(data = train, aes(x = train[,17], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[17],y='Response',title=colnames(train)[17])

ggplot(data = train, aes(x = train[,19], fill = factor(response))) +
  geom_bar(stat='count', position='dodge')+labs(x=colnames(train)[19],y='Response',title=colnames(train)[19])

## Modeling
full.glm<-glm(data = train,response~.,family = binomial)
summary(full.glm)
## 
## Call:
## glm(formula = response ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4426  -0.7121  -0.3865   0.6821   2.5726  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        5.797e-01  1.235e+00   0.469 0.638741    
## chk_acctA12       -3.921e-01  2.464e-01  -1.591 0.111591    
## chk_acctA13       -9.260e-01  4.183e-01  -2.214 0.026832 *  
## chk_acctA14       -1.582e+00  2.554e-01  -6.194 5.87e-10 ***
## duration           2.535e-02  1.026e-02   2.471 0.013477 *  
## credit_hisA31      1.183e-01  6.056e-01   0.195 0.845138    
## credit_hisA32     -5.498e-01  4.686e-01  -1.173 0.240680    
## credit_hisA33     -7.275e-01  5.145e-01  -1.414 0.157361    
## credit_hisA34     -1.532e+00  4.860e-01  -3.151 0.001627 ** 
## purposeA41        -1.513e+00  4.104e-01  -3.687 0.000227 ***
## purposeA410       -1.888e+00  9.042e-01  -2.088 0.036777 *  
## purposeA42        -9.324e-01  2.969e-01  -3.141 0.001686 ** 
## purposeA43        -8.202e-01  2.741e-01  -2.992 0.002770 ** 
## purposeA44        -7.790e-01  8.319e-01  -0.936 0.349093    
## purposeA45        -1.273e-01  5.739e-01  -0.222 0.824485    
## purposeA46         2.013e-01  4.719e-01   0.427 0.669678    
## purposeA48        -1.412e+00  1.190e+00  -1.186 0.235624    
## purposeA49        -6.649e-01  3.792e-01  -1.754 0.079491 .  
## amount             1.167e-04  5.271e-05   2.214 0.026806 *  
## saving_acctA62    -3.959e-01  3.235e-01  -1.224 0.221041    
## saving_acctA63    -1.282e-01  4.345e-01  -0.295 0.767955    
## saving_acctA64    -1.121e+00  5.410e-01  -2.073 0.038179 *  
## saving_acctA65    -1.089e+00  3.073e-01  -3.543 0.000395 ***
## present_empA72    -3.428e-01  4.798e-01  -0.714 0.474967    
## present_empA73    -5.089e-01  4.635e-01  -1.098 0.272239    
## present_empA74    -1.168e+00  5.098e-01  -2.291 0.021962 *  
## present_empA75    -6.572e-01  4.642e-01  -1.416 0.156832    
## installment_rate   3.530e-01  1.006e-01   3.507 0.000453 ***
## sexA92            -4.528e-01  4.173e-01  -1.085 0.277808    
## sexA93            -7.762e-01  4.070e-01  -1.907 0.056519 .  
## sexA94            -4.579e-01  4.879e-01  -0.938 0.347988    
## other_debtorA102   6.212e-01  4.408e-01   1.409 0.158729    
## other_debtorA103  -5.070e-01  4.519e-01  -1.122 0.261905    
## present_resid      1.770e-03  9.816e-02   0.018 0.985618    
## propertyA122       3.224e-01  2.851e-01   1.131 0.258154    
## propertyA123       1.953e-01  2.613e-01   0.747 0.454902    
## propertyA124       8.337e-01  4.716e-01   1.768 0.077081 .  
## age               -1.553e-02  1.006e-02  -1.543 0.122857    
## other_installA142 -1.533e-01  4.505e-01  -0.340 0.733700    
## other_installA143 -5.163e-01  2.632e-01  -1.962 0.049811 *  
## housingA152       -5.886e-01  2.604e-01  -2.260 0.023792 *  
## housingA153       -7.953e-01  5.310e-01  -1.498 0.134160    
## n_credits          3.689e-01  2.122e-01   1.738 0.082160 .  
## jobA172            6.941e-01  8.040e-01   0.863 0.387964    
## jobA173            7.745e-01  7.743e-01   1.000 0.317163    
## jobA174            7.045e-01  7.802e-01   0.903 0.366569    
## n_people           1.107e-01  2.827e-01   0.392 0.695302    
## telephoneA192     -3.339e-01  2.232e-01  -1.496 0.134747    
## foreignA202       -1.919e+00  8.317e-01  -2.307 0.021027 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 725.89  on 751  degrees of freedom
## AIC: 823.89
## 
## Number of Fisher Scoring iterations: 5
AIC(full.glm)
## [1] 823.8938
summary(full.glm$fitted.values)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001006 0.089539 0.224454 0.300000 0.474623 0.954198
### Defining optimup cutoff probability

df = seq(0.01, 0.99, 0.01)
result = cbind(df, NA)
cost1 <- function(actual, predicted) {
  weight1 = 5
  weight0 = 1
  c1 = (actual == 1) & (predicted < cutoff)  #logical vector - true if actual bad but predicted good
  c0 = (actual == 0) & (predicted > cutoff)  #logical vecotr - true if actual good but predicted bad
  return(mean(weight1 * c1 + weight0 * c0))
}
for (i in 1:length(df)) {
  cutoff <- result[i, 1]
  result[i, 2] <- cost1(train$response, predict(full.glm, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

## We can see that the least cost is when coutoff probability is around 0.2 
## We get the exact cutoff probability
result[which(result[,2]==min(result[,2])),1]
##   df 
## 0.14
## we get 0.14 as the cut off probability where the cost is minimum

## As we can see, many of the variables are not significant for the model.
glm.step.aic <- step(full.glm) ## Uses AIC by default
## Start:  AIC=823.89
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - job               3   726.98 818.98
## - property          3   729.35 821.35
## - present_resid     1   725.89 821.89
## - n_people          1   726.05 822.05
## - sex               3   730.53 822.53
## - other_debtor      2   729.45 823.45
## <none>                  725.89 823.89
## - other_install     2   730.02 824.02
## - telephone         1   728.15 824.15
## - present_emp       4   734.30 824.30
## - age               1   728.32 824.32
## - n_credits         1   728.94 824.94
## - housing           2   731.54 825.54
## - amount            1   730.81 826.81
## - duration          1   732.06 828.06
## - foreign           1   733.25 829.25
## - purpose           9   753.24 833.24
## - saving_acct       4   743.25 833.25
## - installment_rate  1   738.67 834.67
## - credit_his        4   745.86 835.86
## - chk_acct          3   771.47 863.47
## 
## Step:  AIC=818.98
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - property          3   730.51 816.51
## - present_resid     1   726.98 816.98
## - n_people          1   727.12 817.12
## - sex               3   731.78 817.78
## - other_debtor      2   730.47 818.47
## - present_emp       4   734.58 818.58
## <none>                  726.98 818.98
## - other_install     2   731.05 819.05
## - telephone         1   729.30 819.30
## - age               1   729.62 819.62
## - n_credits         1   729.82 819.82
## - housing           2   732.45 820.45
## - amount            1   732.10 822.10
## - duration          1   733.47 823.47
## - foreign           1   734.39 824.39
## - purpose           9   753.83 827.83
## - saving_acct       4   745.28 829.28
## - installment_rate  1   740.55 830.55
## - credit_his        4   746.92 830.92
## - chk_acct          3   771.94 857.94
## 
## Step:  AIC=816.51
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + age + other_install + housing + n_credits + 
##     n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_resid     1   730.51 814.51
## - n_people          1   730.61 814.61
## - sex               3   735.18 815.18
## - present_emp       4   738.18 816.18
## - telephone         1   732.23 816.23
## <none>                  730.51 816.51
## - other_debtor      2   734.71 816.71
## - n_credits         1   733.02 817.02
## - other_install     2   735.12 817.12
## - age               1   733.13 817.13
## - housing           2   736.19 818.19
## - amount            1   736.24 820.24
## - duration          1   737.93 821.93
## - foreign           1   738.28 822.28
## - saving_acct       4   748.02 826.02
## - purpose           9   758.03 826.03
## - credit_his        4   750.43 828.43
## - installment_rate  1   745.26 829.26
## - chk_acct          3   777.21 857.21
## 
## Step:  AIC=814.51
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     age + other_install + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - n_people          1   730.61 812.61
## - sex               3   735.18 813.18
## - telephone         1   732.24 814.24
## - present_emp       4   738.29 814.29
## <none>                  730.51 814.51
## - other_debtor      2   734.71 814.71
## - n_credits         1   733.03 815.03
## - other_install     2   735.13 815.13
## - age               1   733.17 815.17
## - housing           2   736.59 816.59
## - amount            1   736.26 818.26
## - duration          1   737.97 819.97
## - foreign           1   738.33 820.33
## - saving_acct       4   748.04 824.04
## - purpose           9   758.20 824.20
## - credit_his        4   750.45 826.45
## - installment_rate  1   745.28 827.28
## - chk_acct          3   777.35 855.35
## 
## Step:  AIC=812.61
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     age + other_install + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - sex               3   735.21 811.21
## - present_emp       4   738.38 812.38
## - telephone         1   732.38 812.38
## <none>                  730.61 812.61
## - other_debtor      2   734.77 812.77
## - age               1   733.23 813.23
## - n_credits         1   733.24 813.24
## - other_install     2   735.28 813.28
## - housing           2   736.75 814.75
## - amount            1   736.30 816.30
## - duration          1   738.06 818.06
## - foreign           1   738.37 818.37
## - saving_acct       4   748.08 822.08
## - purpose           9   758.32 822.32
## - credit_his        4   750.96 824.96
## - installment_rate  1   745.28 825.28
## - chk_acct          3   777.51 853.51
## 
## Step:  AIC=811.21
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + other_debtor + 
##     age + other_install + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - telephone         1   736.93 810.93
## <none>                  735.21 811.21
## - other_debtor      2   739.22 811.22
## - other_install     2   739.26 811.26
## - age               1   737.66 811.66
## - n_credits         1   737.67 811.67
## - present_emp       4   745.03 813.03
## - housing           2   742.21 814.21
## - amount            1   740.27 814.27
## - duration          1   742.97 816.97
## - foreign           1   743.64 817.64
## - purpose           9   762.60 820.60
## - saving_acct       4   752.84 820.84
## - installment_rate  1   747.91 821.91
## - credit_his        4   755.60 823.60
## - chk_acct          3   782.79 852.79
## 
## Step:  AIC=810.93
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + other_debtor + 
##     age + other_install + housing + n_credits + foreign
## 
##                    Df Deviance    AIC
## <none>                  736.93 810.93
## - other_install     2   740.95 810.95
## - other_debtor      2   741.00 811.00
## - n_credits         1   739.17 811.17
## - age               1   739.84 811.84
## - amount            1   740.91 812.91
## - present_emp       4   747.06 813.06
## - housing           2   743.68 813.68
## - foreign           1   744.86 816.86
## - duration          1   745.67 817.67
## - saving_acct       4   754.85 820.85
## - installment_rate  1   748.90 820.90
## - purpose           9   765.16 821.16
## - credit_his        4   757.49 823.49
## - chk_acct          3   785.29 853.29
summary(glm.step.aic)
## 
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose + 
##     amount + saving_acct + present_emp + installment_rate + other_debtor + 
##     age + other_install + housing + n_credits + foreign, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4331  -0.7149  -0.4066   0.7187   2.5207  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        9.497e-01  9.271e-01   1.024 0.305642    
## chk_acctA12       -4.171e-01  2.402e-01  -1.737 0.082455 .  
## chk_acctA13       -9.397e-01  4.058e-01  -2.316 0.020574 *  
## chk_acctA14       -1.613e+00  2.520e-01  -6.401 1.54e-10 ***
## duration           2.888e-02  9.826e-03   2.939 0.003296 ** 
## credit_hisA31      1.128e-01  5.948e-01   0.190 0.849591    
## credit_hisA32     -5.647e-01  4.598e-01  -1.228 0.219438    
## credit_hisA33     -7.149e-01  5.040e-01  -1.419 0.156011    
## credit_hisA34     -1.528e+00  4.772e-01  -3.202 0.001364 ** 
## purposeA41        -1.452e+00  3.981e-01  -3.647 0.000265 ***
## purposeA410       -1.921e+00  8.509e-01  -2.258 0.023945 *  
## purposeA42        -8.158e-01  2.884e-01  -2.829 0.004669 ** 
## purposeA43        -8.291e-01  2.689e-01  -3.083 0.002049 ** 
## purposeA44        -7.109e-01  8.066e-01  -0.881 0.378095    
## purposeA45        -8.127e-02  5.648e-01  -0.144 0.885576    
## purposeA46         3.488e-01  4.640e-01   0.752 0.452147    
## purposeA48        -1.305e+00  1.158e+00  -1.127 0.259634    
## purposeA49        -6.593e-01  3.674e-01  -1.794 0.072749 .  
## amount             9.599e-05  4.821e-05   1.991 0.046471 *  
## saving_acctA62    -2.908e-01  3.120e-01  -0.932 0.351353    
## saving_acctA63    -1.729e-01  4.358e-01  -0.397 0.691633    
## saving_acctA64    -1.044e+00  5.286e-01  -1.975 0.048289 *  
## saving_acctA65    -1.098e+00  2.989e-01  -3.672 0.000241 ***
## present_empA72    -8.005e-02  4.279e-01  -0.187 0.851600    
## present_empA73    -3.230e-01  4.048e-01  -0.798 0.424979    
## present_empA74    -1.027e+00  4.559e-01  -2.253 0.024287 *  
## present_empA75    -4.380e-01  4.120e-01  -1.063 0.287705    
## installment_rate   3.272e-01  9.633e-02   3.397 0.000681 ***
## other_debtorA102   6.552e-01  4.337e-01   1.511 0.130886    
## other_debtorA103  -5.219e-01  4.350e-01  -1.200 0.230182    
## age               -1.635e-02  9.687e-03  -1.688 0.091398 .  
## other_installA142 -2.098e-01  4.475e-01  -0.469 0.639218    
## other_installA143 -5.057e-01  2.567e-01  -1.970 0.048859 *  
## housingA152       -6.292e-01  2.460e-01  -2.558 0.010528 *  
## housingA153       -3.698e-01  3.776e-01  -0.979 0.327447    
## n_credits          3.049e-01  2.047e-01   1.489 0.136513    
## foreignA202       -1.901e+00  8.054e-01  -2.360 0.018283 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 736.93  on 763  degrees of freedom
## AIC: 810.93
## 
## Number of Fisher Scoring iterations: 5
## Step glm suggestd following model-
#     glm(formula = response ~ chk_acct + duration + credit_his + purpose + 
#     amount + saving_acct + present_emp + installment_rate + other_debtor + 
#     age + other_install + housing + n_credits + foreign, family = binomial, 
#     data = train)



## Use stepwise model selection with BIC as cost function
glm.step.bic <- step(full.glm, k = log(nrow(train)))
## Start:  AIC=1053.44
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - purpose           9   753.24 1020.6
## - job               3   726.98 1034.5
## - present_emp       4   734.30 1035.1
## - property          3   729.35 1036.8
## - sex               3   730.53 1038.0
## - other_debtor      2   729.45 1043.6
## - saving_acct       4   743.25 1044.1
## - other_install     2   730.02 1044.2
## - housing           2   731.54 1045.7
## - credit_his        4   745.86 1046.7
## - present_resid     1   725.89 1046.8
## - n_people          1   726.05 1046.9
## - telephone         1   728.15 1049.0
## - age               1   728.32 1049.2
## - n_credits         1   728.94 1049.8
## - amount            1   730.81 1051.7
## - duration          1   732.06 1052.9
## <none>                  725.89 1053.4
## - foreign           1   733.25 1054.1
## - installment_rate  1   738.67 1059.5
## - chk_acct          3   771.47 1079.0
## 
## Step:  AIC=1020.62
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     present_emp + installment_rate + sex + other_debtor + present_resid + 
##     property + age + other_install + housing + n_credits + job + 
##     n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_emp       4   760.17 1000.8
## - job               3   753.83 1001.2
## - sex               3   757.49 1004.8
## - property          3   757.54 1004.9
## - saving_acct       4   768.93 1009.6
## - other_install     2   755.96 1010.0
## - other_debtor      2   756.83 1010.9
## - housing           2   759.24 1013.2
## - n_people          1   753.41 1014.1
## - present_resid     1   753.46 1014.2
## - credit_his        4   773.53 1014.2
## - age               1   754.27 1015.0
## - telephone         1   755.73 1016.4
## - amount            1   755.78 1016.5
## - n_credits         1   757.35 1018.0
## - duration          1   759.05 1019.8
## <none>                  753.24 1020.6
## - foreign           1   760.51 1021.2
## - installment_rate  1   766.24 1026.9
## - chk_acct          3   802.27 1049.6
## 
## Step:  AIC=1000.82
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     installment_rate + sex + other_debtor + present_resid + property + 
##     age + other_install + housing + n_credits + job + n_people + 
##     telephone + foreign
## 
##                    Df Deviance     AIC
## - job               3   760.25  980.84
## - property          3   764.42  985.02
## - sex               3   766.52  987.11
## - other_install     2   762.84  990.11
## - saving_acct       4   776.49  990.40
## - other_debtor      2   764.25  991.53
## - housing           2   765.53  992.81
## - n_people          1   760.31  994.27
## - credit_his        4   780.44  994.34
## - present_resid     1   760.87  994.83
## - age               1   761.52  995.48
## - amount            1   762.98  996.94
## - telephone         1   763.00  996.96
## - n_credits         1   763.71  997.67
## - duration          1   765.15  999.11
## <none>                  760.17 1000.82
## - foreign           1   766.98 1000.95
## - installment_rate  1   773.66 1007.62
## - chk_acct          3   810.73 1031.32
## 
## Step:  AIC=980.84
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     installment_rate + sex + other_debtor + present_resid + property + 
##     age + other_install + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance     AIC
## - property          3   764.44  964.98
## - sex               3   766.59  967.13
## - other_install     2   762.94  970.17
## - saving_acct       4   776.96  970.82
## - other_debtor      2   764.27  971.50
## - housing           2   765.58  972.81
## - n_people          1   760.40  974.31
## - credit_his        4   780.51  974.36
## - present_resid     1   760.90  974.81
## - age               1   761.68  975.59
## - amount            1   763.11  977.02
## - telephone         1   763.73  977.64
## - n_credits         1   763.74  977.64
## - duration          1   765.35  979.26
## <none>                  760.25  980.84
## - foreign           1   767.05  980.96
## - installment_rate  1   773.89  987.80
## - chk_acct          3   811.14 1011.67
## 
## Step:  AIC=964.98
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     installment_rate + sex + other_debtor + present_resid + age + 
##     other_install + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - sex               3   770.86 951.34
## - saving_acct       4   780.53 954.33
## - other_install     2   767.70 954.87
## - other_debtor      2   769.50 956.67
## - credit_his        4   783.93 957.73
## - housing           2   770.74 957.90
## - n_people          1   764.54 958.40
## - present_resid     1   765.08 958.94
## - age               1   765.85 959.70
## - telephone         1   767.25 961.10
## - n_credits         1   767.52 961.38
## - amount            1   767.86 961.71
## - duration          1   770.22 964.07
## <none>                  764.44 964.98
## - foreign           1   771.59 965.45
## - installment_rate  1   779.33 973.18
## - chk_acct          3   818.23 998.72
## 
## Step:  AIC=951.34
## response ~ chk_acct + duration + credit_his + amount + saving_acct + 
##     installment_rate + other_debtor + present_resid + age + other_install + 
##     housing + n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - saving_acct       4   786.91 940.66
## - other_install     2   773.66 940.77
## - other_debtor      2   775.76 942.88
## - n_people          1   770.91 944.71
## - credit_his        4   791.45 945.19
## - present_resid     1   771.69 945.49
## - housing           2   778.54 945.65
## - age               1   772.21 946.01
## - telephone         1   773.71 947.51
## - n_credits         1   773.72 947.52
## - amount            1   773.78 947.58
## - duration          1   776.70 950.50
## <none>                  770.86 951.34
## - foreign           1   778.65 952.45
## - installment_rate  1   783.10 956.90
## - chk_acct          3   825.61 986.04
## 
## Step:  AIC=940.66
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     other_debtor + present_resid + age + other_install + housing + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - other_install     2   788.90 929.28
## - other_debtor      2   791.42 931.79
## - n_people          1   787.02 934.08
## - housing           2   793.94 934.32
## - present_resid     1   788.13 935.19
## - credit_his        4   808.32 935.33
## - age               1   788.73 935.79
## - amount            1   789.21 936.27
## - n_credits         1   789.69 936.75
## - telephone         1   790.18 937.24
## - duration          1   792.63 939.70
## <none>                  786.91 940.66
## - foreign           1   795.75 942.81
## - installment_rate  1   797.70 944.76
## - chk_acct          3   856.80 990.50
## 
## Step:  AIC=929.28
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     other_debtor + present_resid + age + housing + n_credits + 
##     n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - other_debtor      2   793.04 920.05
## - housing           2   795.43 922.44
## - n_people          1   788.96 922.65
## - present_resid     1   790.27 923.96
## - age               1   790.59 924.29
## - amount            1   791.17 924.87
## - telephone         1   791.97 925.66
## - n_credits         1   792.04 925.74
## - credit_his        4   813.93 927.57
## - duration          1   794.67 928.37
## <none>                  788.90 929.28
## - foreign           1   797.55 931.24
## - installment_rate  1   799.75 933.44
## - chk_acct          3   858.65 978.97
## 
## Step:  AIC=920.05
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     present_resid + age + housing + n_credits + n_people + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - n_people          1   793.15 913.47
## - housing           2   799.93 913.57
## - present_resid     1   794.47 914.80
## - age               1   794.76 915.09
## - amount            1   796.19 916.51
## - telephone         1   796.24 916.56
## - n_credits         1   796.26 916.59
## - credit_his        4   817.69 917.96
## - duration          1   798.34 918.67
## <none>                  793.04 920.05
## - foreign           1   802.24 922.57
## - installment_rate  1   804.35 924.68
## - chk_acct          3   862.90 969.86
## 
## Step:  AIC=913.47
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     present_resid + age + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - housing           2   800.01 906.96
## - present_resid     1   794.59 908.23
## - age               1   794.94 908.58
## - n_credits         1   796.29 909.93
## - amount            1   796.31 909.95
## - telephone         1   796.31 909.95
## - credit_his        4   817.71 911.29
## - duration          1   798.46 912.09
## <none>                  793.15 913.47
## - foreign           1   802.42 916.06
## - installment_rate  1   804.61 918.25
## - chk_acct          3   863.04 963.31
## 
## Step:  AIC=906.96
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     present_resid + age + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_resid     1   800.19 900.46
## - age               1   802.78 903.05
## - telephone         1   802.84 903.11
## - n_credits         1   803.06 903.33
## - amount            1   803.96 904.23
## - duration          1   804.83 905.10
## - credit_his        4   826.62 906.83
## <none>                  800.01 906.96
## - foreign           1   808.84 909.11
## - installment_rate  1   810.96 911.23
## - chk_acct          3   874.22 961.12
## 
## Step:  AIC=900.46
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     age + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - telephone         1   803.09 896.67
## - n_credits         1   803.16 896.74
## - age               1   803.48 897.07
## - amount            1   804.17 897.76
## - duration          1   804.92 898.51
## <none>                  800.19 900.46
## - credit_his        4   826.98 900.51
## - foreign           1   808.92 902.51
## - installment_rate  1   811.04 904.62
## - chk_acct          3   874.23 954.44
## 
## Step:  AIC=896.67
## response ~ chk_acct + duration + credit_his + amount + installment_rate + 
##     age + n_credits + foreign
## 
##                    Df Deviance    AIC
## - amount            1   805.59 892.49
## - n_credits         1   805.70 892.60
## - age               1   807.27 894.17
## - duration          1   808.75 895.65
## <none>                  803.09 896.67
## - credit_his        4   830.40 897.24
## - foreign           1   811.10 898.00
## - installment_rate  1   813.13 900.03
## - chk_acct          3   878.88 952.41
## 
## Step:  AIC=892.49
## response ~ chk_acct + duration + credit_his + installment_rate + 
##     age + n_credits + foreign
## 
##                    Df Deviance    AIC
## - n_credits         1   808.25 888.46
## - age               1   809.10 889.31
## <none>                  805.59 892.49
## - foreign           1   813.20 893.42
## - installment_rate  1   813.26 893.48
## - credit_his        4   833.69 893.85
## - duration          1   824.43 904.64
## - chk_acct          3   880.67 947.51
## 
## Step:  AIC=888.46
## response ~ chk_acct + duration + credit_his + installment_rate + 
##     age + foreign
## 
##                    Df Deviance    AIC
## - age               1   811.58 885.11
## - credit_his        4   833.70 887.17
## <none>                  808.25 888.46
## - installment_rate  1   815.84 889.37
## - foreign           1   816.21 889.74
## - duration          1   826.66 900.19
## - chk_acct          3   883.02 943.18
## 
## Step:  AIC=885.11
## response ~ chk_acct + duration + credit_his + installment_rate + 
##     foreign
## 
##                    Df Deviance    AIC
## <none>                  811.58 885.11
## - installment_rate  1   818.77 885.61
## - foreign           1   819.03 885.87
## - credit_his        4   839.26 886.05
## - duration          1   830.18 897.03
## - chk_acct          3   886.47 939.95
summary(glm.step.bic)
## 
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + installment_rate + 
##     foreign, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7899  -0.8079  -0.4734   0.9077   2.3453  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.254200   0.498269  -0.510  0.60993    
## chk_acctA12      -0.470836   0.210253  -2.239  0.02513 *  
## chk_acctA13      -1.044010   0.384441  -2.716  0.00661 ** 
## chk_acctA14      -1.804941   0.228269  -7.907 2.64e-15 ***
## duration          0.030090   0.007026   4.283 1.85e-05 ***
## credit_hisA31    -0.209733   0.518289  -0.405  0.68572    
## credit_hisA32    -1.022521   0.400471  -2.553  0.01067 *  
## credit_hisA33    -0.911731   0.465469  -1.959  0.05014 .  
## credit_hisA34    -1.727040   0.429060  -4.025 5.69e-05 ***
## installment_rate  0.216842   0.081834   2.650  0.00805 ** 
## foreignA202      -1.741409   0.772964  -2.253  0.02427 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 811.58  on 789  degrees of freedom
## AIC: 833.58
## 
## Number of Fisher Scoring iterations: 5
## The model selected by stepwise model selection when BIC is used as criterion-
#   glm(formula = response ~ chk_acct + duration + credit_his+installment_rate + 
#   foreign, family = binomial, 
#   data = german_credit)


## Now let's build these two models
glm.aic<-glm(formula = response ~ chk_acct + duration + credit_his + purpose + 
               amount + saving_acct + present_emp + installment_rate + other_debtor + 
               age + other_install + housing + n_credits + foreign, family = binomial, 
             data = train)

glm.bic<-glm(formula = response ~ chk_acct + duration + credit_his + installment_rate + 
               foreign, family = binomial, data = train)
               
               
## Comparing AIC and BIC of both models
AIC(glm.aic)
## [1] 810.9323
AIC(glm.bic)
## [1] 833.5811
BIC(glm.aic)
## [1] 984.263
BIC(glm.bic)
## [1] 885.1118
### Defining optimup cutoff probability

df = seq(0.01, 0.99, 0.01)
result = cbind(df, NA)
cost1 <- function(actual, predicted) {
  weight1 = 5
  weight0 = 1
  c1 = (actual == 1) & (predicted < cutoff)  #logical vector - true if actual bad but predicted good
  c0 = (actual == 0) & (predicted > cutoff)  #logical vecotr - true if actual good but predicted bad
  return(mean(weight1 * c1 + weight0 * c0))
}
for (i in 1:length(df)) {
  cutoff <- result[i, 1]
  result[i, 2] <- cost1(train$response, predict(glm.aic, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

## We can see that the least cost is when coutoff probability is around 0.2 
## We get the exact cutoff probability
result[which(result[,2]==min(result[,2])),1]
##   df 
## 0.16
# Both criterion suggest different models


table(predict(full.glm, type = "response") > 0.14)
## 
## FALSE  TRUE 
##   285   515
table(predict(glm.aic, type = "response") > 0.14)
## 
## FALSE  TRUE 
##   286   514
table(predict(glm.bic, type = "response") > 0.14)
## 
## FALSE  TRUE 
##   238   562
## 0.4 seems to be a better threshold for the analysis, however, the cost is higher for this cutoff probability
table(predict(full.glm, type = "response") > 0.4)
## 
## FALSE  TRUE 
##   547   253
table(predict(glm.aic, type = "response") > 0.4)
## 
## FALSE  TRUE 
##   550   250
table(predict(glm.bic, type = "response") > 0.4)
## 
## FALSE  TRUE 
##   554   246
## Comparing insample performance of all the models
## Comparing all models
full.insample <- predict(full.glm, train, type = "response")
aic.insample <- predict(glm.aic, train, type = "response")
bic.insample <- predict(glm.bic, train, type = "response")

roc.plot(x = train$response == "1", pred = cbind(full.insample,aic.insample,bic.insample), 
         legend = TRUE, leg.text = c("Full Model", "AIC","BIC"))$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8282440 2.107096e-49          NA
## 2 Model  2 0.8216964 1.559410e-47          NA
## 3 Model  3 0.7707366 2.916515e-34          NA
### We can see that AIC based model and the full model have higher AUC, we will go ahead with AIC based model
## As it is less complex compared to full model

## Insample misclassification table 
prob.insample <- predict(glm.aic, type = "response")
predicted.insample <- prob.insample > 0.16
predicted.insample <- as.numeric(predicted.insample)

## Confusion matrix,
confusionMatrix(predicted.insample, train$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 288  20
##          1 272 220
##                                           
##                Accuracy : 0.635           
##                  95% CI : (0.6006, 0.6684)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3315          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5143          
##             Specificity : 0.9167          
##          Pos Pred Value : 0.9351          
##          Neg Pred Value : 0.4472          
##              Prevalence : 0.7000          
##          Detection Rate : 0.3600          
##    Detection Prevalence : 0.3850          
##       Balanced Accuracy : 0.7155          
##                                           
##        'Positive' Class : 0               
## 
# Error Rate-
mean(ifelse(train$response != predicted.insample, 1, 0))
## [1] 0.365
## Accuracy is 63% and the error rate is .36

## out of sample prediction
prob.outsample<-predict(glm.aic,test,type = "response")
predicted.outsample<-ifelse(prob.outsample<0.16,0,1)
## Confusion matrix,
confusionMatrix(predicted.outsample, test$response)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 78  7
##          1 62 53
##                                           
##                Accuracy : 0.655           
##                  95% CI : (0.5847, 0.7206)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.9272          
##                                           
##                   Kappa : 0.3491          
##  Mcnemar's Test P-Value : 7.987e-11       
##                                           
##             Sensitivity : 0.5571          
##             Specificity : 0.8833          
##          Pos Pred Value : 0.9176          
##          Neg Pred Value : 0.4609          
##              Prevalence : 0.7000          
##          Detection Rate : 0.3900          
##    Detection Prevalence : 0.4250          
##       Balanced Accuracy : 0.7202          
##                                           
##        'Positive' Class : 0               
## 
# out sample Error Rate-
mean(ifelse(test$response != predicted.outsample, 1, 0))
## [1] 0.345
## Out of sample accuracy is 60% and error rate is 0.39

## ROC 
roc.plot(test$response == "1", prob.outsample)
roc.plot(test$response == "1", prob.outsample)$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.8310714 6.188138e-14          NA
## AUC IS 0.83

## Comparing all models for out sample
aic.outsample <- predict(glm.aic, test, type = "response")

roc.plot(x = test$response == "1", pred = cbind(aic.outsample), 
         legend = TRUE, leg.text = c("AIC"))$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8310714 6.188138e-14          NA
## As we can see, AUC is best for Full and AIC based model