Install packages and load libraries, Load bank additional csv, remove missing values, remove duration column, change variables as factor, recode pdays, assign train test split.

summary(bank_train)
##       age                 job          marital                   education   
##  Min.   :18.00   admin.     :814   divorced: 360   basic.4y           : 331  
##  1st Qu.:32.00   blue-collar:670   married :2000   basic.6y           : 184  
##  Median :38.00   technician :564   single  : 925   basic.9y           : 456  
##  Mean   :40.27   services   :328   unknown :   8   high.school        : 751  
##  3rd Qu.:47.00   management :249                   professional.course: 433  
##  Max.   :86.00   retired    :138                   university.degree  :1001  
##                  (Other)    :530                   unknown            : 137  
##     default        housing          loan           contact         month     
##  no     :2655   no     :1487   no     :2678   cellular :2119   may    :1083  
##  unknown: 638   unknown:  82   unknown:  82   telephone:1174   jul    : 569  
##                 yes    :1724   yes    : 533                    aug    : 525  
##                                                                jun    : 421  
##                                                                nov    : 357  
##                                                                apr    : 165  
##                                                                (Other): 173  
##  day_of_week           campaign                           pdays     
##  Length:3293        Min.   : 1.000   More than a week        :  28  
##  Class :character   1st Qu.: 1.000   Not Previously Contacted:3155  
##  Mode  :character   Median : 2.000   One Week                : 110  
##                     Mean   : 2.529                                  
##                     3rd Qu.: 3.000                                  
##                     Max.   :35.000                                  
##                                                                     
##     previous            poutcome     emp.var.rate      cons.price.idx 
##  Min.   :0.000   failure    : 373   Min.   :-3.40000   Min.   :92.20  
##  1st Qu.:0.000   nonexistent:2800   1st Qu.:-1.80000   1st Qu.:93.08  
##  Median :0.000   success    : 120   Median : 1.10000   Median :93.88  
##  Mean   :0.201                      Mean   : 0.08521   Mean   :93.58  
##  3rd Qu.:0.000                      3rd Qu.: 1.40000   3rd Qu.:93.99  
##  Max.   :6.000                      Max.   : 1.40000   Max.   :94.77  
##                                                                       
##  cons.conf.idx      euribor3m      nr.employed         y         
##  Min.   :-50.80   Min.   :0.635   Min.   :4964   Min.   :0.0000  
##  1st Qu.:-42.70   1st Qu.:1.334   1st Qu.:5099   1st Qu.:0.0000  
##  Median :-41.80   Median :4.857   Median :5191   Median :0.0000  
##  Mean   :-40.41   Mean   :3.620   Mean   :5166   Mean   :0.1136  
##  3rd Qu.:-36.40   3rd Qu.:4.961   3rd Qu.:5228   3rd Qu.:0.0000  
##  Max.   :-26.90   Max.   :5.045   Max.   :5228   Max.   :1.0000  
## 

#Using Full model

full_model = glm(formula = y ~., data = bank_train, family = binomial)

cooks_sd_cutoff = 3*sd(cooks.distance(full_model))
out <- which(cooks.distance(full_model)>cooks_sd_cutoff)

full_model = glm(formula = y ~., data = bank_train[-out,], family = binomial)

#Remove loan variable as it contains not required information

full_model = glm(formula = y ~.-loan, data = bank_train[-out,], family = binomial)

summary(full_model)
## 
## Call:
## glm(formula = y ~ . - loan, family = binomial, data = bank_train[-out, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7826  -0.3428  -0.2350  -0.1276   3.3704  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.182e+02  1.455e+02  -0.812  0.41680    
## age                            1.973e-02  9.571e-03   2.062  0.03925 *  
## jobblue-collar                 2.433e-02  3.179e-01   0.077  0.93900    
## jobentrepreneur               -2.496e+00  1.050e+00  -2.377  0.01745 *  
## jobhousemaid                  -5.899e-01  6.218e-01  -0.949  0.34277    
## jobmanagement                 -7.085e-01  3.421e-01  -2.071  0.03839 *  
## jobretired                    -1.547e+00  4.791e-01  -3.229  0.00124 ** 
## jobself-employed              -1.768e+00  6.096e-01  -2.900  0.00373 ** 
## jobservices                   -5.282e-01  3.332e-01  -1.585  0.11292    
## jobstudent                    -2.742e-01  4.723e-01  -0.581  0.56151    
## jobtechnician                 -6.255e-02  2.462e-01  -0.254  0.79946    
## jobunemployed                 -1.310e+00  6.184e-01  -2.118  0.03417 *  
## jobunknown                    -1.259e+00  1.282e+00  -0.982  0.32599    
## maritalmarried                 1.330e-01  2.867e-01   0.464  0.64277    
## maritalsingle                  1.417e-01  3.208e-01   0.442  0.65861    
## maritalunknown                -1.240e+01  4.487e+02  -0.028  0.97796    
## educationbasic.6y             -1.192e-01  6.949e-01  -0.172  0.86378    
## educationbasic.9y              1.088e+00  4.643e-01   2.342  0.01916 *  
## educationhigh.school           1.281e+00  4.563e-01   2.808  0.00498 ** 
## educationprofessional.course   1.295e+00  4.818e-01   2.688  0.00718 ** 
## educationuniversity.degree     1.349e+00  4.626e-01   2.917  0.00354 ** 
## educationunknown               7.072e-01  5.894e-01   1.200  0.23016    
## defaultunknown                -2.684e-01  2.697e-01  -0.995  0.31965    
## housingunknown                -1.398e+00  8.609e-01  -1.624  0.10433    
## housingyes                    -2.721e-02  1.615e-01  -0.168  0.86623    
## contacttelephone              -2.641e+00  4.100e-01  -6.440 1.20e-10 ***
## monthaug                      -1.235e+00  5.329e-01  -2.318  0.02046 *  
## monthdec                       1.906e+00  8.156e-01   2.337  0.01944 *  
## monthjul                      -7.219e-01  4.567e-01  -1.581  0.11395    
## monthjun                       6.446e-01  5.131e-01   1.256  0.20895    
## monthmar                       3.213e+00  6.666e-01   4.820 1.44e-06 ***
## monthmay                      -1.341e-01  3.660e-01  -0.366  0.71415    
## monthnov                      -6.644e-01  5.148e-01  -1.290  0.19688    
## monthoct                      -5.030e-01  6.315e-01  -0.796  0.42576    
## monthsep                      -1.503e+00  7.468e-01  -2.013  0.04417 *  
## day_of_weekmon                -2.016e-01  2.517e-01  -0.801  0.42303    
## day_of_weekthu                 4.732e-02  2.456e-01   0.193  0.84718    
## day_of_weektue                -1.334e-01  2.581e-01  -0.517  0.60521    
## day_of_weekwed                -2.131e-01  2.636e-01  -0.808  0.41888    
## campaign                      -1.758e-01  5.964e-02  -2.948  0.00320 ** 
## pdaysNot Previously Contacted  3.631e-01  1.015e+00   0.358  0.72055    
## pdaysOne Week                  2.226e+00  8.206e-01   2.712  0.00668 ** 
## previous                      -1.670e-01  2.660e-01  -0.628  0.53011    
## poutcomenonexistent            1.198e+00  4.299e-01   2.787  0.00531 ** 
## poutcomesuccess                2.134e+00  9.915e-01   2.152  0.03140 *  
## emp.var.rate                  -4.037e-03  5.463e-01  -0.007  0.99410    
## cons.price.idx                 1.560e+00  9.626e-01   1.621  0.10508    
## cons.conf.idx                  1.437e-01  3.568e-02   4.026 5.68e-05 ***
## euribor3m                     -3.966e-01  5.288e-01  -0.750  0.45322    
## nr.employed                   -4.831e-03  1.183e-02  -0.408  0.68295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1947.8  on 3187  degrees of freedom
## Residual deviance: 1219.6  on 3138  degrees of freedom
## AIC: 1319.6
## 
## Number of Fisher Scoring iterations: 14
#Maximized cutoff

preds <- prediction(predict(full_model, bank_test, type = "response"),
                   bank_test$y) 
#Predicted Probability and True Classification

auc <- round(as.numeric(performance(preds, measure = "auc")@y.values),3)

false.rates <-performance(preds, "fpr","fnr")
accuracy <-performance(preds, "acc","err")

plot(unlist(performance(preds, "sens")@x.values), unlist(performance(preds, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(preds, "spec")@x.values), unlist(performance(preds, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(preds, "sens")@y.values) - unlist(performance(preds, "spec")@y.values)))
min.x<-unlist(performance(preds, "sens")@x.values)[min.diff]
min.y<-unlist(performance(preds, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#Plot ROC

perf <- performance(preds, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

# Using Confusion Matrix

predict = predict(full_model, bank_test, type = "response")
predict_choice = ifelse(predict>0.04541, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 464  29
##          1 283  48
##                                           
##                Accuracy : 0.6214          
##                  95% CI : (0.5872, 0.6546)
##     No Information Rate : 0.9066          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0986          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.62338         
##             Specificity : 0.62115         
##          Pos Pred Value : 0.14502         
##          Neg Pred Value : 0.94118         
##              Prevalence : 0.09345         
##          Detection Rate : 0.05825         
##    Detection Prevalence : 0.40170         
##       Balanced Accuracy : 0.62226         
##                                           
##        'Positive' Class : 1               
## 
#cooks distance

cooks_distance = cooks.distance(full_model)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(full_model),labels = rownames(bank_train))

#Check goodness of fit

hoslem.test(full_model$y, fitted(full_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  full_model$y, fitted(full_model)
## X-squared = 7.7407, df = 8, p-value = 0.4592

Full VIF model (removed predictors with VIF > 10)

note: removing outliers gives cutoff of 0, removing outliers

##remove housing

full_model = glm(formula = y ~., data = bank_train, family = binomial)

alias(full_model) 
## Model :
## y ~ age + job + marital + education + default + housing + loan + 
##     contact + month + day_of_week + campaign + pdays + previous + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed
## 
## Complete :
##             (Intercept) age jobblue-collar jobentrepreneur jobhousemaid
## loanunknown 0           0   0              0               0           
##             jobmanagement jobretired jobself-employed jobservices jobstudent
## loanunknown 0             0          0                0           0         
##             jobtechnician jobunemployed jobunknown maritalmarried maritalsingle
## loanunknown 0             0             0          0              0            
##             maritalunknown educationbasic.6y educationbasic.9y
## loanunknown 0              0                 0                
##             educationhigh.school educationprofessional.course
## loanunknown 0                    0                           
##             educationuniversity.degree educationunknown defaultunknown
## loanunknown 0                          0                0             
##             housingunknown housingyes loanyes contacttelephone monthaug
## loanunknown 1              0          0       0                0       
##             monthdec monthjul monthjun monthmar monthmay monthnov monthoct
## loanunknown 0        0        0        0        0        0        0       
##             monthsep day_of_weekmon day_of_weekthu day_of_weektue
## loanunknown 0        0              0              0             
##             day_of_weekwed campaign pdaysNot Previously Contacted pdaysOne Week
## loanunknown 0              0        0                             0            
##             previous poutcomenonexistent poutcomesuccess emp.var.rate
## loanunknown 0        0                   0               0           
##             cons.price.idx cons.conf.idx euribor3m nr.employed
## loanunknown 0              0             0         0

##remove nr.employed

full_vif <- glm(formula = y ~ .-housing, data = bank_train, family = binomial)

vif(full_vif) 
##                      GVIF Df GVIF^(1/(2*Df))
## age              2.109707  1        1.452483
## job              6.617304 11        1.089692
## marital          1.486633  3        1.068318
## education        3.595784  6        1.112541
## default          1.172887  1        1.082999
## loan             1.053313  2        1.013070
## contact          2.871235  1        1.694472
## month           83.882090  9        1.279000
## day_of_week      1.153264  4        1.017984
## campaign         1.056683  1        1.027951
## pdays           10.416581  2        1.796517
## previous         4.325893  1        2.079878
## poutcome        21.494397  2        2.153185
## emp.var.rate   141.638992  1       11.901218
## cons.price.idx  64.913164  1        8.056871
## cons.conf.idx    5.532312  1        2.352087
## euribor3m      151.325063  1       12.301425
## nr.employed    178.904609  1       13.375523

##remove emp.var.rate

full_vif <- glm(formula = y ~ .-housing-nr.employed, data = bank_train, family = binomial)

vif(full_vif)
##                     GVIF Df GVIF^(1/(2*Df))
## age             2.104310  1        1.450624
## job             6.552971 11        1.089208
## marital         1.483034  3        1.067887
## education       3.561663  6        1.111658
## default         1.172785  1        1.082952
## loan            1.052920  2        1.012975
## contact         2.678550  1        1.636628
## month          18.780782  9        1.176960
## day_of_week     1.138171  4        1.016309
## campaign        1.055772  1        1.027508
## pdays          10.380567  2        1.794962
## previous        4.324459  1        2.079533
## poutcome       21.418551  2        2.151283
## emp.var.rate   97.756697  1        9.887199
## cons.price.idx 12.193816  1        3.491965
## cons.conf.idx   2.959152  1        1.720219
## euribor3m      68.202188  1        8.258462

Remove poutcome

full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate, data = bank_train, family = binomial)

vif(full_vif)
##                     GVIF Df GVIF^(1/(2*Df))
## age             2.097682  1        1.448338
## job             6.443553 11        1.088375
## marital         1.479862  3        1.067506
## education       3.543395  6        1.111181
## default         1.169305  1        1.081344
## loan            1.052619  2        1.012903
## contact         2.461931  1        1.569054
## month           7.027310  9        1.114407
## day_of_week     1.136222  4        1.016092
## campaign        1.053083  1        1.026198
## pdays          10.440088  2        1.797530
## previous        4.347275  1        2.085012
## poutcome       21.471749  2        2.152617
## cons.price.idx  2.760575  1        1.661498
## cons.conf.idx   2.432893  1        1.559773
## euribor3m       2.850116  1        1.688229

##final full vif model

full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)

#remove outliers

inf <- 4/(nrow(bank_train))
out <- which(cooks.distance(full_model)>inf)

full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)

summary(full_vif)
## 
## Call:
## glm(formula = y ~ . - housing - nr.employed - emp.var.rate - 
##     poutcome, family = binomial, data = bank_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2921  -0.4128  -0.3294  -0.2513   2.8510  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -79.236937  13.979282  -5.668 1.44e-08 ***
## age                             0.015083   0.007601   1.984 0.047229 *  
## jobblue-collar                 -0.127529   0.253679  -0.503 0.615162    
## jobentrepreneur                -0.516907   0.429916  -1.202 0.229230    
## jobhousemaid                   -0.036444   0.420276  -0.087 0.930899    
## jobmanagement                  -0.286186   0.267723  -1.069 0.285086    
## jobretired                     -0.530367   0.352492  -1.505 0.132422    
## jobself-employed               -0.519916   0.379433  -1.370 0.170611    
## jobservices                    -0.189805   0.263834  -0.719 0.471889    
## jobstudent                      0.116014   0.375458   0.309 0.757327    
## jobtechnician                  -0.019800   0.214649  -0.092 0.926503    
## jobunemployed                  -0.023224   0.388138  -0.060 0.952288    
## jobunknown                      0.162708   0.623201   0.261 0.794028    
## maritalmarried                  0.230487   0.226339   1.018 0.308523    
## maritalsingle                   0.197062   0.259602   0.759 0.447798    
## maritalunknown                -11.892006 281.161028  -0.042 0.966263    
## educationbasic.6y               0.158814   0.393468   0.404 0.686487    
## educationbasic.9y               0.182169   0.306760   0.594 0.552612    
## educationhigh.school            0.338463   0.294624   1.149 0.250640    
## educationprofessional.course    0.394927   0.321869   1.227 0.219829    
## educationuniversity.degree      0.433401   0.295031   1.469 0.141832    
## educationunknown                0.357488   0.376153   0.950 0.341920    
## defaultunknown                  0.057384   0.192685   0.298 0.765847    
## loanunknown                    -0.240308   0.460321  -0.522 0.601639    
## loanyes                        -0.021621   0.174173  -0.124 0.901210    
## contacttelephone               -0.890142   0.241532  -3.685 0.000228 ***
## monthaug                       -0.279454   0.350735  -0.797 0.425586    
## monthdec                        1.242944   0.576848   2.155 0.031184 *  
## monthjul                        0.135859   0.336817   0.403 0.686682    
## monthjun                        0.426902   0.315689   1.352 0.176283    
## monthmar                        1.913444   0.447401   4.277 1.90e-05 ***
## monthmay                       -0.338625   0.268929  -1.259 0.207973    
## monthnov                       -0.037233   0.329562  -0.113 0.910049    
## monthoct                        0.086034   0.411515   0.209 0.834397    
## monthsep                       -0.366036   0.443472  -0.825 0.409153    
## day_of_weekmon                 -0.245488   0.202341  -1.213 0.225038    
## day_of_weekthu                 -0.013185   0.198240  -0.067 0.946971    
## day_of_weektue                 -0.048378   0.204243  -0.237 0.812762    
## day_of_weekwed                 -0.086808   0.209280  -0.415 0.678294    
## campaign                       -0.093578   0.039684  -2.358 0.018368 *  
## pdaysNot Previously Contacted  -0.613717   0.475146  -1.292 0.196482    
## pdaysOne Week                   1.592494   0.495671   3.213 0.001314 ** 
## previous                       -0.286880   0.114476  -2.506 0.012210 *  
## cons.price.idx                  0.871007   0.153494   5.675 1.39e-08 ***
## cons.conf.idx                   0.061791   0.017012   3.632 0.000281 ***
## euribor3m                      -0.548154   0.056037  -9.782  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2330.9  on 3292  degrees of freedom
## Residual deviance: 1793.5  on 3247  degrees of freedom
## AIC: 1885.5
## 
## Number of Fisher Scoring iterations: 13
#Maximized cutoff
pred <- prediction(predict(full_vif, bank_test, type = "response"),
                   bank_test$y) #Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## ROC Plot

perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

##Confusion Matrix

predict = predict(full_vif, bank_test, type = "response")
predict_choice = ifelse(predict>0.0744, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 495  25
##          1 252  52
##                                           
##                Accuracy : 0.6638          
##                  95% CI : (0.6304, 0.6961)
##     No Information Rate : 0.9066          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1455          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.67532         
##             Specificity : 0.66265         
##          Pos Pred Value : 0.17105         
##          Neg Pred Value : 0.95192         
##              Prevalence : 0.09345         
##          Detection Rate : 0.06311         
##    Detection Prevalence : 0.36893         
##       Balanced Accuracy : 0.66899         
##                                           
##        'Positive' Class : 1               
## 

##Cooks Distance Diagnostics

cooks_distance = cooks.distance(full_vif)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(full_vif),labels = rownames(bank_train))

##Check goodness of fit

hoslem.test(full_vif$y, fitted(full_vif), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  full_vif$y, fitted(full_vif)
## X-squared = 8.6778, df = 8, p-value = 0.3702

AIC model based on vif

full_model = glm(formula = y ~., data = bank_train, family = binomial)

AIC = step(full_model, direction="both",test="Chisq", trace = F) 

cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
out <- which(cooks.distance(AIC)>cooks_sd_cutoff)

AIC = glm(formula = y ~ contact + month + campaign + pdays + poutcome + 
    emp.var.rate + cons.price.idx + cons.conf.idx, family = binomial, 
    data = bank_train[-out,])

summary(AIC)
## 
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx, family = binomial, 
##     data = bank_train[-out, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7026  -0.3641  -0.2986  -0.2250   2.8925  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -215.03712   20.85477 -10.311  < 2e-16 ***
## contacttelephone                -2.51999    0.36208  -6.960 3.41e-12 ***
## monthaug                        -0.75554    0.44399  -1.702  0.08881 .  
## monthdec                         1.33077    0.90665   1.468  0.14216    
## monthjul                        -0.53414    0.40996  -1.303  0.19261    
## monthjun                         0.17025    0.36164   0.471  0.63779    
## monthmar                         2.88769    0.56138   5.144 2.69e-07 ***
## monthmay                         0.07237    0.32077   0.226  0.82150    
## monthnov                        -0.89119    0.40402  -2.206  0.02740 *  
## monthoct                        -1.02139    0.52680  -1.939  0.05252 .  
## monthsep                        -0.93958    0.56715  -1.657  0.09759 .  
## campaign                        -0.16813    0.05270  -3.191  0.00142 ** 
## pdaysNot Previously Contacted    0.86719    1.50206   0.577  0.56371    
## pdaysOne Week                    2.46473    1.29916   1.897  0.05781 .  
## poutcomenonexistent              1.64605    0.29484   5.583 2.37e-08 ***
## poutcomesuccess                  2.54628    1.78793   1.424  0.15440    
## emp.var.rate                    -0.79154    0.08229  -9.619  < 2e-16 ***
## cons.price.idx                   2.31436    0.22534  10.270  < 2e-16 ***
## cons.conf.idx                    0.12576    0.02261   5.563 2.66e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2054.0  on 3199  degrees of freedom
## Residual deviance: 1449.4  on 3181  degrees of freedom
## AIC: 1487.4
## 
## Number of Fisher Scoring iterations: 6

##Maximized cutoff

pred <- prediction(predict(AIC, bank_test, type = "response"),
                   bank_test$y) #Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## ROC Plot

perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

##Confusion Matrix

predict = predict(AIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 517  24
##          1 230  53
##                                          
##                Accuracy : 0.6917         
##                  95% CI : (0.659, 0.7232)
##     No Information Rate : 0.9066         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1729         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.68831        
##             Specificity : 0.69210        
##          Pos Pred Value : 0.18728        
##          Neg Pred Value : 0.95564        
##              Prevalence : 0.09345        
##          Detection Rate : 0.06432        
##    Detection Prevalence : 0.34345        
##       Balanced Accuracy : 0.69021        
##                                          
##        'Positive' Class : 1              
## 

##cooks distance diagnostics

cooks_distance = cooks.distance(AIC)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(AIC),labels = rownames(bank_train))

## Goodness of fit

hoslem.test(AIC$y, fitted(AIC), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  AIC$y, fitted(AIC)
## X-squared = 6.0238, df = 8, p-value = 0.6446

AIC model based on Full VIF

full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)

AIC = step(full_vif, direction="both",test="Chisq", trace = F) 

cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
out <- which(cooks.distance(AIC)>cooks_sd_cutoff)

AIC = glm(formula = y ~ contact + month + campaign + pdays + previous + 
    cons.price.idx + cons.conf.idx + euribor3m, family = binomial, 
    data = bank_train[-out,])

summary(AIC)
## 
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + previous + 
##     cons.price.idx + cons.conf.idx + euribor3m, family = binomial, 
##     data = bank_train[-out, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8339  -0.3681  -0.3012  -0.2420   2.7231  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -142.62305   17.57048  -8.117 4.77e-16 ***
## contacttelephone                -2.32696    0.35308  -6.590 4.38e-11 ***
## monthaug                        -1.24802    0.42323  -2.949  0.00319 ** 
## monthdec                         2.01362    0.93995   2.142  0.03217 *  
## monthjul                        -0.58326    0.40054  -1.456  0.14534    
## monthjun                         0.39551    0.35062   1.128  0.25930    
## monthmar                         1.86541    0.68155   2.737  0.00620 ** 
## monthmay                        -0.19473    0.30570  -0.637  0.52414    
## monthnov                        -0.67752    0.39098  -1.733  0.08312 .  
## monthoct                        -0.97625    0.51196  -1.907  0.05653 .  
## monthsep                        -1.23626    0.54775  -2.257  0.02401 *  
## campaign                        -0.17183    0.05263  -3.265  0.00110 ** 
## pdaysNot Previously Contacted   -0.58964    0.94635  -0.623  0.53324    
## pdaysOne Week                    2.88708    0.95628   3.019  0.00254 ** 
## previous                        -0.62317    0.15580  -4.000 6.34e-05 ***
## cons.price.idx                   1.60803    0.19388   8.294  < 2e-16 ***
## cons.conf.idx                    0.15436    0.02195   7.031 2.05e-12 ***
## euribor3m                       -0.57404    0.06157  -9.324  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2095.4  on 3207  degrees of freedom
## Residual deviance: 1495.0  on 3190  degrees of freedom
## AIC: 1531
## 
## Number of Fisher Scoring iterations: 6

Maximized cutoff plot

pred <- prediction(predict(AIC, bank_test, type = "response"),
                   bank_test$y) #Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## ROC Curve Plot

perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

## Confusion Matrix

predict = predict(AIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 513  25
##          1 234  52
##                                           
##                Accuracy : 0.6857          
##                  95% CI : (0.6528, 0.7173)
##     No Information Rate : 0.9066          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1633          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.67532         
##             Specificity : 0.68675         
##          Pos Pred Value : 0.18182         
##          Neg Pred Value : 0.95353         
##              Prevalence : 0.09345         
##          Detection Rate : 0.06311         
##    Detection Prevalence : 0.34709         
##       Balanced Accuracy : 0.68104         
##                                           
##        'Positive' Class : 1               
## 

Cooks distance diagnostic plot

cooks_distance = cooks.distance(AIC)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(AIC),labels = rownames(bank_train))

## Goodness of Fit

hoslem.test(AIC$y, fitted(AIC), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  AIC$y, fitted(AIC)
## X-squared = 9.1052, df = 8, p-value = 0.3335

BIC Based on Full Model

full_model = glm(formula = y ~., data = bank_train, family = binomial)

BIC = step(full_model, direction="both",test="Chisq", trace = F, k=log(nrow(bank_train)))

cooks_sd_cutoff = 3*sd(cooks.distance(BIC))
out <- which(cooks.distance(BIC)>cooks_sd_cutoff)

BIC = glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx + 
    cons.conf.idx, family = binomial, data = bank_train[-out,])

summary(BIC)
## 
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx, family = binomial, data = bank_train[-out, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5597  -0.3171  -0.2561  -0.1861   2.8514  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -214.83405   18.09645 -11.872  < 2e-16 ***
## contacttelephone      -2.18559    0.26179  -8.349  < 2e-16 ***
## poutcomenonexistent    1.62480    0.27191   5.976 2.29e-09 ***
## poutcomesuccess        4.19409    0.42355   9.902  < 2e-16 ***
## emp.var.rate          -1.01411    0.06815 -14.881  < 2e-16 ***
## cons.price.idx         2.29874    0.19454  11.816  < 2e-16 ***
## cons.conf.idx          0.09904    0.01527   6.484 8.92e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1879.6  on 3170  degrees of freedom
## Residual deviance: 1219.9  on 3164  degrees of freedom
## AIC: 1233.9
## 
## Number of Fisher Scoring iterations: 6

Maximum Cutoff Plot

pred <- prediction(predict(BIC, bank_test, type = "response"),
                   bank_test$y) #Predicted Probability and True Classification

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## ROC Curve Plot

perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

## Confusion Matrix

predict = predict(BIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 595  28
##          1 152  49
##                                           
##                Accuracy : 0.7816          
##                  95% CI : (0.7518, 0.8093)
##     No Information Rate : 0.9066          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2514          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.63636         
##             Specificity : 0.79652         
##          Pos Pred Value : 0.24378         
##          Neg Pred Value : 0.95506         
##              Prevalence : 0.09345         
##          Detection Rate : 0.05947         
##    Detection Prevalence : 0.24393         
##       Balanced Accuracy : 0.71644         
##                                           
##        'Positive' Class : 1               
## 

Cooks Distance Diagnostic Plot

cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(BIC),labels = rownames(bank_train))

## Goodness of Fit

hoslem.test(BIC$y, fitted(BIC), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  BIC$y, fitted(BIC)
## X-squared = 42.631, df = 8, p-value = 1.031e-06
exp(coef(BIC))
##         (Intercept)    contacttelephone poutcomenonexistent     poutcomesuccess 
##        4.997541e-94        1.124110e-01        5.077415e+00        6.629361e+01 
##        emp.var.rate      cons.price.idx       cons.conf.idx 
##        3.627254e-01        9.961587e+00        1.104107e+00

BIC based on full_vif or best model

full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)

BIC = step(full_vif, direction="both",test="Chisq", trace = F, k=log(nrow(bank_train)))

summary(BIC)
## 
## Call:
## glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx + 
##     euribor3m, family = binomial, data = bank_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9713  -0.3520  -0.3361  -0.3014   2.7820  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -77.42934   11.53584  -6.712 1.92e-11 ***
## contacttelephone               -0.82708    0.17907  -4.619 3.86e-06 ***
## pdaysNot Previously Contacted  -0.49052    0.42677  -1.149  0.25040    
## pdaysOne Week                   1.26243    0.46777   2.699  0.00696 ** 
## cons.price.idx                  0.85963    0.12476   6.890 5.57e-12 ***
## cons.conf.idx                   0.06774    0.01175   5.764 8.24e-09 ***
## euribor3m                      -0.54655    0.04201 -13.011  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2330.9  on 3292  degrees of freedom
## Residual deviance: 1870.5  on 3286  degrees of freedom
## AIC: 1884.5
## 
## Number of Fisher Scoring iterations: 5
deviance <-residuals(BIC, type = "deviance")
pearson <-residuals(BIC, type = "pearson")

# standardized deviance residuals

deviance_std<-residuals(BIC, type = "deviance")/sqrt(1 - hatvalues(BIC))

# standardized pearson residuals

pearson_std <-residuals(BIC, type = "pearson")/sqrt(1 - hatvalues(BIC)) 

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(deviance_std[BIC$model$y==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "Standard Deviance residuals", xlab = "ID")
points(deviance_std[BIC$model$y==0], col = "black")

plot(pearson_std[BIC$model$y==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "Standard Pearson residuals", xlab = "ID")
points(pearson_std[BIC$model$y==0], col = "black")

#outliers

cooks_sd_cutoff = 3*sd(cooks.distance(BIC))
out <- which(cooks.distance(BIC)>cooks_sd_cutoff)

#BIC

BIC = glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx + 
    euribor3m, family = binomial, data = bank_train[-out,])

#linearity plots
probabilities <- predict(BIC, type = "response")
predicted.classes <- ifelse(probabilities > 0.06352, 1, 0)


mydata = dplyr::select_if(bank_train[-out,c("cons.price.idx","cons.conf.idx","euribor3m")], is.numeric) 
predictors <- colnames(mydata)

mydata = mutate(mydata, logit = log(probabilities/(1-probabilities))) 
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)

ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'

Maximized Cutoff Plot

#Predicted Probability and True Classification
pred <- prediction(predict(BIC, bank_test, type = "response"),
                   bank_test$y) 

auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)

false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

## ROC Curve Plot

perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

## Confusion Matrix

predict = predict(BIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)

caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 543  25
##          1 204  52
##                                           
##                Accuracy : 0.7221          
##                  95% CI : (0.6901, 0.7524)
##     No Information Rate : 0.9066          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1969          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.67532         
##             Specificity : 0.72691         
##          Pos Pred Value : 0.20313         
##          Neg Pred Value : 0.95599         
##              Prevalence : 0.09345         
##          Detection Rate : 0.06311         
##    Detection Prevalence : 0.31068         
##       Balanced Accuracy : 0.70112         
##                                           
##        'Positive' Class : 1               
## 

Cooks distance diagnostic plot

cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(BIC),labels = rownames(bank_train))

## Goodness of fit

hoslem.test(BIC$y, fitted(BIC), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  BIC$y, fitted(BIC)
## X-squared = 20.446, df = 8, p-value = 0.008775
exp(coef(BIC))
##                   (Intercept)              contacttelephone 
##                  4.224681e-50                  3.123750e-01 
## pdaysNot Previously Contacted                 pdaysOne Week 
##                  3.174599e+05                  6.046893e+06 
##                cons.price.idx                 cons.conf.idx 
##                  3.048826e+00                  1.086699e+00 
##                     euribor3m 
##                  5.646773e-01
LR = Anova(BIC, test = "LR")
Wald = Anova(BIC, test = "Wald")
score = anova(BIC, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
##          LR    Score      Wald
## 1  39.18013 70.96549  36.15493
## 2 103.43374 70.96549  72.98712
## 3  65.71214 70.96549  65.27956
## 4  43.71774 70.96549  43.70600
## 5 180.49998 70.96549 178.32134
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
##           p.LR      p.Score       p.Wald
## 1 3.864537e-10 3.635281e-17 1.822372e-09
## 2 3.464564e-23 3.635281e-17 1.415954e-16
## 3 5.218350e-16 3.635281e-17 6.499197e-16
## 4 3.793177e-11 3.635281e-17 3.816006e-11
## 5 3.769249e-41 3.635281e-17 1.127064e-40
exp(coef(BIC))
##                   (Intercept)              contacttelephone 
##                  4.224681e-50                  3.123750e-01 
## pdaysNot Previously Contacted                 pdaysOne Week 
##                  3.174599e+05                  6.046893e+06 
##                cons.price.idx                 cons.conf.idx 
##                  3.048826e+00                  1.086699e+00 
##                     euribor3m 
##                  5.646773e-01
summary(BIC)
## 
## Call:
## glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx + 
##     euribor3m, family = binomial, data = bank_train[-out, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6146  -0.3623  -0.3200  -0.2819   2.7912  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -113.68831  380.35312  -0.299    0.765    
## contacttelephone                -1.16355    0.19351  -6.013 1.82e-09 ***
## pdaysNot Previously Contacted   12.66811  380.14316   0.033    0.973    
## pdaysOne Week                   15.61506  380.14330   0.041    0.967    
## cons.price.idx                   1.11476    0.13797   8.080 6.50e-16 ***
## cons.conf.idx                    0.08314    0.01258   6.611 3.82e-11 ***
## euribor3m                       -0.57150    0.04280 -13.354  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2226.7  on 3238  degrees of freedom
## Residual deviance: 1702.0  on 3232  degrees of freedom
## AIC: 1716
## 
## Number of Fisher Scoring iterations: 13
tidy(BIC)
## # A tibble: 7 x 5
##   term                           estimate std.error statistic  p.value
##   <chr>                             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                   -114.      380.       -0.299  7.65e- 1
## 2 contacttelephone                -1.16      0.194    -6.01   1.82e- 9
## 3 pdaysNot Previously Contacted   12.7     380.        0.0333 9.73e- 1
## 4 pdaysOne Week                   15.6     380.        0.0411 9.67e- 1
## 5 cons.price.idx                   1.11      0.138     8.08   6.50e-16
## 6 cons.conf.idx                    0.0831    0.0126    6.61   3.82e-11
## 7 euribor3m                       -0.572     0.0428  -13.4    1.13e-40
cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1) 
text(cooks.distance(BIC),labels = rownames(bank_train))