summary(bank_train)
## age job marital education
## Min. :18.00 admin. :814 divorced: 360 basic.4y : 331
## 1st Qu.:32.00 blue-collar:670 married :2000 basic.6y : 184
## Median :38.00 technician :564 single : 925 basic.9y : 456
## Mean :40.27 services :328 unknown : 8 high.school : 751
## 3rd Qu.:47.00 management :249 professional.course: 433
## Max. :86.00 retired :138 university.degree :1001
## (Other) :530 unknown : 137
## default housing loan contact month
## no :2655 no :1487 no :2678 cellular :2119 may :1083
## unknown: 638 unknown: 82 unknown: 82 telephone:1174 jul : 569
## yes :1724 yes : 533 aug : 525
## jun : 421
## nov : 357
## apr : 165
## (Other): 173
## day_of_week campaign pdays
## Length:3293 Min. : 1.000 More than a week : 28
## Class :character 1st Qu.: 1.000 Not Previously Contacted:3155
## Mode :character Median : 2.000 One Week : 110
## Mean : 2.529
## 3rd Qu.: 3.000
## Max. :35.000
##
## previous poutcome emp.var.rate cons.price.idx
## Min. :0.000 failure : 373 Min. :-3.40000 Min. :92.20
## 1st Qu.:0.000 nonexistent:2800 1st Qu.:-1.80000 1st Qu.:93.08
## Median :0.000 success : 120 Median : 1.10000 Median :93.88
## Mean :0.201 Mean : 0.08521 Mean :93.58
## 3rd Qu.:0.000 3rd Qu.: 1.40000 3rd Qu.:93.99
## Max. :6.000 Max. : 1.40000 Max. :94.77
##
## cons.conf.idx euribor3m nr.employed y
## Min. :-50.80 Min. :0.635 Min. :4964 Min. :0.0000
## 1st Qu.:-42.70 1st Qu.:1.334 1st Qu.:5099 1st Qu.:0.0000
## Median :-41.80 Median :4.857 Median :5191 Median :0.0000
## Mean :-40.41 Mean :3.620 Mean :5166 Mean :0.1136
## 3rd Qu.:-36.40 3rd Qu.:4.961 3rd Qu.:5228 3rd Qu.:0.0000
## Max. :-26.90 Max. :5.045 Max. :5228 Max. :1.0000
##
#Using Full model
full_model = glm(formula = y ~., data = bank_train, family = binomial)
cooks_sd_cutoff = 3*sd(cooks.distance(full_model))
out <- which(cooks.distance(full_model)>cooks_sd_cutoff)
full_model = glm(formula = y ~., data = bank_train[-out,], family = binomial)
#Remove loan variable as it contains not required information
full_model = glm(formula = y ~.-loan, data = bank_train[-out,], family = binomial)
summary(full_model)
##
## Call:
## glm(formula = y ~ . - loan, family = binomial, data = bank_train[-out,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7826 -0.3428 -0.2350 -0.1276 3.3704
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.182e+02 1.455e+02 -0.812 0.41680
## age 1.973e-02 9.571e-03 2.062 0.03925 *
## jobblue-collar 2.433e-02 3.179e-01 0.077 0.93900
## jobentrepreneur -2.496e+00 1.050e+00 -2.377 0.01745 *
## jobhousemaid -5.899e-01 6.218e-01 -0.949 0.34277
## jobmanagement -7.085e-01 3.421e-01 -2.071 0.03839 *
## jobretired -1.547e+00 4.791e-01 -3.229 0.00124 **
## jobself-employed -1.768e+00 6.096e-01 -2.900 0.00373 **
## jobservices -5.282e-01 3.332e-01 -1.585 0.11292
## jobstudent -2.742e-01 4.723e-01 -0.581 0.56151
## jobtechnician -6.255e-02 2.462e-01 -0.254 0.79946
## jobunemployed -1.310e+00 6.184e-01 -2.118 0.03417 *
## jobunknown -1.259e+00 1.282e+00 -0.982 0.32599
## maritalmarried 1.330e-01 2.867e-01 0.464 0.64277
## maritalsingle 1.417e-01 3.208e-01 0.442 0.65861
## maritalunknown -1.240e+01 4.487e+02 -0.028 0.97796
## educationbasic.6y -1.192e-01 6.949e-01 -0.172 0.86378
## educationbasic.9y 1.088e+00 4.643e-01 2.342 0.01916 *
## educationhigh.school 1.281e+00 4.563e-01 2.808 0.00498 **
## educationprofessional.course 1.295e+00 4.818e-01 2.688 0.00718 **
## educationuniversity.degree 1.349e+00 4.626e-01 2.917 0.00354 **
## educationunknown 7.072e-01 5.894e-01 1.200 0.23016
## defaultunknown -2.684e-01 2.697e-01 -0.995 0.31965
## housingunknown -1.398e+00 8.609e-01 -1.624 0.10433
## housingyes -2.721e-02 1.615e-01 -0.168 0.86623
## contacttelephone -2.641e+00 4.100e-01 -6.440 1.20e-10 ***
## monthaug -1.235e+00 5.329e-01 -2.318 0.02046 *
## monthdec 1.906e+00 8.156e-01 2.337 0.01944 *
## monthjul -7.219e-01 4.567e-01 -1.581 0.11395
## monthjun 6.446e-01 5.131e-01 1.256 0.20895
## monthmar 3.213e+00 6.666e-01 4.820 1.44e-06 ***
## monthmay -1.341e-01 3.660e-01 -0.366 0.71415
## monthnov -6.644e-01 5.148e-01 -1.290 0.19688
## monthoct -5.030e-01 6.315e-01 -0.796 0.42576
## monthsep -1.503e+00 7.468e-01 -2.013 0.04417 *
## day_of_weekmon -2.016e-01 2.517e-01 -0.801 0.42303
## day_of_weekthu 4.732e-02 2.456e-01 0.193 0.84718
## day_of_weektue -1.334e-01 2.581e-01 -0.517 0.60521
## day_of_weekwed -2.131e-01 2.636e-01 -0.808 0.41888
## campaign -1.758e-01 5.964e-02 -2.948 0.00320 **
## pdaysNot Previously Contacted 3.631e-01 1.015e+00 0.358 0.72055
## pdaysOne Week 2.226e+00 8.206e-01 2.712 0.00668 **
## previous -1.670e-01 2.660e-01 -0.628 0.53011
## poutcomenonexistent 1.198e+00 4.299e-01 2.787 0.00531 **
## poutcomesuccess 2.134e+00 9.915e-01 2.152 0.03140 *
## emp.var.rate -4.037e-03 5.463e-01 -0.007 0.99410
## cons.price.idx 1.560e+00 9.626e-01 1.621 0.10508
## cons.conf.idx 1.437e-01 3.568e-02 4.026 5.68e-05 ***
## euribor3m -3.966e-01 5.288e-01 -0.750 0.45322
## nr.employed -4.831e-03 1.183e-02 -0.408 0.68295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1947.8 on 3187 degrees of freedom
## Residual deviance: 1219.6 on 3138 degrees of freedom
## AIC: 1319.6
##
## Number of Fisher Scoring iterations: 14
#Maximized cutoff
preds <- prediction(predict(full_model, bank_test, type = "response"),
bank_test$y)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(preds, measure = "auc")@y.values),3)
false.rates <-performance(preds, "fpr","fnr")
accuracy <-performance(preds, "acc","err")
plot(unlist(performance(preds, "sens")@x.values), unlist(performance(preds, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(preds, "spec")@x.values), unlist(performance(preds, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(preds, "sens")@y.values) - unlist(performance(preds, "spec")@y.values)))
min.x<-unlist(performance(preds, "sens")@x.values)[min.diff]
min.y<-unlist(performance(preds, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#Plot ROC
perf <- performance(preds, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
# Using Confusion Matrix
predict = predict(full_model, bank_test, type = "response")
predict_choice = ifelse(predict>0.04541, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 464 29
## 1 283 48
##
## Accuracy : 0.6214
## 95% CI : (0.5872, 0.6546)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0986
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.62338
## Specificity : 0.62115
## Pos Pred Value : 0.14502
## Neg Pred Value : 0.94118
## Prevalence : 0.09345
## Detection Rate : 0.05825
## Detection Prevalence : 0.40170
## Balanced Accuracy : 0.62226
##
## 'Positive' Class : 1
##
#cooks distance
cooks_distance = cooks.distance(full_model)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(full_model),labels = rownames(bank_train))
#Check goodness of fit
hoslem.test(full_model$y, fitted(full_model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: full_model$y, fitted(full_model)
## X-squared = 7.7407, df = 8, p-value = 0.4592
note: removing outliers gives cutoff of 0, removing outliers
##remove housing
full_model = glm(formula = y ~., data = bank_train, family = binomial)
alias(full_model)
## Model :
## y ~ age + job + marital + education + default + housing + loan +
## contact + month + day_of_week + campaign + pdays + previous +
## poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed
##
## Complete :
## (Intercept) age jobblue-collar jobentrepreneur jobhousemaid
## loanunknown 0 0 0 0 0
## jobmanagement jobretired jobself-employed jobservices jobstudent
## loanunknown 0 0 0 0 0
## jobtechnician jobunemployed jobunknown maritalmarried maritalsingle
## loanunknown 0 0 0 0 0
## maritalunknown educationbasic.6y educationbasic.9y
## loanunknown 0 0 0
## educationhigh.school educationprofessional.course
## loanunknown 0 0
## educationuniversity.degree educationunknown defaultunknown
## loanunknown 0 0 0
## housingunknown housingyes loanyes contacttelephone monthaug
## loanunknown 1 0 0 0 0
## monthdec monthjul monthjun monthmar monthmay monthnov monthoct
## loanunknown 0 0 0 0 0 0 0
## monthsep day_of_weekmon day_of_weekthu day_of_weektue
## loanunknown 0 0 0 0
## day_of_weekwed campaign pdaysNot Previously Contacted pdaysOne Week
## loanunknown 0 0 0 0
## previous poutcomenonexistent poutcomesuccess emp.var.rate
## loanunknown 0 0 0 0
## cons.price.idx cons.conf.idx euribor3m nr.employed
## loanunknown 0 0 0 0
##remove nr.employed
full_vif <- glm(formula = y ~ .-housing, data = bank_train, family = binomial)
vif(full_vif)
## GVIF Df GVIF^(1/(2*Df))
## age 2.109707 1 1.452483
## job 6.617304 11 1.089692
## marital 1.486633 3 1.068318
## education 3.595784 6 1.112541
## default 1.172887 1 1.082999
## loan 1.053313 2 1.013070
## contact 2.871235 1 1.694472
## month 83.882090 9 1.279000
## day_of_week 1.153264 4 1.017984
## campaign 1.056683 1 1.027951
## pdays 10.416581 2 1.796517
## previous 4.325893 1 2.079878
## poutcome 21.494397 2 2.153185
## emp.var.rate 141.638992 1 11.901218
## cons.price.idx 64.913164 1 8.056871
## cons.conf.idx 5.532312 1 2.352087
## euribor3m 151.325063 1 12.301425
## nr.employed 178.904609 1 13.375523
##remove emp.var.rate
full_vif <- glm(formula = y ~ .-housing-nr.employed, data = bank_train, family = binomial)
vif(full_vif)
## GVIF Df GVIF^(1/(2*Df))
## age 2.104310 1 1.450624
## job 6.552971 11 1.089208
## marital 1.483034 3 1.067887
## education 3.561663 6 1.111658
## default 1.172785 1 1.082952
## loan 1.052920 2 1.012975
## contact 2.678550 1 1.636628
## month 18.780782 9 1.176960
## day_of_week 1.138171 4 1.016309
## campaign 1.055772 1 1.027508
## pdays 10.380567 2 1.794962
## previous 4.324459 1 2.079533
## poutcome 21.418551 2 2.151283
## emp.var.rate 97.756697 1 9.887199
## cons.price.idx 12.193816 1 3.491965
## cons.conf.idx 2.959152 1 1.720219
## euribor3m 68.202188 1 8.258462
full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate, data = bank_train, family = binomial)
vif(full_vif)
## GVIF Df GVIF^(1/(2*Df))
## age 2.097682 1 1.448338
## job 6.443553 11 1.088375
## marital 1.479862 3 1.067506
## education 3.543395 6 1.111181
## default 1.169305 1 1.081344
## loan 1.052619 2 1.012903
## contact 2.461931 1 1.569054
## month 7.027310 9 1.114407
## day_of_week 1.136222 4 1.016092
## campaign 1.053083 1 1.026198
## pdays 10.440088 2 1.797530
## previous 4.347275 1 2.085012
## poutcome 21.471749 2 2.152617
## cons.price.idx 2.760575 1 1.661498
## cons.conf.idx 2.432893 1 1.559773
## euribor3m 2.850116 1 1.688229
##final full vif model
full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)
#remove outliers
inf <- 4/(nrow(bank_train))
out <- which(cooks.distance(full_model)>inf)
full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)
summary(full_vif)
##
## Call:
## glm(formula = y ~ . - housing - nr.employed - emp.var.rate -
## poutcome, family = binomial, data = bank_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2921 -0.4128 -0.3294 -0.2513 2.8510
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -79.236937 13.979282 -5.668 1.44e-08 ***
## age 0.015083 0.007601 1.984 0.047229 *
## jobblue-collar -0.127529 0.253679 -0.503 0.615162
## jobentrepreneur -0.516907 0.429916 -1.202 0.229230
## jobhousemaid -0.036444 0.420276 -0.087 0.930899
## jobmanagement -0.286186 0.267723 -1.069 0.285086
## jobretired -0.530367 0.352492 -1.505 0.132422
## jobself-employed -0.519916 0.379433 -1.370 0.170611
## jobservices -0.189805 0.263834 -0.719 0.471889
## jobstudent 0.116014 0.375458 0.309 0.757327
## jobtechnician -0.019800 0.214649 -0.092 0.926503
## jobunemployed -0.023224 0.388138 -0.060 0.952288
## jobunknown 0.162708 0.623201 0.261 0.794028
## maritalmarried 0.230487 0.226339 1.018 0.308523
## maritalsingle 0.197062 0.259602 0.759 0.447798
## maritalunknown -11.892006 281.161028 -0.042 0.966263
## educationbasic.6y 0.158814 0.393468 0.404 0.686487
## educationbasic.9y 0.182169 0.306760 0.594 0.552612
## educationhigh.school 0.338463 0.294624 1.149 0.250640
## educationprofessional.course 0.394927 0.321869 1.227 0.219829
## educationuniversity.degree 0.433401 0.295031 1.469 0.141832
## educationunknown 0.357488 0.376153 0.950 0.341920
## defaultunknown 0.057384 0.192685 0.298 0.765847
## loanunknown -0.240308 0.460321 -0.522 0.601639
## loanyes -0.021621 0.174173 -0.124 0.901210
## contacttelephone -0.890142 0.241532 -3.685 0.000228 ***
## monthaug -0.279454 0.350735 -0.797 0.425586
## monthdec 1.242944 0.576848 2.155 0.031184 *
## monthjul 0.135859 0.336817 0.403 0.686682
## monthjun 0.426902 0.315689 1.352 0.176283
## monthmar 1.913444 0.447401 4.277 1.90e-05 ***
## monthmay -0.338625 0.268929 -1.259 0.207973
## monthnov -0.037233 0.329562 -0.113 0.910049
## monthoct 0.086034 0.411515 0.209 0.834397
## monthsep -0.366036 0.443472 -0.825 0.409153
## day_of_weekmon -0.245488 0.202341 -1.213 0.225038
## day_of_weekthu -0.013185 0.198240 -0.067 0.946971
## day_of_weektue -0.048378 0.204243 -0.237 0.812762
## day_of_weekwed -0.086808 0.209280 -0.415 0.678294
## campaign -0.093578 0.039684 -2.358 0.018368 *
## pdaysNot Previously Contacted -0.613717 0.475146 -1.292 0.196482
## pdaysOne Week 1.592494 0.495671 3.213 0.001314 **
## previous -0.286880 0.114476 -2.506 0.012210 *
## cons.price.idx 0.871007 0.153494 5.675 1.39e-08 ***
## cons.conf.idx 0.061791 0.017012 3.632 0.000281 ***
## euribor3m -0.548154 0.056037 -9.782 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2330.9 on 3292 degrees of freedom
## Residual deviance: 1793.5 on 3247 degrees of freedom
## AIC: 1885.5
##
## Number of Fisher Scoring iterations: 13
#Maximized cutoff
pred <- prediction(predict(full_vif, bank_test, type = "response"),
bank_test$y) #Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## ROC Plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
##Confusion Matrix
predict = predict(full_vif, bank_test, type = "response")
predict_choice = ifelse(predict>0.0744, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 495 25
## 1 252 52
##
## Accuracy : 0.6638
## 95% CI : (0.6304, 0.6961)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1455
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.67532
## Specificity : 0.66265
## Pos Pred Value : 0.17105
## Neg Pred Value : 0.95192
## Prevalence : 0.09345
## Detection Rate : 0.06311
## Detection Prevalence : 0.36893
## Balanced Accuracy : 0.66899
##
## 'Positive' Class : 1
##
##Cooks Distance Diagnostics
cooks_distance = cooks.distance(full_vif)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(full_vif),labels = rownames(bank_train))
##Check goodness of fit
hoslem.test(full_vif$y, fitted(full_vif), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: full_vif$y, fitted(full_vif)
## X-squared = 8.6778, df = 8, p-value = 0.3702
full_model = glm(formula = y ~., data = bank_train, family = binomial)
AIC = step(full_model, direction="both",test="Chisq", trace = F)
cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
out <- which(cooks.distance(AIC)>cooks_sd_cutoff)
AIC = glm(formula = y ~ contact + month + campaign + pdays + poutcome +
emp.var.rate + cons.price.idx + cons.conf.idx, family = binomial,
data = bank_train[-out,])
summary(AIC)
##
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + poutcome +
## emp.var.rate + cons.price.idx + cons.conf.idx, family = binomial,
## data = bank_train[-out, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7026 -0.3641 -0.2986 -0.2250 2.8925
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -215.03712 20.85477 -10.311 < 2e-16 ***
## contacttelephone -2.51999 0.36208 -6.960 3.41e-12 ***
## monthaug -0.75554 0.44399 -1.702 0.08881 .
## monthdec 1.33077 0.90665 1.468 0.14216
## monthjul -0.53414 0.40996 -1.303 0.19261
## monthjun 0.17025 0.36164 0.471 0.63779
## monthmar 2.88769 0.56138 5.144 2.69e-07 ***
## monthmay 0.07237 0.32077 0.226 0.82150
## monthnov -0.89119 0.40402 -2.206 0.02740 *
## monthoct -1.02139 0.52680 -1.939 0.05252 .
## monthsep -0.93958 0.56715 -1.657 0.09759 .
## campaign -0.16813 0.05270 -3.191 0.00142 **
## pdaysNot Previously Contacted 0.86719 1.50206 0.577 0.56371
## pdaysOne Week 2.46473 1.29916 1.897 0.05781 .
## poutcomenonexistent 1.64605 0.29484 5.583 2.37e-08 ***
## poutcomesuccess 2.54628 1.78793 1.424 0.15440
## emp.var.rate -0.79154 0.08229 -9.619 < 2e-16 ***
## cons.price.idx 2.31436 0.22534 10.270 < 2e-16 ***
## cons.conf.idx 0.12576 0.02261 5.563 2.66e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2054.0 on 3199 degrees of freedom
## Residual deviance: 1449.4 on 3181 degrees of freedom
## AIC: 1487.4
##
## Number of Fisher Scoring iterations: 6
##Maximized cutoff
pred <- prediction(predict(AIC, bank_test, type = "response"),
bank_test$y) #Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## ROC Plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
##Confusion Matrix
predict = predict(AIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 517 24
## 1 230 53
##
## Accuracy : 0.6917
## 95% CI : (0.659, 0.7232)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1729
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.68831
## Specificity : 0.69210
## Pos Pred Value : 0.18728
## Neg Pred Value : 0.95564
## Prevalence : 0.09345
## Detection Rate : 0.06432
## Detection Prevalence : 0.34345
## Balanced Accuracy : 0.69021
##
## 'Positive' Class : 1
##
##cooks distance diagnostics
cooks_distance = cooks.distance(AIC)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(AIC),labels = rownames(bank_train))
## Goodness of fit
hoslem.test(AIC$y, fitted(AIC), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: AIC$y, fitted(AIC)
## X-squared = 6.0238, df = 8, p-value = 0.6446
full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)
AIC = step(full_vif, direction="both",test="Chisq", trace = F)
cooks_sd_cutoff = 3*sd(cooks.distance(AIC))
out <- which(cooks.distance(AIC)>cooks_sd_cutoff)
AIC = glm(formula = y ~ contact + month + campaign + pdays + previous +
cons.price.idx + cons.conf.idx + euribor3m, family = binomial,
data = bank_train[-out,])
summary(AIC)
##
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + previous +
## cons.price.idx + cons.conf.idx + euribor3m, family = binomial,
## data = bank_train[-out, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8339 -0.3681 -0.3012 -0.2420 2.7231
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -142.62305 17.57048 -8.117 4.77e-16 ***
## contacttelephone -2.32696 0.35308 -6.590 4.38e-11 ***
## monthaug -1.24802 0.42323 -2.949 0.00319 **
## monthdec 2.01362 0.93995 2.142 0.03217 *
## monthjul -0.58326 0.40054 -1.456 0.14534
## monthjun 0.39551 0.35062 1.128 0.25930
## monthmar 1.86541 0.68155 2.737 0.00620 **
## monthmay -0.19473 0.30570 -0.637 0.52414
## monthnov -0.67752 0.39098 -1.733 0.08312 .
## monthoct -0.97625 0.51196 -1.907 0.05653 .
## monthsep -1.23626 0.54775 -2.257 0.02401 *
## campaign -0.17183 0.05263 -3.265 0.00110 **
## pdaysNot Previously Contacted -0.58964 0.94635 -0.623 0.53324
## pdaysOne Week 2.88708 0.95628 3.019 0.00254 **
## previous -0.62317 0.15580 -4.000 6.34e-05 ***
## cons.price.idx 1.60803 0.19388 8.294 < 2e-16 ***
## cons.conf.idx 0.15436 0.02195 7.031 2.05e-12 ***
## euribor3m -0.57404 0.06157 -9.324 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2095.4 on 3207 degrees of freedom
## Residual deviance: 1495.0 on 3190 degrees of freedom
## AIC: 1531
##
## Number of Fisher Scoring iterations: 6
pred <- prediction(predict(AIC, bank_test, type = "response"),
bank_test$y) #Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## ROC Curve Plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
## Confusion Matrix
predict = predict(AIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 513 25
## 1 234 52
##
## Accuracy : 0.6857
## 95% CI : (0.6528, 0.7173)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1633
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.67532
## Specificity : 0.68675
## Pos Pred Value : 0.18182
## Neg Pred Value : 0.95353
## Prevalence : 0.09345
## Detection Rate : 0.06311
## Detection Prevalence : 0.34709
## Balanced Accuracy : 0.68104
##
## 'Positive' Class : 1
##
cooks_distance = cooks.distance(AIC)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(AIC),labels = rownames(bank_train))
## Goodness of Fit
hoslem.test(AIC$y, fitted(AIC), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: AIC$y, fitted(AIC)
## X-squared = 9.1052, df = 8, p-value = 0.3335
full_model = glm(formula = y ~., data = bank_train, family = binomial)
BIC = step(full_model, direction="both",test="Chisq", trace = F, k=log(nrow(bank_train)))
cooks_sd_cutoff = 3*sd(cooks.distance(BIC))
out <- which(cooks.distance(BIC)>cooks_sd_cutoff)
BIC = glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx +
cons.conf.idx, family = binomial, data = bank_train[-out,])
summary(BIC)
##
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx, family = binomial, data = bank_train[-out,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5597 -0.3171 -0.2561 -0.1861 2.8514
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -214.83405 18.09645 -11.872 < 2e-16 ***
## contacttelephone -2.18559 0.26179 -8.349 < 2e-16 ***
## poutcomenonexistent 1.62480 0.27191 5.976 2.29e-09 ***
## poutcomesuccess 4.19409 0.42355 9.902 < 2e-16 ***
## emp.var.rate -1.01411 0.06815 -14.881 < 2e-16 ***
## cons.price.idx 2.29874 0.19454 11.816 < 2e-16 ***
## cons.conf.idx 0.09904 0.01527 6.484 8.92e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1879.6 on 3170 degrees of freedom
## Residual deviance: 1219.9 on 3164 degrees of freedom
## AIC: 1233.9
##
## Number of Fisher Scoring iterations: 6
pred <- prediction(predict(BIC, bank_test, type = "response"),
bank_test$y) #Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## ROC Curve Plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
## Confusion Matrix
predict = predict(BIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 595 28
## 1 152 49
##
## Accuracy : 0.7816
## 95% CI : (0.7518, 0.8093)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2514
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.63636
## Specificity : 0.79652
## Pos Pred Value : 0.24378
## Neg Pred Value : 0.95506
## Prevalence : 0.09345
## Detection Rate : 0.05947
## Detection Prevalence : 0.24393
## Balanced Accuracy : 0.71644
##
## 'Positive' Class : 1
##
cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(BIC),labels = rownames(bank_train))
## Goodness of Fit
hoslem.test(BIC$y, fitted(BIC), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: BIC$y, fitted(BIC)
## X-squared = 42.631, df = 8, p-value = 1.031e-06
exp(coef(BIC))
## (Intercept) contacttelephone poutcomenonexistent poutcomesuccess
## 4.997541e-94 1.124110e-01 5.077415e+00 6.629361e+01
## emp.var.rate cons.price.idx cons.conf.idx
## 3.627254e-01 9.961587e+00 1.104107e+00
full_vif <- glm(formula = y ~ .-housing-nr.employed-emp.var.rate-poutcome, data = bank_train, family = binomial)
BIC = step(full_vif, direction="both",test="Chisq", trace = F, k=log(nrow(bank_train)))
summary(BIC)
##
## Call:
## glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9713 -0.3520 -0.3361 -0.3014 2.7820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -77.42934 11.53584 -6.712 1.92e-11 ***
## contacttelephone -0.82708 0.17907 -4.619 3.86e-06 ***
## pdaysNot Previously Contacted -0.49052 0.42677 -1.149 0.25040
## pdaysOne Week 1.26243 0.46777 2.699 0.00696 **
## cons.price.idx 0.85963 0.12476 6.890 5.57e-12 ***
## cons.conf.idx 0.06774 0.01175 5.764 8.24e-09 ***
## euribor3m -0.54655 0.04201 -13.011 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2330.9 on 3292 degrees of freedom
## Residual deviance: 1870.5 on 3286 degrees of freedom
## AIC: 1884.5
##
## Number of Fisher Scoring iterations: 5
deviance <-residuals(BIC, type = "deviance")
pearson <-residuals(BIC, type = "pearson")
# standardized deviance residuals
deviance_std<-residuals(BIC, type = "deviance")/sqrt(1 - hatvalues(BIC))
# standardized pearson residuals
pearson_std <-residuals(BIC, type = "pearson")/sqrt(1 - hatvalues(BIC))
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(deviance_std[BIC$model$y==1], col = "red",
ylim = c(-3.5,3.5), ylab = "Standard Deviance residuals", xlab = "ID")
points(deviance_std[BIC$model$y==0], col = "black")
plot(pearson_std[BIC$model$y==1], col = "red",
ylim = c(-3.5,3.5), ylab = "Standard Pearson residuals", xlab = "ID")
points(pearson_std[BIC$model$y==0], col = "black")
#outliers
cooks_sd_cutoff = 3*sd(cooks.distance(BIC))
out <- which(cooks.distance(BIC)>cooks_sd_cutoff)
#BIC
BIC = glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx +
euribor3m, family = binomial, data = bank_train[-out,])
#linearity plots
probabilities <- predict(BIC, type = "response")
predicted.classes <- ifelse(probabilities > 0.06352, 1, 0)
mydata = dplyr::select_if(bank_train[-out,c("cons.price.idx","cons.conf.idx","euribor3m")], is.numeric)
predictors <- colnames(mydata)
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
#Predicted Probability and True Classification
pred <- prediction(predict(BIC, bank_test, type = "response"),
bank_test$y)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
## ROC Curve Plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
## Confusion Matrix
predict = predict(BIC, bank_test, type = "response")
predict_choice = ifelse(predict>optimal, 1,0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(bank_test$y), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 543 25
## 1 204 52
##
## Accuracy : 0.7221
## 95% CI : (0.6901, 0.7524)
## No Information Rate : 0.9066
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1969
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.67532
## Specificity : 0.72691
## Pos Pred Value : 0.20313
## Neg Pred Value : 0.95599
## Prevalence : 0.09345
## Detection Rate : 0.06311
## Detection Prevalence : 0.31068
## Balanced Accuracy : 0.70112
##
## 'Positive' Class : 1
##
cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(BIC),labels = rownames(bank_train))
## Goodness of fit
hoslem.test(BIC$y, fitted(BIC), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: BIC$y, fitted(BIC)
## X-squared = 20.446, df = 8, p-value = 0.008775
exp(coef(BIC))
## (Intercept) contacttelephone
## 4.224681e-50 3.123750e-01
## pdaysNot Previously Contacted pdaysOne Week
## 3.174599e+05 6.046893e+06
## cons.price.idx cons.conf.idx
## 3.048826e+00 1.086699e+00
## euribor3m
## 5.646773e-01
LR = Anova(BIC, test = "LR")
Wald = Anova(BIC, test = "Wald")
score = anova(BIC, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
## LR Score Wald
## 1 39.18013 70.96549 36.15493
## 2 103.43374 70.96549 72.98712
## 3 65.71214 70.96549 65.27956
## 4 43.71774 70.96549 43.70600
## 5 180.49998 70.96549 178.32134
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 3.864537e-10 3.635281e-17 1.822372e-09
## 2 3.464564e-23 3.635281e-17 1.415954e-16
## 3 5.218350e-16 3.635281e-17 6.499197e-16
## 4 3.793177e-11 3.635281e-17 3.816006e-11
## 5 3.769249e-41 3.635281e-17 1.127064e-40
exp(coef(BIC))
## (Intercept) contacttelephone
## 4.224681e-50 3.123750e-01
## pdaysNot Previously Contacted pdaysOne Week
## 3.174599e+05 6.046893e+06
## cons.price.idx cons.conf.idx
## 3.048826e+00 1.086699e+00
## euribor3m
## 5.646773e-01
summary(BIC)
##
## Call:
## glm(formula = y ~ contact + pdays + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_train[-out, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6146 -0.3623 -0.3200 -0.2819 2.7912
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -113.68831 380.35312 -0.299 0.765
## contacttelephone -1.16355 0.19351 -6.013 1.82e-09 ***
## pdaysNot Previously Contacted 12.66811 380.14316 0.033 0.973
## pdaysOne Week 15.61506 380.14330 0.041 0.967
## cons.price.idx 1.11476 0.13797 8.080 6.50e-16 ***
## cons.conf.idx 0.08314 0.01258 6.611 3.82e-11 ***
## euribor3m -0.57150 0.04280 -13.354 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2226.7 on 3238 degrees of freedom
## Residual deviance: 1702.0 on 3232 degrees of freedom
## AIC: 1716
##
## Number of Fisher Scoring iterations: 13
tidy(BIC)
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -114. 380. -0.299 7.65e- 1
## 2 contacttelephone -1.16 0.194 -6.01 1.82e- 9
## 3 pdaysNot Previously Contacted 12.7 380. 0.0333 9.73e- 1
## 4 pdaysOne Week 15.6 380. 0.0411 9.67e- 1
## 5 cons.price.idx 1.11 0.138 8.08 6.50e-16
## 6 cons.conf.idx 0.0831 0.0126 6.61 3.82e-11
## 7 euribor3m -0.572 0.0428 -13.4 1.13e-40
cooks_distance = cooks.distance(BIC)
plot(cooks_distance,col="red", pch=19, cex=1)
text(cooks.distance(BIC),labels = rownames(bank_train))