Load the functions
pacman::p_load(tidyverse, caret, corrplot, caTools, knitr, car,
ROCR, IRdisplay, e1071,earth, fastDummies)
Read the csv.file
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Predictive Analytic/EBA Predictive WK 2/data/bank marketing")
bank <- read.csv("bank_data.csv")
head(bank)
## age job marital education default balance housing loan contact
## 1 58 management married tertiary no 2143 yes no unknown
## 2 44 technician single secondary no 29 yes no unknown
## 3 33 entrepreneur married secondary no 2 yes yes unknown
## 4 47 blue-collar married unknown no 1506 yes no unknown
## 5 33 unknown single unknown no 1 no no unknown
## 6 35 management married tertiary no 231 yes no unknown
## day month duration campaign pdays previous poutcome y
## 1 5 may 261 1 -1 0 unknown no
## 2 5 may 151 1 -1 0 unknown no
## 3 5 may 76 1 -1 0 unknown no
## 4 5 may 92 1 -1 0 unknown no
## 5 5 may 198 1 -1 0 unknown no
## 6 5 may 139 1 -1 0 unknown no
dim(bank)
## [1] 45211 17
summary(bank)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
str(bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
variable ‘day’ not suppose to have mean
convert day to factor
bank$day <- as.factor(bank$day)
str(bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : Factor w/ 31 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(bank)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## 20 : 2752 may :13766 Min. : 0.0 Min. : 1.000
## 18 : 2308 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## 21 : 2026 aug : 6247 Median : 180.0 Median : 2.000
## 17 : 1939 jun : 5341 Mean : 258.2 Mean : 2.764
## 6 : 1932 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## 5 : 1910 apr : 2932 Max. :4918.0 Max. :63.000
## (Other):32344 (Other): 6060
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
Explore the subscribe and no subscribe proportion
Only 12% of customer response to the bank offers.
table(bank$y)
##
## no yes
## 39922 5289
bank %>%
group_by(y) %>%
summarise(per = n()/nrow(bank)) %>%
ggplot(aes(x=y, y = per, fill = y)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = round(per,2)), vjust =2)

Relevel ‘unknown’ as the baseline for prediction
table(bank$job)
##
## admin. blue-collar entrepreneur housemaid management
## 5171 9732 1487 1240 9458
## retired self-employed services student technician
## 2264 1579 4154 938 7597
## unemployed unknown
## 1303 288
bank$job <- relevel(bank$job, ref = 'unknown')
table(bank$job)
##
## unknown admin. blue-collar entrepreneur housemaid
## 288 5171 9732 1487 1240
## management retired self-employed services student
## 9458 2264 1579 4154 938
## technician unemployed
## 7597 1303
table(bank$education)
##
## primary secondary tertiary unknown
## 6851 23202 13301 1857
bank$education <- relevel(bank$education, ref = 'unknown')
table(bank$education)
##
## unknown primary secondary tertiary
## 1857 6851 23202 13301
table(bank$contact)
##
## cellular telephone unknown
## 29285 2906 13020
bank$contact <- relevel(bank$contact, ref = 'unknown')
table(bank$contact)
##
## unknown cellular telephone
## 13020 29285 2906
table(bank$poutcome)
##
## failure other success unknown
## 4901 1840 1511 36959
bank$poutcome <- relevel(bank$poutcome, ref = 'unknown')
table(bank$poutcome)
##
## unknown failure other success
## 36959 4901 1840 1511
Split the data
set.seed(123)
#### Create a boolean flag to split data
splitData = sample.split(bank$y, SplitRatio = 0.7)
train_set = bank[splitData,]
nrow(train_set)/nrow(bank)
## [1] 0.6999845
test_set = bank[!splitData,]
nrow(test_set)/nrow(bank)
## [1] 0.3000155
Use all independent variables to create the 1st model
model = glm(y~., data = train_set, family = binomial)
summary(model)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9959 -0.3742 -0.2494 -0.1492 3.3766
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.926e+00 3.849e-01 -10.198 < 2e-16 ***
## age 8.148e-04 2.624e-03 0.311 0.75615
## jobadmin. 3.386e-03 2.681e-01 0.013 0.98992
## jobblue-collar -2.406e-01 2.672e-01 -0.900 0.36788
## jobentrepreneur -2.472e-01 2.908e-01 -0.850 0.39522
## jobhousemaid -5.704e-01 3.022e-01 -1.887 0.05912 .
## jobmanagement -1.097e-01 2.661e-01 -0.412 0.68016
## jobretired 2.337e-01 2.730e-01 0.856 0.39201
## jobself-employed -2.309e-01 2.842e-01 -0.812 0.41654
## jobservices -1.768e-01 2.730e-01 -0.648 0.51730
## jobstudent 4.225e-01 2.832e-01 1.492 0.13576
## jobtechnician -1.221e-01 2.661e-01 -0.459 0.64648
## jobunemployed -9.039e-02 2.853e-01 -0.317 0.75137
## maritalmarried -1.025e-01 7.155e-02 -1.433 0.15189
## maritalsingle 1.090e-01 8.182e-02 1.332 0.18291
## educationprimary -2.387e-01 1.248e-01 -1.912 0.05593 .
## educationsecondary -2.064e-02 1.097e-01 -0.188 0.85072
## educationtertiary 1.472e-01 1.152e-01 1.278 0.20129
## defaultyes -2.857e-02 2.007e-01 -0.142 0.88680
## balance 1.429e-05 6.036e-06 2.368 0.01786 *
## housingyes -6.438e-01 5.280e-02 -12.194 < 2e-16 ***
## loanyes -4.421e-01 7.117e-02 -6.212 5.22e-10 ***
## contactcellular 1.575e+00 8.940e-02 17.618 < 2e-16 ***
## contacttelephone 1.368e+00 1.223e-01 11.192 < 2e-16 ***
## day2 -6.061e-02 2.289e-01 -0.265 0.79112
## day3 9.692e-02 2.317e-01 0.418 0.67571
## day4 7.813e-02 2.240e-01 0.349 0.72729
## day5 -1.595e-01 2.241e-01 -0.712 0.47673
## day6 -1.085e-01 2.258e-01 -0.480 0.63099
## day7 -1.868e-01 2.297e-01 -0.813 0.41613
## day8 8.307e-02 2.254e-01 0.369 0.71248
## day9 7.358e-02 2.318e-01 0.317 0.75095
## day10 5.701e-01 2.512e-01 2.270 0.02322 *
## day11 -2.031e-02 2.291e-01 -0.089 0.92936
## day12 3.731e-01 2.223e-01 1.678 0.09327 .
## day13 4.589e-01 2.251e-01 2.039 0.04146 *
## day14 1.479e-01 2.262e-01 0.654 0.51316
## day15 3.053e-01 2.235e-01 1.366 0.17192
## day16 3.468e-02 2.293e-01 0.151 0.87977
## day17 -4.758e-01 2.276e-01 -2.090 0.03661 *
## day18 -1.406e-02 2.226e-01 -0.063 0.94964
## day19 -5.243e-01 2.414e-01 -2.172 0.02984 *
## day20 -4.085e-01 2.261e-01 -1.806 0.07085 .
## day21 4.116e-02 2.293e-01 0.180 0.85753
## day22 2.215e-01 2.378e-01 0.932 0.35156
## day23 4.520e-01 2.515e-01 1.798 0.07225 .
## day24 1.152e-01 2.815e-01 0.409 0.68232
## day25 3.822e-01 2.432e-01 1.571 0.11608
## day26 4.832e-01 2.444e-01 1.977 0.04805 *
## day27 7.480e-01 2.380e-01 3.143 0.00167 **
## day28 4.584e-02 2.424e-01 0.189 0.85003
## day29 -5.330e-02 2.414e-01 -0.221 0.82526
## day30 3.728e-01 2.244e-01 1.661 0.09673 .
## day31 7.352e-02 3.054e-01 0.241 0.80978
## monthaug -8.489e-01 1.016e-01 -8.353 < 2e-16 ***
## monthdec 6.774e-01 2.125e-01 3.188 0.00143 **
## monthfeb -3.124e-01 1.155e-01 -2.705 0.00682 **
## monthjan -1.182e+00 1.542e-01 -7.662 1.83e-14 ***
## monthjul -9.537e-01 9.905e-02 -9.629 < 2e-16 ***
## monthjun 3.244e-01 1.168e-01 2.778 0.00546 **
## monthmar 1.489e+00 1.462e-01 10.182 < 2e-16 ***
## monthmay -6.286e-01 9.558e-02 -6.577 4.79e-11 ***
## monthnov -7.046e-01 1.109e-01 -6.352 2.13e-10 ***
## monthoct 7.808e-01 1.336e-01 5.842 5.14e-09 ***
## monthsep 6.327e-01 1.495e-01 4.231 2.33e-05 ***
## duration 4.163e-03 7.685e-05 54.167 < 2e-16 ***
## campaign -8.645e-02 1.209e-02 -7.150 8.68e-13 ***
## pdays -6.412e-06 3.594e-04 -0.018 0.98577
## previous 6.009e-03 6.391e-03 0.940 0.34712
## poutcomefailure 1.138e-01 1.103e-01 1.032 0.30227
## poutcomeother 3.734e-01 1.251e-01 2.984 0.00284 **
## poutcomesuccess 2.276e+00 1.009e-01 22.551 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22840 on 31646 degrees of freedom
## Residual deviance: 15028 on 31575 degrees of freedom
## AIC: 15172
##
## Number of Fisher Scoring iterations: 6
Check for multicollinearity
Appear month to have GVIF ~10
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## age 2.173545 1 1.474295
## job 4.258285 11 1.068075
## marital 1.451352 2 1.097598
## education 2.264738 3 1.145960
## default 1.017036 1 1.008482
## balance 1.046575 1 1.023022
## housing 1.448942 1 1.203720
## loan 1.072150 1 1.035447
## contact 1.990865 2 1.187847
## day 5.099757 30 1.027525
## month 11.274297 11 1.116406
## duration 1.141721 1 1.068513
## campaign 1.114992 1 1.055932
## pdays 3.671197 1 1.916037
## previous 1.196915 1 1.094036
## poutcome 4.147954 3 1.267571
Further refine the model by removing the insignificant variables
model = step(model, trace = F)
summary(model)
##
## Call:
## glm(formula = y ~ job + marital + education + balance + housing +
## loan + contact + day + month + duration + campaign + poutcome,
## family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9992 -0.3742 -0.2493 -0.1492 3.3774
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.883e+00 3.578e-01 -10.855 < 2e-16 ***
## jobadmin. -6.252e-04 2.678e-01 -0.002 0.99814
## jobblue-collar -2.468e-01 2.668e-01 -0.925 0.35495
## jobentrepreneur -2.515e-01 2.906e-01 -0.866 0.38672
## jobhousemaid -5.710e-01 3.021e-01 -1.890 0.05879 .
## jobmanagement -1.125e-01 2.659e-01 -0.423 0.67231
## jobretired 2.444e-01 2.704e-01 0.904 0.36612
## jobself-employed -2.344e-01 2.840e-01 -0.826 0.40903
## jobservices -1.824e-01 2.726e-01 -0.669 0.50346
## jobstudent 4.114e-01 2.807e-01 1.466 0.14276
## jobtechnician -1.264e-01 2.658e-01 -0.476 0.63436
## jobunemployed -9.500e-02 2.851e-01 -0.333 0.73893
## maritalmarried -1.039e-01 7.122e-02 -1.459 0.14445
## maritalsingle 1.003e-01 7.685e-02 1.305 0.19187
## educationprimary -2.385e-01 1.248e-01 -1.910 0.05608 .
## educationsecondary -2.334e-02 1.093e-01 -0.214 0.83089
## educationtertiary 1.437e-01 1.145e-01 1.255 0.20947
## balance 1.447e-05 6.010e-06 2.407 0.01606 *
## housingyes -6.445e-01 5.237e-02 -12.305 < 2e-16 ***
## loanyes -4.433e-01 7.102e-02 -6.242 4.33e-10 ***
## contactcellular 1.574e+00 8.931e-02 17.627 < 2e-16 ***
## contacttelephone 1.373e+00 1.215e-01 11.294 < 2e-16 ***
## day2 -5.847e-02 2.287e-01 -0.256 0.79824
## day3 9.537e-02 2.316e-01 0.412 0.68053
## day4 7.739e-02 2.240e-01 0.345 0.72972
## day5 -1.602e-01 2.241e-01 -0.715 0.47463
## day6 -1.092e-01 2.257e-01 -0.484 0.62856
## day7 -1.869e-01 2.297e-01 -0.814 0.41584
## day8 8.250e-02 2.253e-01 0.366 0.71426
## day9 7.291e-02 2.318e-01 0.315 0.75307
## day10 5.697e-01 2.511e-01 2.269 0.02327 *
## day11 -2.052e-02 2.290e-01 -0.090 0.92860
## day12 3.730e-01 2.222e-01 1.679 0.09321 .
## day13 4.580e-01 2.250e-01 2.036 0.04178 *
## day14 1.467e-01 2.261e-01 0.649 0.51639
## day15 3.042e-01 2.234e-01 1.362 0.17325
## day16 3.317e-02 2.292e-01 0.145 0.88492
## day17 -4.767e-01 2.275e-01 -2.095 0.03617 *
## day18 -1.588e-02 2.224e-01 -0.071 0.94310
## day19 -5.254e-01 2.413e-01 -2.177 0.02945 *
## day20 -4.102e-01 2.260e-01 -1.815 0.06953 .
## day21 3.984e-02 2.292e-01 0.174 0.86201
## day22 2.203e-01 2.377e-01 0.927 0.35416
## day23 4.514e-01 2.514e-01 1.796 0.07253 .
## day24 1.165e-01 2.814e-01 0.414 0.67892
## day25 3.810e-01 2.432e-01 1.567 0.11719
## day26 4.813e-01 2.444e-01 1.970 0.04888 *
## day27 7.462e-01 2.380e-01 3.136 0.00171 **
## day28 4.453e-02 2.424e-01 0.184 0.85425
## day29 -5.470e-02 2.414e-01 -0.227 0.82071
## day30 3.713e-01 2.244e-01 1.654 0.09807 .
## day31 7.318e-02 3.054e-01 0.240 0.81063
## monthaug -8.477e-01 1.016e-01 -8.346 < 2e-16 ***
## monthdec 6.792e-01 2.125e-01 3.197 0.00139 **
## monthfeb -3.103e-01 1.154e-01 -2.689 0.00716 **
## monthjan -1.181e+00 1.542e-01 -7.660 1.87e-14 ***
## monthjul -9.539e-01 9.901e-02 -9.634 < 2e-16 ***
## monthjun 3.248e-01 1.168e-01 2.782 0.00541 **
## monthmar 1.490e+00 1.462e-01 10.188 < 2e-16 ***
## monthmay -6.290e-01 9.552e-02 -6.585 4.56e-11 ***
## monthnov -7.031e-01 1.102e-01 -6.379 1.78e-10 ***
## monthoct 7.845e-01 1.335e-01 5.874 4.25e-09 ***
## monthsep 6.350e-01 1.495e-01 4.247 2.16e-05 ***
## duration 4.163e-03 7.685e-05 54.168 < 2e-16 ***
## campaign -8.610e-02 1.208e-02 -7.130 1.01e-12 ***
## poutcomefailure 1.303e-01 6.838e-02 1.906 0.05668 .
## poutcomeother 3.959e-01 9.410e-02 4.207 2.59e-05 ***
## poutcomesuccess 2.295e+00 7.942e-02 28.895 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22840 on 31646 degrees of freedom
## Residual deviance: 15029 on 31579 degrees of freedom
## AIC: 15165
##
## Number of Fisher Scoring iterations: 6
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## job 3.023164 11 1.051572
## marital 1.187779 2 1.043960
## education 2.214603 3 1.141693
## balance 1.038248 1 1.018945
## housing 1.425948 1 1.194131
## loan 1.067412 1 1.033156
## contact 1.936997 2 1.179729
## day 5.012828 30 1.027231
## month 10.942682 11 1.114892
## duration 1.141639 1 1.068475
## campaign 1.113858 1 1.055395
## poutcome 1.253764 3 1.038411
test it on train set
trainpredict = predict(model, newdata = train_set, type = 'response')
Assign 0 or 1 for the predict values
Set probability > 0.5 as the threshold , higher chance for subscribe
p_class = ifelse(trainpredict>0.5, "yes","no")
confusion matrix
matrix_table = table(train_set$y, p_class)
matrix_table
## p_class
## no yes
## no 27250 695
## yes 2400 1302
Check the accuracy and the variable importance
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.902
varImp(model)
## Overall
## jobadmin. 0.002334797
## jobblue-collar 0.925027689
## jobentrepreneur 0.865574141
## jobhousemaid 1.889773939
## jobmanagement 0.422977700
## jobretired 0.903759503
## jobself-employed 0.825601836
## jobservices 0.669058631
## jobstudent 1.465574384
## jobtechnician 0.475597820
## jobunemployed 0.333268521
## maritalmarried 1.459420034
## maritalsingle 1.305079328
## educationprimary 1.910434777
## educationsecondary 0.213565378
## educationtertiary 1.255028093
## balance 2.407488216
## housingyes 12.305386816
## loanyes 6.241720517
## contactcellular 17.627221438
## contacttelephone 11.293934834
## day2 0.255624026
## day3 0.411739075
## day4 0.345493193
## day5 0.714964501
## day6 0.483754674
## day7 0.813659487
## day8 0.366138039
## day9 0.314590871
## day10 2.268937203
## day11 0.089603380
## day12 1.678678490
## day13 2.035716410
## day14 0.648919166
## day15 1.361824566
## day16 0.144729358
## day17 2.094986925
## day18 0.071371931
## day19 2.177441577
## day20 1.814978146
## day21 0.173819746
## day22 0.926546571
## day23 1.795783899
## day24 0.413930546
## day25 1.566695326
## day26 1.969621559
## day27 3.135748785
## day28 0.183698788
## day29 0.226634899
## day30 1.654270784
## day31 0.239618945
## monthaug 8.346220239
## monthdec 3.197032272
## monthfeb 2.689171032
## monthjan 7.659516854
## monthjul 9.634404989
## monthjun 2.781675813
## monthmar 10.188318857
## monthmay 6.584615188
## monthnov 6.379094554
## monthoct 5.874309493
## monthsep 4.247330304
## duration 54.167622664
## campaign 7.129581919
## poutcomefailure 1.905764967
## poutcomeother 4.206631509
## poutcomesuccess 28.894944094
test the trained on test set
testPredict = predict(model, newdata = test_set, type = 'response')
p_class = ifelse(testPredict>0.5, "yes", "no")
matrix_table = table(test_set$y, p_class)
matrix_table
## p_class
## no yes
## no 11688 289
## yes 1012 575
Check the accuracy
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy, 3)
## [1] 0.904
Sort the highest probability to lowest.
head(sort(testPredict, decreasing = T), 10)
## 24149 31338 12348 30748 30155 24055 41589
## 1.0000000 0.9999815 0.9999345 0.9998955 0.9998137 0.9997795 0.9993971
## 17643 43902 39992
## 0.9992076 0.9990779 0.9989185
p-value for the model
with(model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = F))
## [1] 0
lift chart
pred = prediction(trainpredict, train_set$y)
perf = performance(pred, "lift", "rpp")
plot(perf, main = "lift curve", xlab = 'Proportion of Customers (sorted prob)')

Cumulative gain chart
gain = performance(pred, "tpr", "rpp")
plot(gain, col="orange", lwd = 2)

##### Add baseline and ideal line
plot(x=c(0,1), y=c(0,1), type = "l", col = "red", lwd =2,
ylab = "True Positive Rate",
xlab = "Rate of Positive Predictions")
lines(x=c(0,0.12,1), y = c(0,1,1), col="darkgreen", lwd = 2)
gain.x = unlist(slot(gain, 'x.values'))
gain.y = unlist(slot(gain, 'y.values'))
lines(x=gain.x, y=gain.y, col = "orange", lwd=2)
