Load the functions

pacman::p_load(tidyverse, caret, corrplot, caTools, knitr, car,
               ROCR, IRdisplay, e1071,earth, fastDummies)

Read the csv.file

setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Predictive Analytic/EBA Predictive WK 2/data/bank marketing")
bank <- read.csv("bank_data.csv")
head(bank)
##   age          job marital education default balance housing loan contact
## 1  58   management married  tertiary      no    2143     yes   no unknown
## 2  44   technician  single secondary      no      29     yes   no unknown
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown
## 5  33      unknown  single   unknown      no       1      no   no unknown
## 6  35   management married  tertiary      no     231     yes   no unknown
##   day month duration campaign pdays previous poutcome  y
## 1   5   may      261        1    -1        0  unknown no
## 2   5   may      151        1    -1        0  unknown no
## 3   5   may       76        1    -1        0  unknown no
## 4   5   may       92        1    -1        0  unknown no
## 5   5   may      198        1    -1        0  unknown no
## 6   5   may      139        1    -1        0  unknown no
dim(bank)
## [1] 45211    17
summary(bank)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
str(bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

variable ‘day’ not suppose to have mean

convert day to factor

bank$day <- as.factor(bank$day)
str(bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : Factor w/ 31 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(bank)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  20     : 2752   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  18     : 2308   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  21     : 2026   aug    : 6247   Median : 180.0   Median : 2.000  
##  17     : 1939   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  6      : 1932   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  5      : 1910   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##  (Other):32344   (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 

Explore the subscribe and no subscribe proportion

Only 12% of customer response to the bank offers.

table(bank$y)
## 
##    no   yes 
## 39922  5289
bank %>%
  group_by(y) %>%
  summarise(per = n()/nrow(bank)) %>%
  ggplot(aes(x=y, y = per, fill = y)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = round(per,2)), vjust =2)

Relevel ‘unknown’ as the baseline for prediction

table(bank$job)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          5171          9732          1487          1240          9458 
##       retired self-employed      services       student    technician 
##          2264          1579          4154           938          7597 
##    unemployed       unknown 
##          1303           288
bank$job <- relevel(bank$job, ref = 'unknown')
table(bank$job)
## 
##       unknown        admin.   blue-collar  entrepreneur     housemaid 
##           288          5171          9732          1487          1240 
##    management       retired self-employed      services       student 
##          9458          2264          1579          4154           938 
##    technician    unemployed 
##          7597          1303
table(bank$education)
## 
##   primary secondary  tertiary   unknown 
##      6851     23202     13301      1857
bank$education <- relevel(bank$education, ref = 'unknown')
table(bank$education)
## 
##   unknown   primary secondary  tertiary 
##      1857      6851     23202     13301
table(bank$contact)
## 
##  cellular telephone   unknown 
##     29285      2906     13020
bank$contact <- relevel(bank$contact, ref = 'unknown')
table(bank$contact)
## 
##   unknown  cellular telephone 
##     13020     29285      2906
table(bank$poutcome)
## 
## failure   other success unknown 
##    4901    1840    1511   36959
bank$poutcome <- relevel(bank$poutcome, ref = 'unknown')
table(bank$poutcome)
## 
## unknown failure   other success 
##   36959    4901    1840    1511

Split the data

set.seed(123)

#### Create a boolean flag to split data
splitData = sample.split(bank$y, SplitRatio = 0.7)
train_set = bank[splitData,]
nrow(train_set)/nrow(bank)
## [1] 0.6999845
test_set = bank[!splitData,]
nrow(test_set)/nrow(bank)
## [1] 0.3000155

Use all independent variables to create the 1st model

model = glm(y~., data = train_set, family = binomial)
summary(model)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9959  -0.3742  -0.2494  -0.1492   3.3766  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.926e+00  3.849e-01 -10.198  < 2e-16 ***
## age                 8.148e-04  2.624e-03   0.311  0.75615    
## jobadmin.           3.386e-03  2.681e-01   0.013  0.98992    
## jobblue-collar     -2.406e-01  2.672e-01  -0.900  0.36788    
## jobentrepreneur    -2.472e-01  2.908e-01  -0.850  0.39522    
## jobhousemaid       -5.704e-01  3.022e-01  -1.887  0.05912 .  
## jobmanagement      -1.097e-01  2.661e-01  -0.412  0.68016    
## jobretired          2.337e-01  2.730e-01   0.856  0.39201    
## jobself-employed   -2.309e-01  2.842e-01  -0.812  0.41654    
## jobservices        -1.768e-01  2.730e-01  -0.648  0.51730    
## jobstudent          4.225e-01  2.832e-01   1.492  0.13576    
## jobtechnician      -1.221e-01  2.661e-01  -0.459  0.64648    
## jobunemployed      -9.039e-02  2.853e-01  -0.317  0.75137    
## maritalmarried     -1.025e-01  7.155e-02  -1.433  0.15189    
## maritalsingle       1.090e-01  8.182e-02   1.332  0.18291    
## educationprimary   -2.387e-01  1.248e-01  -1.912  0.05593 .  
## educationsecondary -2.064e-02  1.097e-01  -0.188  0.85072    
## educationtertiary   1.472e-01  1.152e-01   1.278  0.20129    
## defaultyes         -2.857e-02  2.007e-01  -0.142  0.88680    
## balance             1.429e-05  6.036e-06   2.368  0.01786 *  
## housingyes         -6.438e-01  5.280e-02 -12.194  < 2e-16 ***
## loanyes            -4.421e-01  7.117e-02  -6.212 5.22e-10 ***
## contactcellular     1.575e+00  8.940e-02  17.618  < 2e-16 ***
## contacttelephone    1.368e+00  1.223e-01  11.192  < 2e-16 ***
## day2               -6.061e-02  2.289e-01  -0.265  0.79112    
## day3                9.692e-02  2.317e-01   0.418  0.67571    
## day4                7.813e-02  2.240e-01   0.349  0.72729    
## day5               -1.595e-01  2.241e-01  -0.712  0.47673    
## day6               -1.085e-01  2.258e-01  -0.480  0.63099    
## day7               -1.868e-01  2.297e-01  -0.813  0.41613    
## day8                8.307e-02  2.254e-01   0.369  0.71248    
## day9                7.358e-02  2.318e-01   0.317  0.75095    
## day10               5.701e-01  2.512e-01   2.270  0.02322 *  
## day11              -2.031e-02  2.291e-01  -0.089  0.92936    
## day12               3.731e-01  2.223e-01   1.678  0.09327 .  
## day13               4.589e-01  2.251e-01   2.039  0.04146 *  
## day14               1.479e-01  2.262e-01   0.654  0.51316    
## day15               3.053e-01  2.235e-01   1.366  0.17192    
## day16               3.468e-02  2.293e-01   0.151  0.87977    
## day17              -4.758e-01  2.276e-01  -2.090  0.03661 *  
## day18              -1.406e-02  2.226e-01  -0.063  0.94964    
## day19              -5.243e-01  2.414e-01  -2.172  0.02984 *  
## day20              -4.085e-01  2.261e-01  -1.806  0.07085 .  
## day21               4.116e-02  2.293e-01   0.180  0.85753    
## day22               2.215e-01  2.378e-01   0.932  0.35156    
## day23               4.520e-01  2.515e-01   1.798  0.07225 .  
## day24               1.152e-01  2.815e-01   0.409  0.68232    
## day25               3.822e-01  2.432e-01   1.571  0.11608    
## day26               4.832e-01  2.444e-01   1.977  0.04805 *  
## day27               7.480e-01  2.380e-01   3.143  0.00167 ** 
## day28               4.584e-02  2.424e-01   0.189  0.85003    
## day29              -5.330e-02  2.414e-01  -0.221  0.82526    
## day30               3.728e-01  2.244e-01   1.661  0.09673 .  
## day31               7.352e-02  3.054e-01   0.241  0.80978    
## monthaug           -8.489e-01  1.016e-01  -8.353  < 2e-16 ***
## monthdec            6.774e-01  2.125e-01   3.188  0.00143 ** 
## monthfeb           -3.124e-01  1.155e-01  -2.705  0.00682 ** 
## monthjan           -1.182e+00  1.542e-01  -7.662 1.83e-14 ***
## monthjul           -9.537e-01  9.905e-02  -9.629  < 2e-16 ***
## monthjun            3.244e-01  1.168e-01   2.778  0.00546 ** 
## monthmar            1.489e+00  1.462e-01  10.182  < 2e-16 ***
## monthmay           -6.286e-01  9.558e-02  -6.577 4.79e-11 ***
## monthnov           -7.046e-01  1.109e-01  -6.352 2.13e-10 ***
## monthoct            7.808e-01  1.336e-01   5.842 5.14e-09 ***
## monthsep            6.327e-01  1.495e-01   4.231 2.33e-05 ***
## duration            4.163e-03  7.685e-05  54.167  < 2e-16 ***
## campaign           -8.645e-02  1.209e-02  -7.150 8.68e-13 ***
## pdays              -6.412e-06  3.594e-04  -0.018  0.98577    
## previous            6.009e-03  6.391e-03   0.940  0.34712    
## poutcomefailure     1.138e-01  1.103e-01   1.032  0.30227    
## poutcomeother       3.734e-01  1.251e-01   2.984  0.00284 ** 
## poutcomesuccess     2.276e+00  1.009e-01  22.551  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22840  on 31646  degrees of freedom
## Residual deviance: 15028  on 31575  degrees of freedom
## AIC: 15172
## 
## Number of Fisher Scoring iterations: 6

Check for multicollinearity

Appear month to have GVIF ~10

vif(model)
##                GVIF Df GVIF^(1/(2*Df))
## age        2.173545  1        1.474295
## job        4.258285 11        1.068075
## marital    1.451352  2        1.097598
## education  2.264738  3        1.145960
## default    1.017036  1        1.008482
## balance    1.046575  1        1.023022
## housing    1.448942  1        1.203720
## loan       1.072150  1        1.035447
## contact    1.990865  2        1.187847
## day        5.099757 30        1.027525
## month     11.274297 11        1.116406
## duration   1.141721  1        1.068513
## campaign   1.114992  1        1.055932
## pdays      3.671197  1        1.916037
## previous   1.196915  1        1.094036
## poutcome   4.147954  3        1.267571

Further refine the model by removing the insignificant variables

model = step(model, trace = F)
summary(model)
## 
## Call:
## glm(formula = y ~ job + marital + education + balance + housing + 
##     loan + contact + day + month + duration + campaign + poutcome, 
##     family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9992  -0.3742  -0.2493  -0.1492   3.3774  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.883e+00  3.578e-01 -10.855  < 2e-16 ***
## jobadmin.          -6.252e-04  2.678e-01  -0.002  0.99814    
## jobblue-collar     -2.468e-01  2.668e-01  -0.925  0.35495    
## jobentrepreneur    -2.515e-01  2.906e-01  -0.866  0.38672    
## jobhousemaid       -5.710e-01  3.021e-01  -1.890  0.05879 .  
## jobmanagement      -1.125e-01  2.659e-01  -0.423  0.67231    
## jobretired          2.444e-01  2.704e-01   0.904  0.36612    
## jobself-employed   -2.344e-01  2.840e-01  -0.826  0.40903    
## jobservices        -1.824e-01  2.726e-01  -0.669  0.50346    
## jobstudent          4.114e-01  2.807e-01   1.466  0.14276    
## jobtechnician      -1.264e-01  2.658e-01  -0.476  0.63436    
## jobunemployed      -9.500e-02  2.851e-01  -0.333  0.73893    
## maritalmarried     -1.039e-01  7.122e-02  -1.459  0.14445    
## maritalsingle       1.003e-01  7.685e-02   1.305  0.19187    
## educationprimary   -2.385e-01  1.248e-01  -1.910  0.05608 .  
## educationsecondary -2.334e-02  1.093e-01  -0.214  0.83089    
## educationtertiary   1.437e-01  1.145e-01   1.255  0.20947    
## balance             1.447e-05  6.010e-06   2.407  0.01606 *  
## housingyes         -6.445e-01  5.237e-02 -12.305  < 2e-16 ***
## loanyes            -4.433e-01  7.102e-02  -6.242 4.33e-10 ***
## contactcellular     1.574e+00  8.931e-02  17.627  < 2e-16 ***
## contacttelephone    1.373e+00  1.215e-01  11.294  < 2e-16 ***
## day2               -5.847e-02  2.287e-01  -0.256  0.79824    
## day3                9.537e-02  2.316e-01   0.412  0.68053    
## day4                7.739e-02  2.240e-01   0.345  0.72972    
## day5               -1.602e-01  2.241e-01  -0.715  0.47463    
## day6               -1.092e-01  2.257e-01  -0.484  0.62856    
## day7               -1.869e-01  2.297e-01  -0.814  0.41584    
## day8                8.250e-02  2.253e-01   0.366  0.71426    
## day9                7.291e-02  2.318e-01   0.315  0.75307    
## day10               5.697e-01  2.511e-01   2.269  0.02327 *  
## day11              -2.052e-02  2.290e-01  -0.090  0.92860    
## day12               3.730e-01  2.222e-01   1.679  0.09321 .  
## day13               4.580e-01  2.250e-01   2.036  0.04178 *  
## day14               1.467e-01  2.261e-01   0.649  0.51639    
## day15               3.042e-01  2.234e-01   1.362  0.17325    
## day16               3.317e-02  2.292e-01   0.145  0.88492    
## day17              -4.767e-01  2.275e-01  -2.095  0.03617 *  
## day18              -1.588e-02  2.224e-01  -0.071  0.94310    
## day19              -5.254e-01  2.413e-01  -2.177  0.02945 *  
## day20              -4.102e-01  2.260e-01  -1.815  0.06953 .  
## day21               3.984e-02  2.292e-01   0.174  0.86201    
## day22               2.203e-01  2.377e-01   0.927  0.35416    
## day23               4.514e-01  2.514e-01   1.796  0.07253 .  
## day24               1.165e-01  2.814e-01   0.414  0.67892    
## day25               3.810e-01  2.432e-01   1.567  0.11719    
## day26               4.813e-01  2.444e-01   1.970  0.04888 *  
## day27               7.462e-01  2.380e-01   3.136  0.00171 ** 
## day28               4.453e-02  2.424e-01   0.184  0.85425    
## day29              -5.470e-02  2.414e-01  -0.227  0.82071    
## day30               3.713e-01  2.244e-01   1.654  0.09807 .  
## day31               7.318e-02  3.054e-01   0.240  0.81063    
## monthaug           -8.477e-01  1.016e-01  -8.346  < 2e-16 ***
## monthdec            6.792e-01  2.125e-01   3.197  0.00139 ** 
## monthfeb           -3.103e-01  1.154e-01  -2.689  0.00716 ** 
## monthjan           -1.181e+00  1.542e-01  -7.660 1.87e-14 ***
## monthjul           -9.539e-01  9.901e-02  -9.634  < 2e-16 ***
## monthjun            3.248e-01  1.168e-01   2.782  0.00541 ** 
## monthmar            1.490e+00  1.462e-01  10.188  < 2e-16 ***
## monthmay           -6.290e-01  9.552e-02  -6.585 4.56e-11 ***
## monthnov           -7.031e-01  1.102e-01  -6.379 1.78e-10 ***
## monthoct            7.845e-01  1.335e-01   5.874 4.25e-09 ***
## monthsep            6.350e-01  1.495e-01   4.247 2.16e-05 ***
## duration            4.163e-03  7.685e-05  54.168  < 2e-16 ***
## campaign           -8.610e-02  1.208e-02  -7.130 1.01e-12 ***
## poutcomefailure     1.303e-01  6.838e-02   1.906  0.05668 .  
## poutcomeother       3.959e-01  9.410e-02   4.207 2.59e-05 ***
## poutcomesuccess     2.295e+00  7.942e-02  28.895  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22840  on 31646  degrees of freedom
## Residual deviance: 15029  on 31579  degrees of freedom
## AIC: 15165
## 
## Number of Fisher Scoring iterations: 6
vif(model)
##                GVIF Df GVIF^(1/(2*Df))
## job        3.023164 11        1.051572
## marital    1.187779  2        1.043960
## education  2.214603  3        1.141693
## balance    1.038248  1        1.018945
## housing    1.425948  1        1.194131
## loan       1.067412  1        1.033156
## contact    1.936997  2        1.179729
## day        5.012828 30        1.027231
## month     10.942682 11        1.114892
## duration   1.141639  1        1.068475
## campaign   1.113858  1        1.055395
## poutcome   1.253764  3        1.038411

test it on train set

trainpredict = predict(model, newdata = train_set, type = 'response')

Assign 0 or 1 for the predict values

Set probability > 0.5 as the threshold , higher chance for subscribe

p_class = ifelse(trainpredict>0.5, "yes","no")

confusion matrix

matrix_table = table(train_set$y, p_class)
matrix_table
##      p_class
##          no   yes
##   no  27250   695
##   yes  2400  1302

Check the accuracy and the variable importance

accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.902
varImp(model)
##                         Overall
## jobadmin.           0.002334797
## jobblue-collar      0.925027689
## jobentrepreneur     0.865574141
## jobhousemaid        1.889773939
## jobmanagement       0.422977700
## jobretired          0.903759503
## jobself-employed    0.825601836
## jobservices         0.669058631
## jobstudent          1.465574384
## jobtechnician       0.475597820
## jobunemployed       0.333268521
## maritalmarried      1.459420034
## maritalsingle       1.305079328
## educationprimary    1.910434777
## educationsecondary  0.213565378
## educationtertiary   1.255028093
## balance             2.407488216
## housingyes         12.305386816
## loanyes             6.241720517
## contactcellular    17.627221438
## contacttelephone   11.293934834
## day2                0.255624026
## day3                0.411739075
## day4                0.345493193
## day5                0.714964501
## day6                0.483754674
## day7                0.813659487
## day8                0.366138039
## day9                0.314590871
## day10               2.268937203
## day11               0.089603380
## day12               1.678678490
## day13               2.035716410
## day14               0.648919166
## day15               1.361824566
## day16               0.144729358
## day17               2.094986925
## day18               0.071371931
## day19               2.177441577
## day20               1.814978146
## day21               0.173819746
## day22               0.926546571
## day23               1.795783899
## day24               0.413930546
## day25               1.566695326
## day26               1.969621559
## day27               3.135748785
## day28               0.183698788
## day29               0.226634899
## day30               1.654270784
## day31               0.239618945
## monthaug            8.346220239
## monthdec            3.197032272
## monthfeb            2.689171032
## monthjan            7.659516854
## monthjul            9.634404989
## monthjun            2.781675813
## monthmar           10.188318857
## monthmay            6.584615188
## monthnov            6.379094554
## monthoct            5.874309493
## monthsep            4.247330304
## duration           54.167622664
## campaign            7.129581919
## poutcomefailure     1.905764967
## poutcomeother       4.206631509
## poutcomesuccess    28.894944094

test the trained on test set

testPredict = predict(model, newdata = test_set, type = 'response')
p_class = ifelse(testPredict>0.5, "yes", "no")
matrix_table = table(test_set$y, p_class)
matrix_table
##      p_class
##          no   yes
##   no  11688   289
##   yes  1012   575

Check the accuracy

accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy, 3)
## [1] 0.904

Sort the highest probability to lowest.

head(sort(testPredict, decreasing = T), 10)
##     24149     31338     12348     30748     30155     24055     41589 
## 1.0000000 0.9999815 0.9999345 0.9998955 0.9998137 0.9997795 0.9993971 
##     17643     43902     39992 
## 0.9992076 0.9990779 0.9989185

p-value for the model

with(model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = F))
## [1] 0

lift chart

pred = prediction(trainpredict, train_set$y)
perf = performance(pred, "lift", "rpp")
plot(perf, main = "lift curve", xlab = 'Proportion of Customers (sorted prob)')

Cumulative gain chart

gain = performance(pred, "tpr", "rpp")
plot(gain, col="orange", lwd = 2)

##### Add baseline and ideal line
plot(x=c(0,1), y=c(0,1), type = "l", col = "red", lwd =2,
     ylab = "True Positive Rate",
     xlab = "Rate of Positive Predictions")
lines(x=c(0,0.12,1), y = c(0,1,1), col="darkgreen", lwd = 2)

gain.x = unlist(slot(gain, 'x.values'))
gain.y = unlist(slot(gain, 'y.values'))

lines(x=gain.x, y=gain.y, col = "orange", lwd=2)