## Load the Package
library(tidyverse)
library(xlsx)
library(VIF)
## Set the directory
getwd()
## [1] "C:/Users/SK/Desktop/SK/NUS EBA/Semester 3/Advanced Customer Analytics/Week 2"
setwd("C:/Users/SK/Desktop/SK/NUS EBA/Semester 3/Advanced Customer Analytics/Week 2")
## Read and Explore the Data
campaign <- read.csv("bank-additional.csv")
str(campaign)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(campaign)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing
## university.degree :12168 no :32588 no :18622
## high.school : 9515 unknown: 8597 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## loan contact month day_of_week
## no :33950 cellular :26144 may :13769 fri:7827
## unknown: 990 telephone:15044 jul : 7174 mon:8514
## yes : 6248 aug : 6178 thu:8623
## jun : 5318 tue:8090
## nov : 4101 wed:8134
## apr : 2632
## (Other): 2016
## duration campaign pdays previous
## Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.000
## 1st Qu.: 102.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000
## Median : 180.0 Median : 2.000 Median :999.0 Median :0.000
## Mean : 258.3 Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :4918.0 Max. :56.000 Max. :999.0 Max. :7.000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 4252 Min. :-3.40000 Min. :92.20 Min. :-50.8
## nonexistent:35563 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## success : 1373 Median : 1.10000 Median :93.75 Median :-41.8
## Mean : 0.08189 Mean :93.58 Mean :-40.5
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. : 1.40000 Max. :94.77 Max. :-26.9
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :36548
## 1st Qu.:1.344 1st Qu.:5099 yes: 4640
## Median :4.857 Median :5191
## Mean :3.621 Mean :5167
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
head(campaign)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
## duration variable is dropped due to useful for prediction
## Explore the Categorical Variables
colnames(campaign)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
cat_var = c('job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y')
for (col in cat_var) {
x <- table(campaign[,col])
barplot(x, main = col)
}











## Marital, default, housing, loan, job and education have 'unknown' data which unable ignore
## Assume the job and education are closely related
table(campaign$job)
##
## admin. blue-collar entrepreneur housemaid management
## 10422 9254 1456 1060 2924
## retired self-employed services student technician
## 1720 1421 3969 875 6743
## unemployed unknown
## 1014 330
table(campaign$education)
##
## basic.4y basic.6y basic.9y
## 4176 2292 6045
## high.school illiterate professional.course
## 9515 18 5243
## university.degree unknown
## 12168 1731
## if the age > 60, replace the job from 'unknown' to 'retired'
campaign$job[campaign$age >60 & campaign$job == 'unknown'] <- 'retired'
## if job is management, replace the education from 'unknown' to 'university.degree'
campaign$education[campaign$job == 'management' & campaign$education == 'unknown'] <- 'university.degree'
## if job is service, replace the education from 'unknown' to 'high.school'
campaign$education[campaign$job == 'service' & campaign$education == 'unknown'] <- 'high.school'
## if job is housemaid, replace the education from 'unknown' to 'basic.4y'
campaign$education[campaign$job == 'housemaid' & campaign$education == 'unknown'] <- 'basic.4y'
## if education is 'basic.4y', replace the job from 'unknown' to 'blue-collar'
campaign$job[campaign$education == 'basic.4y' & campaign$job == 'unknown'] <- 'blue-collar'
## if education is 'basic.6y', replace the job from 'unknown' to 'blue-collar'
campaign$job[campaign$education == 'basic.6y' & campaign$job == 'unknown'] <- 'blue-collar'
## if education is 'basic.9y', replace the job from 'unknown' to 'blue-collar'
campaign$job[campaign$education == 'basic.9y' & campaign$job == 'unknown'] <- 'blue-collar'
## if education is 'professional.course', replace the job from 'unknown' to 'blue-collar'
campaign$job[campaign$education == 'professional' & campaign$job == 'unknown'] <- 'technician'
table(campaign$job)
##
## admin. blue-collar entrepreneur housemaid management
## 10422 9355 1456 1060 2924
## retired self-employed services student technician
## 1741 1421 3969 875 6743
## unemployed unknown
## 1014 208
## Check the cross tabulation between housing and marital
library(gmodels)
CrossTable(campaign$marital, campaign$housing)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 41188
##
##
## | campaign$housing
## campaign$marital | no | unknown | yes | Row Total |
## -----------------|-----------|-----------|-----------|-----------|
## divorced | 2092 | 121 | 2399 | 4612 |
## | 0.022 | 0.929 | 0.119 | |
## | 0.454 | 0.026 | 0.520 | 0.112 |
## | 0.112 | 0.122 | 0.111 | |
## | 0.051 | 0.003 | 0.058 | |
## -----------------|-----------|-----------|-----------|-----------|
## married | 11389 | 588 | 12951 | 24928 |
## | 1.246 | 0.208 | 0.882 | |
## | 0.457 | 0.024 | 0.520 | 0.605 |
## | 0.612 | 0.594 | 0.600 | |
## | 0.277 | 0.014 | 0.314 | |
## -----------------|-----------|-----------|-----------|-----------|
## single | 5097 | 280 | 6191 | 11568 |
## | 3.390 | 0.014 | 2.840 | |
## | 0.441 | 0.024 | 0.535 | 0.281 |
## | 0.274 | 0.283 | 0.287 | |
## | 0.124 | 0.007 | 0.150 | |
## -----------------|-----------|-----------|-----------|-----------|
## unknown | 44 | 1 | 35 | 80 |
## | 1.695 | 0.443 | 1.138 | |
## | 0.550 | 0.012 | 0.438 | 0.002 |
## | 0.002 | 0.001 | 0.002 | |
## | 0.001 | 0.000 | 0.001 | |
## -----------------|-----------|-----------|-----------|-----------|
## Column Total | 18622 | 990 | 21576 | 41188 |
## | 0.452 | 0.024 | 0.524 | |
## -----------------|-----------|-----------|-----------|-----------|
##
##
## Assume majority married will get housing
table(campaign$housing)
##
## no unknown yes
## 18622 990 21576
campaign$housing[campaign$marital == 'married' & campaign$housing == 'unknown'] <- 'yes'
table(campaign$housing)
##
## no unknown yes
## 18622 402 22164
## Split Train and test set by 80:20
## set intital seed
set.seed(123)
## Create a boolean flag to split data
library(caTools)
splitData = sample.split(campaign$y, SplitRatio = 0.8)
## Train set
train_set = campaign[splitData, ]
nrow(train_set)/nrow(campaign)
## [1] 0.7999903
## Test set
test_set = campaign[!splitData,]
nrow(test_set)/nrow(campaign)
## [1] 0.2000097
## Data Imbalance - 'no' (29238) subscribe is must higher than 'yes' (3712)
library(ROSE)
table(train_set$y)
##
## no yes
## 29238 3712
Train_balance <- ovun.sample(y ~ ., data = train_set, N=nrow(train_set), method = 'both', p = 0.5, seed=1)$data
table(Train_balance$y)
##
## no yes
## 16583 16367
## Create the Logistic Modelling
## Since only concentrate on customer features, campaign features need to exclude in the modelling
colnames(Train_balance)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
model1 <- glm(y ~ age+job+marital+education+default+loan+housing+contact+pdays+previous+poutcome, data = Train_balance , family = binomial)
summary(model1)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## loan + housing + contact + pdays + previous + poutcome, family = binomial,
## data = Train_balance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9373 -1.0468 -0.5623 1.1193 2.0623
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.181e-01 2.703e-01 3.397 0.000683 ***
## age 9.395e-03 1.465e-03 6.413 1.43e-10 ***
## jobblue-collar -2.783e-01 4.629e-02 -6.013 1.82e-09 ***
## jobentrepreneur -2.017e-01 7.080e-02 -2.849 0.004389 **
## jobhousemaid -2.801e-01 8.516e-02 -3.289 0.001006 **
## jobmanagement -1.529e-01 5.089e-02 -3.004 0.002663 **
## jobretired 6.981e-01 7.003e-02 9.968 < 2e-16 ***
## jobself-employed -2.145e-01 6.869e-02 -3.122 0.001796 **
## jobservices -1.850e-01 5.034e-02 -3.675 0.000238 ***
## jobstudent 9.941e-01 8.055e-02 12.341 < 2e-16 ***
## jobtechnician -1.121e-01 4.190e-02 -2.675 0.007466 **
## jobunemployed 1.133e-01 7.839e-02 1.445 0.148454
## jobunknown -1.246e-01 1.705e-01 -0.731 0.464945
## maritalmarried 2.204e-01 4.080e-02 5.403 6.55e-08 ***
## maritalsingle 4.308e-01 4.609e-02 9.346 < 2e-16 ***
## maritalunknown 3.915e-01 2.796e-01 1.400 0.161388
## educationbasic.6y 1.792e-01 6.923e-02 2.589 0.009638 **
## educationbasic.9y 3.523e-02 5.559e-02 0.634 0.526228
## educationhigh.school 6.176e-02 5.524e-02 1.118 0.263570
## educationilliterate 2.106e+00 6.216e-01 3.389 0.000702 ***
## educationprofessional.course 7.066e-02 6.090e-02 1.160 0.245939
## educationuniversity.degree 1.878e-01 5.610e-02 3.348 0.000815 ***
## educationunknown 2.234e-01 7.718e-02 2.894 0.003804 **
## defaultunknown -6.295e-01 3.555e-02 -17.709 < 2e-16 ***
## defaultyes -1.060e+01 8.448e+01 -0.126 0.900120
## loanunknown 8.258e-02 9.719e-02 0.850 0.395500
## loanyes -1.319e-01 3.392e-02 -3.887 0.000102 ***
## housingunknown -3.080e-01 1.620e-01 -1.901 0.057344 .
## housingyes -1.126e-02 2.471e-02 -0.456 0.648562
## contacttelephone -7.887e-01 2.815e-02 -28.021 < 2e-16 ***
## pdays -1.608e-03 2.161e-04 -7.438 1.02e-13 ***
## previous 1.956e-01 6.389e-02 3.062 0.002197 **
## poutcomenonexistent 1.058e-01 8.195e-02 1.290 0.196890
## poutcomesuccess 8.700e-01 2.174e-01 4.001 6.30e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45677 on 32949 degrees of freedom
## Residual deviance: 39376 on 32916 degrees of freedom
## AIC: 39444
##
## Number of Fisher Scoring iterations: 9
library(car)
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## age 1.872105 1 1.368249
## job 4.905744 11 1.074968
## marital 1.417892 3 1.059922
## education 3.406711 7 1.091501
## default 1.088225 2 1.021362
## loan 1.601970 2 1.125029
## housing 1.632628 2 1.130373
## contact 1.058853 1 1.029006
## pdays 9.252287 1 3.041757
## previous 6.667708 1 2.582191
## poutcome 37.771617 2 2.479085
## poutcome is higher GVIF. Need to remove this variable
model2 <- glm(y ~ age+job+marital+education+default+loan+housing+contact+pdays+previous, data = Train_balance , family = binomial)
summary(model2)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## loan + housing + contact + pdays + previous, family = binomial,
## data = Train_balance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8501 -1.0467 -0.5619 1.1191 2.0631
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.841e+00 1.269e-01 14.501 < 2e-16 ***
## age 9.402e-03 1.464e-03 6.421 1.36e-10 ***
## jobblue-collar -2.791e-01 4.628e-02 -6.031 1.63e-09 ***
## jobentrepreneur -2.009e-01 7.075e-02 -2.840 0.004518 **
## jobhousemaid -2.794e-01 8.512e-02 -3.282 0.001031 **
## jobmanagement -1.525e-01 5.088e-02 -2.997 0.002724 **
## jobretired 7.007e-01 7.000e-02 10.010 < 2e-16 ***
## jobself-employed -2.140e-01 6.868e-02 -3.116 0.001831 **
## jobservices -1.856e-01 5.034e-02 -3.686 0.000228 ***
## jobstudent 9.919e-01 8.057e-02 12.312 < 2e-16 ***
## jobtechnician -1.122e-01 4.191e-02 -2.676 0.007444 **
## jobunemployed 1.115e-01 7.837e-02 1.423 0.154748
## jobunknown -1.269e-01 1.706e-01 -0.744 0.456868
## maritalmarried 2.197e-01 4.077e-02 5.388 7.14e-08 ***
## maritalsingle 4.308e-01 4.606e-02 9.352 < 2e-16 ***
## maritalunknown 3.981e-01 2.788e-01 1.428 0.153374
## educationbasic.6y 1.787e-01 6.923e-02 2.582 0.009831 **
## educationbasic.9y 3.523e-02 5.559e-02 0.634 0.526226
## educationhigh.school 6.088e-02 5.522e-02 1.103 0.270237
## educationilliterate 2.105e+00 6.216e-01 3.387 0.000708 ***
## educationprofessional.course 7.067e-02 6.089e-02 1.161 0.245819
## educationuniversity.degree 1.882e-01 5.607e-02 3.357 0.000788 ***
## educationunknown 2.260e-01 7.712e-02 2.930 0.003385 **
## defaultunknown -6.301e-01 3.555e-02 -17.724 < 2e-16 ***
## defaultyes -1.060e+01 8.448e+01 -0.125 0.900129
## loanunknown 8.215e-02 9.716e-02 0.845 0.397834
## loanyes -1.328e-01 3.393e-02 -3.914 9.09e-05 ***
## housingunknown -3.060e-01 1.619e-01 -1.890 0.058777 .
## housingyes -1.114e-02 2.471e-02 -0.451 0.651955
## contacttelephone -7.882e-01 2.804e-02 -28.104 < 2e-16 ***
## pdays -2.426e-03 8.203e-05 -29.574 < 2e-16 ***
## previous 1.089e-01 2.901e-02 3.756 0.000173 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45677 on 32949 degrees of freedom
## Residual deviance: 39391 on 32918 degrees of freedom
## AIC: 39455
##
## Number of Fisher Scoring iterations: 9
vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## age 1.872616 1 1.368436
## job 4.896951 11 1.074881
## marital 1.417353 3 1.059855
## education 3.399982 7 1.091346
## default 1.087940 2 1.021295
## loan 1.602438 2 1.125111
## housing 1.633748 2 1.130567
## contact 1.050878 1 1.025123
## pdays 1.340462 1 1.157783
## previous 1.380166 1 1.174804
## drop housing as not significant
model3 <-glm(y ~ age+job+marital+education+default+loan+contact+pdays+previous, data = Train_balance , family = binomial)
summary(model3)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## loan + contact + pdays + previous, family = binomial, data = Train_balance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8524 -1.0471 -0.5626 1.1187 2.0669
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.833e+00 1.264e-01 14.502 < 2e-16 ***
## age 9.327e-03 1.463e-03 6.373 1.85e-10 ***
## jobblue-collar -2.776e-01 4.627e-02 -5.999 1.98e-09 ***
## jobentrepreneur -2.007e-01 7.073e-02 -2.838 0.004543 **
## jobhousemaid -2.775e-01 8.512e-02 -3.260 0.001114 **
## jobmanagement -1.517e-01 5.087e-02 -2.983 0.002859 **
## jobretired 7.048e-01 6.997e-02 10.073 < 2e-16 ***
## jobself-employed -2.150e-01 6.867e-02 -3.131 0.001739 **
## jobservices -1.846e-01 5.034e-02 -3.667 0.000245 ***
## jobstudent 9.903e-01 8.056e-02 12.293 < 2e-16 ***
## jobtechnician -1.104e-01 4.189e-02 -2.637 0.008377 **
## jobunemployed 1.122e-01 7.835e-02 1.432 0.152151
## jobunknown -1.243e-01 1.706e-01 -0.729 0.466132
## maritalmarried 2.262e-01 4.059e-02 5.574 2.49e-08 ***
## maritalsingle 4.298e-01 4.604e-02 9.335 < 2e-16 ***
## maritalunknown 4.017e-01 2.788e-01 1.441 0.149637
## educationbasic.6y 1.789e-01 6.922e-02 2.584 0.009760 **
## educationbasic.9y 3.505e-02 5.559e-02 0.631 0.528345
## educationhigh.school 6.122e-02 5.521e-02 1.109 0.267536
## educationilliterate 2.103e+00 6.216e-01 3.384 0.000715 ***
## educationprofessional.course 6.952e-02 6.087e-02 1.142 0.253434
## educationuniversity.degree 1.895e-01 5.606e-02 3.381 0.000723 ***
## educationunknown 2.257e-01 7.712e-02 2.926 0.003429 **
## defaultunknown -6.295e-01 3.555e-02 -17.709 < 2e-16 ***
## defaultyes -1.061e+01 8.448e+01 -0.126 0.900067
## loanunknown -3.068e-02 7.717e-02 -0.398 0.690888
## loanyes -1.334e-01 3.388e-02 -3.937 8.24e-05 ***
## contacttelephone -7.874e-01 2.796e-02 -28.159 < 2e-16 ***
## pdays -2.426e-03 8.203e-05 -29.573 < 2e-16 ***
## previous 1.088e-01 2.900e-02 3.753 0.000175 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 45677 on 32949 degrees of freedom
## Residual deviance: 39394 on 32920 degrees of freedom
## AIC: 39454
##
## Number of Fisher Scoring iterations: 9
vif(model3)
## GVIF Df GVIF^(1/(2*Df))
## age 1.870606 1 1.367701
## job 4.883839 11 1.074750
## marital 1.385968 3 1.055907
## education 3.396783 7 1.091273
## default 1.087859 2 1.021276
## loan 1.004740 2 1.001183
## contact 1.044824 1 1.022166
## pdays 1.340687 1 1.157880
## previous 1.380318 1 1.174869
## test it on train set
trainPredict = predict(model3, newdata = train_set, type = 'response')
## Set threshold 70% for subcriber
p_class = ifelse(trainPredict >0.7, 'yes', 'no')
matrix_table = table(train_set$y, p_class)
matrix_table
## p_class
## no yes
## no 28122 1116
## yes 2645 1067
## Accuracy
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy, 3)*100
## [1] 88.6
## Lift curve
library(ROCR)
pred = prediction(trainPredict, train_set$y)
perf = performance(pred, "lift", "rpp")
plot(perf, main = "lift curve", xlab = 'Proportion of Customers (sort prob)')

## test it on test set
testPredict = predict(model3, newdata = test_set, type = 'response')
p_class = ifelse(testPredict > 0.7, 'yes', 'no')
matrix_table = table(test_set$y, p_class)
matrix_table
## p_class
## no yes
## no 7029 281
## yes 660 268
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)*100
## [1] 88.6
## Model can achieve 88.6% of accuracy which is very high.
## This model will be use as scoring for campaign data.
scoring = predict(model3, newdata = campaign, type = 'response')
## Sort the highest probability of subcribe customer based on model prediction first
score_sort <- sort(scoring, decreasing = TRUE, na.last = TRUE)
## Marketing can decide to select the bin 1 for 1st priority for target phone call campaign
## Assume marketing budget allow to spend on first 500 customers for bin 1
## The first target 500 customers which have highest probability to subscribe (bin 1)
score_sort[1:500]
## 37356 39727 41175 40470 39326 40894 40561
## 0.9929294 0.9852576 0.9849662 0.9843630 0.9841378 0.9839739 0.9836442
## 39640 39592 39656 40330 39187 40745 39505
## 0.9835752 0.9832921 0.9828894 0.9828878 0.9825450 0.9825352 0.9825338
## 37717 38915 40365 40369 39866 39698 40485
## 0.9824045 0.9823950 0.9823792 0.9822527 0.9818196 0.9816380 0.9815531
## 38775 41032 39009 40930 37748 40323 40662
## 0.9815429 0.9814441 0.9814125 0.9812925 0.9812735 0.9812666 0.9809283
## 40239 40352 39262 40118 40347 39349 38968
## 0.9808254 0.9808254 0.9807514 0.9807478 0.9806880 0.9805492 0.9804702
## 39327 37101 41109 40401 28219 39627 38129
## 0.9804550 0.9803243 0.9802716 0.9800744 0.9800563 0.9800489 0.9800196
## 40833 39709 38814 37129 39259 38756 38835
## 0.9798392 0.9798347 0.9797457 0.9795853 0.9795423 0.9795172 0.9794944
## 40068 38978 41043 40279 40515 40967 39611
## 0.9793815 0.9793707 0.9793546 0.9793339 0.9792622 0.9792457 0.9791087
## 39337 37141 37496 41179 39341 39720 40380
## 0.9790724 0.9790667 0.9790635 0.9790341 0.9788224 0.9787375 0.9786066
## 38749 37462 40451 39148 39981 38826 39620
## 0.9785437 0.9785199 0.9784553 0.9784311 0.9784286 0.9783178 0.9782143
## 39894 38762 39566 40560 40952 40838 37494
## 0.9782006 0.9781924 0.9781490 0.9781034 0.9780823 0.9780806 0.9780637
## 37765 38913 27833 40162 37764 38805 37277
## 0.9780513 0.9780112 0.9778494 0.9778301 0.9776774 0.9776191 0.9776100
## 40471 38704 41099 41046 40639 40211 35974
## 0.9775862 0.9774817 0.9774804 0.9774786 0.9774038 0.9773186 0.9773138
## 39181 37861 35857 39590 40455 37917 39728
## 0.9772630 0.9772436 0.9772229 0.9770981 0.9770600 0.9769816 0.9769100
## 37885 40732 39582 39577 40374 38813 39589
## 0.9768911 0.9768474 0.9768251 0.9768174 0.9768174 0.9767967 0.9767701
## 36246 40064 39059 39062 40735 37971 39066
## 0.9767674 0.9767299 0.9767258 0.9767258 0.9766917 0.9766329 0.9766312
## 40301 37864 40372 39983 39920 39942 37820
## 0.9765882 0.9765503 0.9765194 0.9764778 0.9763817 0.9763702 0.9763506
## 37517 38275 38977 40727 38728 38180 40359
## 0.9762957 0.9762635 0.9762510 0.9762072 0.9761267 0.9760307 0.9760134
## 36461 40643 35873 40845 36173 41059 37934
## 0.9759903 0.9758605 0.9758119 0.9757942 0.9757701 0.9756227 0.9756220
## 38833 40969 31670 40202 39585 39677 40113
## 0.9754659 0.9753908 0.9753394 0.9753392 0.9753160 0.9752990 0.9751494
## 40260 40837 40366 37559 41021 39228 40333
## 0.9750933 0.9750380 0.9748664 0.9747815 0.9746427 0.9745949 0.9745827
## 37955 39216 36369 36024 38841 39881 40467
## 0.9745651 0.9745348 0.9744112 0.9743701 0.9743583 0.9742291 0.9742102
## 38902 41054 37394 38072 39663 39506 39087
## 0.9741953 0.9741776 0.9741622 0.9740987 0.9740987 0.9740805 0.9740678
## 38315 37528 37876 40687 40574 37598 36000
## 0.9740479 0.9739420 0.9739244 0.9738269 0.9738186 0.9737822 0.9737789
## 38271 38446 40742 40729 40587 39185 37027
## 0.9737566 0.9737566 0.9736489 0.9736054 0.9735924 0.9735722 0.9735603
## 40800 38727 38492 40896 40414 39892 38742
## 0.9735527 0.9734630 0.9734363 0.9732764 0.9732510 0.9732287 0.9732125
## 30424 38458 38317 39154 38931 39858 38034
## 0.9731184 0.9730796 0.9729714 0.9728342 0.9728093 0.9727384 0.9727305
## 40700 37694 37285 37302 38087 41102 40943
## 0.9726777 0.9726487 0.9725319 0.9725319 0.9724861 0.9723995 0.9722406
## 39734 40760 40274 37238 38765 39517 38016
## 0.9721666 0.9721604 0.9721582 0.9721398 0.9721185 0.9719486 0.9717843
## 29499 40557 40336 41013 38588 39521 38844
## 0.9717796 0.9717619 0.9717512 0.9716114 0.9715227 0.9713986 0.9713530
## 40703 41006 40459 37804 38916 39472 40508
## 0.9711626 0.9711210 0.9710020 0.9708583 0.9708572 0.9708471 0.9708252
## 40951 40698 38540 37780 41023 40000 38149
## 0.9707891 0.9707383 0.9706225 0.9706158 0.9706011 0.9705922 0.9705641
## 37926 40419 37327 39136 39303 39743 40736
## 0.9705348 0.9702669 0.9702264 0.9701820 0.9701739 0.9700860 0.9700860
## 38811 39135 38867 38723 37806 40337 39615
## 0.9700461 0.9699329 0.9698621 0.9697906 0.9697541 0.9697089 0.9697013
## 40289 40422 38433 38416 40377 39977 40719
## 0.9696106 0.9695526 0.9694490 0.9693127 0.9692664 0.9692022 0.9691475
## 40172 36780 39940 40984 38576 29156 37359
## 0.9691062 0.9689225 0.9689115 0.9688797 0.9686290 0.9685552 0.9684607
## 38432 30137 37739 38893 39448 40263 40164
## 0.9683044 0.9679812 0.9678707 0.9678311 0.9677196 0.9676468 0.9675070
## 40079 38488 40576 37567 41027 30139 30341
## 0.9674382 0.9672969 0.9672333 0.9670253 0.9670016 0.9668300 0.9668227
## 38784 38832 39691 39675 40292 36296 39125
## 0.9666491 0.9666491 0.9665130 0.9664002 0.9662215 0.9661399 0.9660885
## 39499 36957 40914 29504 36941 39651 37676
## 0.9659034 0.9658613 0.9657214 0.9656604 0.9650046 0.9648482 0.9646062
## 37184 41093 39474 37846 38453 39379 41091
## 0.9638985 0.9637610 0.9636734 0.9631620 0.9630768 0.9628193 0.9628068
## 37582 40680 40443 40277 40974 39667 39292
## 0.9618324 0.9617792 0.9617412 0.9608836 0.9608836 0.9605868 0.9603376
## 41127 37170 39540 40389 40126 39838 39606
## 0.9601718 0.9599958 0.9597035 0.9595541 0.9595160 0.9589399 0.9583997
## 39106 40715 40392 41082 39646 40453 40381
## 0.9581655 0.9580221 0.9579594 0.9578869 0.9578563 0.9577789 0.9576130
## 40396 40023 39359 32383 40025 39976 39221
## 0.9575414 0.9574723 0.9574303 0.9573649 0.9572743 0.9572480 0.9561152
## 39315 40887 41011 37828 39390 39657 39035
## 0.9561152 0.9560549 0.9560292 0.9558482 0.9556910 0.9554664 0.9551566
## 37322 39332 40175 40597 39612 30150 39756
## 0.9551345 0.9550323 0.9550142 0.9549099 0.9548371 0.9548060 0.9547650
## 40106 39335 36721 39649 41115 40500 39398
## 0.9547339 0.9547026 0.9547012 0.9546764 0.9545713 0.9545230 0.9544490
## 39284 39661 40846 38506 39847 38054 37443
## 0.9544340 0.9544340 0.9544340 0.9543624 0.9542990 0.9542390 0.9541444
## 39829 40592 39639 41031 40942 40699 39956
## 0.9541164 0.9541156 0.9540965 0.9540266 0.9539679 0.9538759 0.9537601
## 40614 40418 38777 40817 40120 41045 39975
## 0.9537064 0.9536158 0.9535333 0.9534942 0.9534654 0.9532664 0.9532190
## 37916 40178 40390 39659 39078 41064 40644
## 0.9530860 0.9530853 0.9530262 0.9530052 0.9529907 0.9529474 0.9528758
## 37421 39930 40258 40031 38288 39644 40247
## 0.9527898 0.9527459 0.9527168 0.9526412 0.9526116 0.9525968 0.9525479
## 36237 41029 39664 38928 39665 40448 41125
## 0.9525440 0.9525138 0.9522351 0.9521510 0.9521247 0.9520545 0.9520151
## 40222 38035 38918 38047 38781 40016 40856
## 0.9517395 0.9517243 0.9516976 0.9516509 0.9516509 0.9515970 0.9515127
## 41004 35835 40343 38693 38782 40066 41034
## 0.9513894 0.9513628 0.9512825 0.9512571 0.9512025 0.9510577 0.9509944
## 40373 39624 37284 39273 39263 40710 39843
## 0.9509672 0.9509294 0.9508906 0.9508812 0.9508756 0.9508583 0.9508321
## 40657 36429 30312 40767 39576 39397 40876
## 0.9508228 0.9508161 0.9508113 0.9507881 0.9507709 0.9507677 0.9507334
## 39225 37626 40383 39571 39608 39322 38683
## 0.9507119 0.9505887 0.9502504 0.9499810 0.9499810 0.9499487 0.9499433
## 40309 36589 40045 37308 39908 39287 39296
## 0.9499319 0.9499052 0.9499052 0.9498656 0.9497536 0.9497002 0.9496341
## 29511 40159 38205 38807 40959 27992 40173
## 0.9495711 0.9495578 0.9494984 0.9494595 0.9493345 0.9492262 0.9491898
## 40787 39595 40829 40603 39855 41110 39739
## 0.9491610 0.9491092 0.9490708 0.9490604 0.9490558 0.9490400 0.9490101
## 38717 37591 37246 40795 37770 38776 41038
## 0.9489381 0.9488941 0.9488926 0.9488743 0.9487736 0.9487565 0.9486568
## 38848 38839 39604 40116 40911 37255 36225
## 0.9486163 0.9486101 0.9485240 0.9485181 0.9484015 0.9483995 0.9483295
## 39393 39824 40481 39681 40792 38999 30409
## 0.9482006 0.9481650 0.9481428 0.9479037 0.9478717 0.9478195 0.9476558
## 40382 39645 38279 30645 40361 40473 40518
## 0.9475978 0.9474329 0.9474129 0.9473702 0.9473227 0.9472024 0.9471768
## 40971 41098 39064 40814 41025 39912 37988
## 0.9471743 0.9470999 0.9470924 0.9470440 0.9470410 0.9468420 0.9467861
## 40596 40634 40027 40303 24911 39160 39945
## 0.9467505 0.9467430 0.9467388 0.9467341 0.9467008 0.9466987 0.9466811
## 37440 38711 38716 40488 39003 38824 37552
## 0.9466638 0.9466603 0.9466307 0.9466307 0.9466041 0.9465798 0.9465377
## 40748 26749 39715 41048 37709 37759 38799
## 0.9465195 0.9464936 0.9464889 0.9464768 0.9463898 0.9463494 0.9462749
## 37688 40567 37753 39853 38783 39222 37197
## 0.9461623 0.9461588 0.9461169 0.9461009 0.9460193 0.9460193 0.9460031
## 37692 39831 32318
## 0.9459146 0.9459146 0.9457516